Python HTMLParser примеры, wpull.document.htmlparse.lxml_.HTMLParser Python примеры использования

Пример #1

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_soup(self):
        '''Scrape the ``soup.html`` sample and verify the extracted URLs.

        The response carries a ``Refresh`` header with the literal value
        ``'yes'``; the expected link sets below contain no URL derived
        from it.
        '''
        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'soup.html')

        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()
        resp.fields['Refresh'] = 'yes'

        # Fill the response body from the sample file on disk.
        with wpull.util.reset_file_offset(resp.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        expected_inline = {'http://example.com/ABOUTM~1.JPG'}
        expected_linked = {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        }
        self.assertEqual(expected_inline, inline)
        self.assertEqual(expected_linked, linked)

Пример #2

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_rss_as_html(self):
        '''Scrape ``rss.xml`` with HTMLScraper under an RSS content type.

        The result must be truthy, have no inline links, and contain
        exactly the two expected linked URLs.
        '''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()
        resp.fields['content-type'] = 'application/rss+xml'

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples', 'rss.xml')
        with wpull.util.reset_file_offset(resp.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        self.assertTrue(result)

        inline, linked = result.inline_links, result.linked_links
        expected_linked = {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        }
        self.assertFalse(inline)
        self.assertEqual(expected_linked, linked)

Пример #3

0

Показать файл

Файл: sitemap_test.py Проект: fakegit/ludios_wpull

    def test_sitemap_encoding(self):
        '''Read a minimal sitemap encoded with each usable codec.

        Every codec in ``CODEC_NAMES`` that is not skipped gets its own
        ``subTest``, so a failing codec is reported by name and the
        remaining codecs are still exercised.  EBCDIC codecs,
        ``utf_8_sig`` and the UTF-16/UTF-32 family are skipped.
        '''
        parser = HTMLParser()
        reader = SitemapReader(parser)

        # Byte order marks for endian-specific codecs.  Every key here is
        # currently skipped by the UTF-16/UTF-32 check below, so the lookup
        # always falls back to b'' until that FIXME is resolved.
        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatible
                continue

            if name.startswith(('utf_16', 'utf_32')):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            with self.subTest(codec=name):
                # The parentheses make explicit that the whole document
                # (both adjacent literals) is encoded, not just the last one.
                document = (
                    '<?xml version="1.0" encoding="UTF-8"?>'
                    '<urlset><url><loc>blah</loc></url></urlset>'
                ).encode(name)
                data = io.BytesIO(bom_map.get(name, b'') + document)

                links = tuple(reader.iter_links(data, encoding=name))

                # Fail with an assertion instead of an IndexError when the
                # reader yields nothing.
                self.assertTrue(links, 'no links were read from the sitemap')
                self.assertEqual('blah', links[0])

Пример #4

0

Показать файл

    def test_sitemap_scraper_xml(self):
        '''Scrape a single-URL ``urlset`` sitemap and check the linked URL.'''
        sitemap_document = b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            '''

        sitemap_scraper = SitemapScraper(HTMLParser())
        req = Request('http://example.com/sitemap.xml')
        resp = Response(200, 'OK')
        resp.body = Body()

        with wpull.util.reset_file_offset(resp.body):
            resp.body.write(sitemap_document)

        result = sitemap_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {
            'http://www.example.com/',
        }
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Пример #5

0

Показать файл

    def test_sitemap_scraper_xml_index(self):
        '''Scrape a ``sitemapindex`` document and check the linked sitemap.'''
        index_document = b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            '''

        sitemap_scraper = SitemapScraper(HTMLParser())
        req = Request('http://example.com/sitemap.xml')
        resp = Response(200, 'OK')
        resp.body = Body()

        with wpull.util.reset_file_offset(resp.body):
            resp.body.write(index_document)

        result = sitemap_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {
            'http://www.example.com/sitemap1.xml.gz',
        }
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Пример #6

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_scraper_links_base_href(self):
        '''Scrape ``basehref.html`` and check encoding and URL sets.

        Asserts the reported encoding is ``utf-8`` and compares the inline
        and linked URL sets against the exact expected values.
        '''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, 'OK')
        resp.body = Body()

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'basehref.html')
        with wpull.util.reset_file_offset(resp.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        self.assertEqual('utf-8', result.encoding)

        expected_inline = {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        }
        expected_linked = {'http://example.com/a/'}

        self.assertEqual(expected_inline, inline)
        self.assertEqual(expected_linked, linked)

Пример #7

0

Показать файл

    def test_html_encoding(self):
        '''Parse ``<img>`` encoded with each usable codec in CODEC_NAMES.

        Every codec that is not skipped gets its own ``subTest``, so a
        failing codec is reported by name and the remaining codecs are
        still exercised.  EBCDIC codecs, ``utf_8_sig`` and the
        UTF-16/UTF-32 family are skipped.
        '''
        html_parser = HTMLParser()
        reader = HTMLReader(html_parser)

        # Byte order marks for endian-specific codecs.  Every key here is
        # currently skipped by the UTF-16/UTF-32 check below, so the lookup
        # always falls back to b'' until that FIXME is resolved.
        bom_map = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for name in CODEC_NAMES:
            if name in EBCDIC or name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatible
                continue

            if name.startswith(('utf_16', 'utf_32')):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            with self.subTest(codec=name):
                data = io.BytesIO(
                    bom_map.get(name, b'') + '<img>'.encode(name))
                elements = tuple(reader.iter_elements(data, encoding=name))

                # Fail with an assertion instead of an IndexError when the
                # reader yields nothing.
                self.assertTrue(elements, 'no elements were parsed')
                self.assertEqual('html', elements[0].tag)

Пример #8

0

Показать файл

    def test_html_script_comment(self):
        '''Everything read from a commented ``<script>`` is an Element.'''
        markup = io.BytesIO(b'''<script><!-- blah --></script>''')

        reader = HTMLReader(HTMLParser())
        parsed = tuple(reader.iter_elements(markup, encoding='ascii'))

        only_elements = all(isinstance(item, Element) for item in parsed)
        self.assertTrue(only_elements)

Пример #9

0

Показать файл

    def test_sitemap_scraper_invalid_robots(self):
        '''Scrape a ``robots.txt`` body of junk text and expect no links.'''
        junk_body = b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk'

        sitemap_scraper = SitemapScraper(HTMLParser())
        req = Request('http://example.com/robots.txt')
        resp = Response(200, 'OK')
        resp.body = Body()

        with wpull.util.reset_file_offset(resp.body):
            resp.body.write(junk_body)

        result = sitemap_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        self.assertFalse(linked)
        self.assertFalse(inline)

Пример #10

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_bad_xml(self):
        '''Scraping ``foxstripcomics_bad_xml.html`` as HTML must not raise.

        There is no assertion on the result; the test passes as long as
        ``scrape`` returns normally.
        '''
        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'foxstripcomics_bad_xml.html')

        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, 'OK')
        resp.body = Body()

        with wpull.util.reset_file_offset(resp.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body)

        # No crash
        html_scraper.scrape(req, resp, link_type=LinkType.html)

Пример #11

0

Показать файл

    def test_sitemap_scraper_robots(self):
        '''Scrape a ``robots.txt`` with one ``Sitemap:`` line.

        The sitemap URL must be the only linked URL and there must be no
        inline links.
        '''
        robots_body = b'Sitemap: http://example.com/sitemap00.xml'

        sitemap_scraper = SitemapScraper(HTMLParser())
        req = Request('http://example.com/robots.txt')
        resp = Response(200, 'OK')
        resp.body = Body()

        with wpull.util.reset_file_offset(resp.body):
            resp.body.write(robots_body)

        result = sitemap_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {
            'http://example.com/sitemap00.xml',
        }
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Пример #12

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_encoding_lxml_name_mismatch(self):
        '''A charset spelled with an underscore (``EUC_KR``) is accepted.

        The scrape result must be truthy and report ``euc_kr`` as its
        encoding.
        '''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()
        resp.fields['content-type'] = 'text/html; charset=EUC_KR'

        payload = '힖'.encode('euc_kr')
        with wpull.util.reset_file_offset(resp.body):
            resp.body.write(payload)

        result = html_scraper.scrape(req, resp)

        self.assertTrue(result)
        self.assertEqual('euc_kr', result['encoding'])

Пример #13

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_scraper_reject_type(self):
        '''Scraping ``many_urls.html`` as ``LinkType.css`` gives a falsy result.'''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, 'OK')
        resp.body = Body()

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'many_urls.html')
        with wpull.util.reset_file_offset(resp.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp, link_type=LinkType.css)
        self.assertFalse(result)

Пример #14

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_garbage(self):
        '''Scrape a ``text/html`` response whose body is arbitrary bytes.

        The scraper is expected to still return a truthy result.
        '''
        garbage = (
            b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
            b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
            b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7'
        )

        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()
        resp.fields['content-type'] = 'text/html'

        with wpull.util.reset_file_offset(resp.body):
            resp.body.write(garbage)

        result = html_scraper.scrape(req, resp)

        self.assertTrue(result)

Пример #15

0

Показать файл

    def test_html_parse_doctype(self):
        '''Check ``parse_doctype`` on valid doctypes and on non-HTML input.

        Two documents with a doctype must yield a value containing
        ``'html'`` and ``'XHTML'`` respectively; the remaining inputs must
        yield a falsy value.
        '''
        html_parser = HTMLParser()

        html5_source = io.BytesIO(b'<!DOCTYPE HTML><html></html>')
        self.assertIn('html', html_parser.parse_doctype(html5_source))

        xhtml_source = io.BytesIO(b'''
                <!DOCTYPE html PUBLIC
                "-//W3C//DTD XHTML 1.0 Transitional//EN"
                "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
                <html></html>
                ''')
        self.assertIn('XHTML', html_parser.parse_doctype(xhtml_source))

        # Inputs without any doctype declaration.
        for raw in (b'hello world!', b'', b'\x00', b'A\xfe'):
            self.assertFalse(html_parser.parse_doctype(io.BytesIO(raw)))

Пример #16

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_serious_bad_encoding(self):
        '''Scrape ``xkcd_1_evil.html`` with the encoding forced to ``utf8``.

        Both the scraper's ``encoding_override`` and the response charset
        are ``utf8``; the result must be truthy.
        '''
        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'xkcd_1_evil.html')

        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(
            HTMLParser(), walker, encoding_override='utf8')
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()
        resp.fields['content-type'] = 'text/html; charset=utf8'

        with wpull.util.reset_file_offset(resp.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)

        self.assertTrue(result)

Пример #17

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_not_quite_charset(self):
        '''Scrape ``videogame_top.htm`` and look for two known URLs.

        Only membership is checked: one URL among the inline links and one
        among the linked links.
        '''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'videogame_top.htm')
        with wpull.util.reset_file_offset(resp.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        expected_inline_url = (
            'http://example.com/copyright_2001_2006_rtype.gif')
        expected_linked_url = (
            'http://www.geocities.jp/gamehouse_grindcrusher/')

        self.assertIn(expected_inline_url, inline)
        self.assertIn(expected_linked_url, linked)

Пример #18

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_wrong_charset(self):
        '''Scrape ``kcna.html`` and check the encoding and URL sets.

        The scrape result must report ``utf-16-le`` as its encoding and
        produce exactly the expected inline and linked URL sets.
        '''
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'kcna.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual('utf-16-le', scrape_result.encoding)

        # Each URL is listed once; a set literal silently collapses
        # duplicates, so a repeated entry would only mislead the reader.
        self.assertEqual(
            {
                'http://example.com/utm/__utm.js',
                'http://example.com/Knewskage.gif',
                'http://example.com/Lline.gif',
                'http://example.com/Sline.gif',
                'http://example.com/korean01.gif',
                'http://example.com/korean02.gif',
                'http://example.com/english01.gif',
                'http://example.com/english02.gif',
                'http://example.com/Tongsinkage.gif',
            }, inline_urls)
        self.assertEqual(
            {
                'http://example.com/index-k.htm',
                'http://example.com/index-e.htm',
            }, linked_urls)

Пример #19

0

Показать файл

    def test_html_layout(self):
        '''Check the order of the first five elements of a small document.

        The first five items yielded by the reader must have the tags
        ``html``, ``head``, ``title``, ``body`` and ``img``, in that order.
        '''
        document = io.BytesIO(b'''
            <html>
                <head>
                    <title>hi</title>
                </head>
                <body>
                    <img>
                </body>
            </html>''')

        html_parser = HTMLParser()
        reader = HTMLReader(html_parser)
        elements = tuple(reader.iter_elements(document, encoding='ascii'))

        print(elements)

        expected_tags = ('html', 'head', 'title', 'body', 'img')
        for position, tag in enumerate(expected_tags):
            self.assertEqual(tag, elements[position].tag)

Пример #20

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_krokozyabry(self):
        '''Scrape ``krokozyabry.html`` served with a KOI8-R charset.

        The result must report ``koi8-r``, have no inline links, and
        contain the single Cyrillic linked URL.
        '''
        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'krokozyabry.html')

        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()
        resp.fields['content-type'] = 'text/html; charset=KOI8-R'

        with wpull.util.reset_file_offset(resp.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        self.assertEqual('koi8-r', result.encoding)

        expected_linked = {'http://example.com/Кракозябры'}
        self.assertEqual(set(), inline)
        self.assertEqual(expected_linked, linked)

Пример #21

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_xhtml_invalid(self):
        '''Scrape ``xhtml_invalid.html`` and check the extracted URL sets.'''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, '')
        resp.body = Body()

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'xhtml_invalid.html')
        with wpull.util.reset_file_offset(resp.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        expected_inline = {
            'http://example.com/image.png',
            'http://example.com/script.js',
        }
        expected_linked = {'http://example.com/link'}

        self.assertEqual(expected_inline, inline)
        self.assertEqual(expected_linked, linked)

Пример #22

0

Показать файл

    def test_html_early_html(self):
        '''An ``<img>`` must be found in every sample document.

        The samples place ``<img>`` in various positions relative to the
        ``<html>`` element, including after it has been closed.  Each
        sample and each Element read from it is printed.
        '''
        reader = HTMLReader(HTMLParser())

        samples = [
            b'''<!DOCTYPE HTML><html></html><img>''',
            b'''<html></html><img>''',
            b'''<!DOCTYPE HTML><img><html></html>''',
            b'''<img><html></html>''',
            b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
            b'''
                <html><body></body></html><p><img>''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]

        for sample in samples:
            found = []
            print()
            print('a' * 10)
            print(sample)

            parsed = reader.iter_elements(io.BytesIO(sample),
                                          encoding='ascii')
            for node in parsed:
                # Only Element instances are printed and collected.
                if not isinstance(node, Element):
                    continue
                print(node)
                found.append(node)

            found_tags = tuple(node.tag for node in found)
            self.assertIn('img', found_tags)

Пример #23

0

Показать файл

Файл: html_test.py Проект: fakegit/ludios_wpull

    def test_html_scraper_links(self):
        '''Scrape ``many_urls.html`` and compare against full URL sets.

        The response also carries a ``Refresh`` header pointing at
        ``header_refresh.html``.  The test checks the reported encoding,
        the exact inline and linked URL sets, and that every URL is a
        ``str``.
        '''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        req = Request('http://example.com/')
        resp = Response(200, 'OK')
        resp.body = Body()
        resp.fields['Refresh'] = '3; url=header_refresh.html'

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                   'many_urls.html')
        with wpull.util.reset_file_offset(resp.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, resp.body)

        result = html_scraper.scrape(req, resp)
        inline, linked = result.inline_links, result.linked_links

        self.assertEqual('utf-8', result.encoding)

        expected_inline = frozenset({
            'http://example.com/style_import_url.css',
            'http://example.com/style_import_quote_url.css',
            'http://example.com/style_single_quote_import.css',
            'http://example.com/style_double_quote_import.css',
            'http://example.com/bg.png',
            'http://example.com/link_href.css',
            'http://example.com/script.js',
            'http://example.com/body_background.png',
            'http://example.com/images/table_background.png',
            'http://example.com/images/td_background.png',
            'http://example.com/images/th_background.png',
            'http://example.com/style_url1.png',
            'http://example.com/style_url2.png',
            'http://example.com/applet/',  # returned by lxml
            'http://example.com/applet/applet_code.class',
            'http://example.com/applet/applet_src.class',
            'http://example.com/bgsound.mid',
            'http://example.com/audio_src.wav',
            'http://example.com/audio_poster.jpeg',
            'http://example.net/source_src.wav',
            'http://example.com/video_src.webm',
            'http://example.com/video_poster.jpeg',
            'http://example.net/track_src.vtt',
            'http://example.net/source_src.webm',
            'http://example.com/embed_src.mov',
            'http://example.com/fig_src.png',
            'http://example.com/frame_src.html',
            'http://example.com/iframe_src.html',
            'http://example.com/img_href.png',
            'http://example.com/img_lowsrc.png',
            'http://example.com/img_src.png',
            'http://example.com/img_data.png',
            'http://example.com/img_srcset_1.jpeg',
            'http://example.com/img_srcset_2.jpeg',
            'http://example.com/img_srcset_3.jpeg',
            'http://example.com/input_src.png',
            'http://example.com/layer_src.png',
            'http://example.com/object/',  # returned by lxml
            'http://example.com/object/object_data.swf',
            'http://example.com/object/object_archive.dat',
            'mailto:internet',
            'object_not_url_codebase',
            'http://example.com/param_ref_value.php',
            'http://example.com/overlay_src.html',
            'http://example.com/script_variable.png',
        })
        self.assertEqual(expected_inline, inline)

        # The expected form of this URL depends on the running Python
        # version.
        if sys.version_info < (3, 5):
            relative_dir_url = (
                'http://example.com/../relative_dir_script_variable')
        else:
            relative_dir_url = (
                'http://example.com/relative_dir_script_variable')

        expected_linked = frozenset({
            'http://example.com/og_image.png',
            'http://example.com/og_url.html',
            'http://example.com/og_audio.mp3',
            'http://example.com/og_video.webm',
            'http://example.com/twitter_image.png',
            'http://example.com/twitter_image0.png',
            'http://example.com/twitter_image1.png',
            'http://example.com/twitter_image2.png',
            'http://example.com/twitter_image3.png',
            'http://example.com/twitter_player.html',
            'http://example.com/twitter_stream.mp4',
            'http://example.net/soup.html',
            'http://example.com/a_href.html',
            'http://example.com/area_href.html',
            'http://example.com/frame_src.html',
            'http://example.com/embed_href.html',
            'http://example.com/embed_src.mov',
            'http://example.com/form_action.html',
            'http://example.com/iframe_src.html',
            'http://example.com/layer_src.png',
            'http://example.com/overlay_src.html',
            'ftp://ftp.protocol.invalid/',
            # NOTE(review): '[email protected]' looks like an e-mail
            # obfuscation placeholder rather than a real address --
            # confirm against many_urls.html.
            'mailto:[email protected]',
            'http://a-double-slash.example',
            'http://example.com/header_refresh.html',
            'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            relative_dir_url,
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
            'http://example.com/a_javascript_link.html',
            'http://example.com/a_onclick_link.html',
        })
        self.assertEqual(expected_linked, linked)

        for url in inline | linked:
            self.assertIsInstance(url, str)

Python HTMLParser примеры использования