Exemplo n.º 1
0
 def do_parse(num):
     # Report the growth in live GC-tracked objects caused by `num`
     # repeated parses of the global HTML sample. A leak-free parser
     # should leave this delta near zero regardless of `num`.
     collect()
     baseline = len(gc.get_objects())
     for _ in range(num):
         parse(HTML)
     collect()
     return len(gc.get_objects()) - baseline
Exemplo n.º 2
0
    def get_soup(self, src, url=None):
        """Parse raw HTML ``src`` into a soup tree, applying this recipe's
        cleanup configuration.

        Pipeline: regexp preprocessing (``preprocess_regexps`` plus comment
        stripping) -> parse -> ``prepreprocess_html_ext`` hook (which may
        return replacement markup) -> ``keep_only_tags`` /
        ``remove_tags_after`` / ``remove_tags_before`` / ``remove_tags``
        filtering -> ``preprocess_html_ext``.

        :param src: raw HTML, bytes or text (decoded via xml_to_unicode)
        :param url: optional source URL, forwarded to preprocess_raw_html
        :return: whatever preprocess_html_ext returns for the cleaned soup
        """
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        set_soup_module(sys.modules[BeautifulSoup.__module__])
        soup = parse(usrc, return_root=False)

        # The hook may hand back replacement markup; if so, re-clean it with
        # the same regexps and re-parse.
        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = parse(replace, return_root=False)

        if self.keep_only_tags:
            # Build a fresh <body> containing only the tags matching the
            # keep_only_tags specs, then swap it in for the original body.
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            # Walk from `tag` up to <body>, extracting every sibling on the
            # `next` side ('nextSibling' or 'previousSibling') at each level.
            # NOTE: the sibling for the following step is re-read from `tag`
            # *before* the extract — the ordering here is deliberate.
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            # Accept a single spec dict or a list of them.
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        # Finally drop every tag matching a remove_tags spec.
        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Exemplo n.º 3
0
    def get_soup(self, src, url=None):
        """Build a cleaned soup tree from raw HTML ``src``.

        Steps, in order: apply ``preprocess_regexps`` plus a comment-stripping
        regexp; parse; give ``prepreprocess_html_ext`` a chance to supply
        replacement markup (re-cleaned and re-parsed if it does); apply the
        ``keep_only_tags``, ``remove_tags_after``, ``remove_tags_before`` and
        ``remove_tags`` filters; return ``preprocess_html_ext``'s result.

        :param src: raw HTML, bytes or text (decoded via xml_to_unicode)
        :param url: optional source URL, passed to preprocess_raw_html
        """
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        set_soup_module(sys.modules[BeautifulSoup.__module__])
        soup = parse(usrc, return_root=False)

        # A non-None return from the hook is replacement markup: clean it with
        # the same regexps and parse it in place of the first soup.
        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = parse(replace, return_root=False)

        if self.keep_only_tags:
            # Collect the matching tags into a new <body> and replace the
            # document body with it.
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            # From `tag` up to <body>, extract all siblings on the `next`
            # side ('nextSibling' or 'previousSibling') at every level.
            # The next sibling is re-read from `tag` before each extract;
            # this statement ordering is intentional.
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            # Normalise a lone spec dict into a one-element list.
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        # Drop every tag matching a remove_tags spec.
        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Exemplo n.º 4
0
    def test_soup_leak(self):
        # Repeated parsing must not grow the set of live GC-tracked objects.
        HTML = '<p a=1>\n<a b=2 id=3>y</a>z<x:x class=4>1</x:x>'
        # Warm-up parse so BS and html_parser build any lazy internal
        # objects before we take the baseline count.
        parse(HTML)

        def leaked(iterations):
            collect()
            baseline = len(gc.get_objects())
            for _ in range(iterations):
                parse(HTML)
            collect()
            return len(gc.get_objects()) - baseline

        for iterations in (1, 10, 100):
            self.assertLess(leaked(iterations), 2)
Exemplo n.º 5
0
 def test_doctype_stays_intact(self):
     # A <!DOCTYPE ...> declaration must survive a parse round-trip verbatim
     # when keep_doctype=True.
     base = '\n<html><body><p>xxx</p></body></html>'
     declarations = (
         'html',
         'html PUBLIC "-//W3C//DTD HTML 4.01//EN" '
         '"http://www.w3.org/TR/html4/strict.dtd"',
     )
     for decl in declarations:
         dt = '<!DOCTYPE {}>'.format(decl)
         soup = parse(dt + base, return_root=False, keep_doctype=True)
         first_line, _, _ = str(soup).partition('\n')
         self.ae(dt, first_line)
Exemplo n.º 6
0
 def test_soup_list_attrs(self):
     # Under bs4, multi-valued attributes (class, rel) come back as lists.
     if is_bs3():
         self.skipTest('No bs4 module found')
     root = parse('<a class="a b" rel="x y">')
     expected = {'class': ['a', 'b'], 'rel': ['x', 'y']}
     self.ae(root.body.a.attrs, expected)
Exemplo n.º 7
0
 def test_attr_soup(self):
     # Attribute handling: names are lower-cased, namespaced attributes
     # (xlink:, xml:, xmlns:) round-trip, and find() matches by attribute.
     as_text = type('')

     root = parse('<p a=1 b=2 ID=3><a a=a>')
     self.ae(dict(root.body.p.attrs), {'a': '1', 'b': '2', 'id': '3'})
     self.ae(dict(root.body.p.a.attrs), {'a': 'a'})
     self.ae(as_text(root.find(name='a', a='a')), '<a a="a"></a>')

     root = parse('<p a=1><svg><image xlink:href="h">')
     expected = (
         '<html><head></head><body>'
         '<p a="1"><svg><image xlink:href="h"/></svg></p>'
         '</body></html>'
     )
     self.ae(as_text(root), expected)

     root = parse('<html xml:lang="en" lang="fr"><p>')
     self.ae(dict(root.attrs), {'xml:lang': 'en', 'lang': 'fr'})

     root = parse('<p><x xmlns:a="b">')
     self.ae(
         as_text(root),
         '<html><head></head><body><p><x xmlns:a="b"></x></p></body></html>'
     )
Exemplo n.º 8
0
 def test_simple_soup(self):
     # Simple fragments serialize back with the implied html/head/body
     # wrappers and auto-closed tags, but otherwise unchanged.
     def check(markup, expected):
         self.ae(type('')(parse(markup)), expected)

     check(
         '<p>\n<a>y</a>z<x:x>1</x:x>',
         '<html><head></head><body><p>\n<a>y</a>z<x:x>1</x:x></p></body></html>'
     )
     check(
         '<svg><image>',
         '<html><head></head><body><svg><image></image></svg></body></html>'
     )
     check(
         '<p><!-- ---->',
         '<html><head></head><body><p><!-- ----></p></body></html>'
     )
     check(
         '<p><i><b>',
         '<html><head></head><body><p><i><b></b></i></p></body></html>'
     )
Exemplo n.º 9
0
def parse_html(markup):
    """Normalise ``markup`` (text or bytes) to clean unicode and parse it
    into a soup tree (root not returned)."""
    if isinstance(markup, str):
        # Already text: drop encoding declarations and resolve entities.
        markup = chardet.substitute_entites(
            chardet.strip_encoding_declarations(markup))
    else:
        # Bytes: decode to unicode, stripping encoding patterns and
        # resolving entities in one pass.
        markup = chardet.xml_to_unicode(
            markup, strip_encoding_pats=True, resolve_entities=True)[0]
    cleaned = cleantext.clean_xml_chars(markup)
    return html5_soup.parse(cleaned, return_root=False)
Exemplo n.º 10
0
def parse_html(markup):
    """Turn ``markup`` (unicode text or raw bytes) into a soup tree."""
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        # Text input: strip encoding declarations, then resolve entities.
        cleaned = substitute_entites(strip_encoding_declarations(markup))
    else:
        # Byte input: decode, strip encoding patterns and resolve entities.
        cleaned = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    cleaned = clean_xml_chars(cleaned)
    from html5_parser.soup import parse
    return parse(cleaned, return_root=False)