Пример #1
0
 def html(self):
     """Parse ``self.content`` into a fragment using html5lib.

     The fragment is built by ``self.html5lib.parseFragment`` with the
     ``"etree"`` tree builder.

     Raises:
         ImproperlyConfigured: if an ``ImportError`` is raised along the way.
         ParserError: if any other exception is raised along the way.
     """
     builder_options = {"treebuilder": "etree"}
     try:
         # ``self.html5lib`` is looked up inside the ``try`` so that a
         # failure while resolving it is translated as well.
         fragment = self.html5lib.parseFragment(self.content, **builder_options)
     except ImportError as exc:
         raise ImproperlyConfigured("Error while importing html5lib: %s" % exc)
     except Exception as exc:
         raise ParserError("Error while initializing Parser: %s" % exc)
     else:
         return fragment
Пример #2
0
class Html5LibParser(ParserBase):
    """Parser backend that delegates HTML fragment parsing to html5lib."""

    def __init__(self, content):
        """Initialise the base parser and keep a handle on html5lib.

        html5lib is imported here, at instantiation time, and stored on the
        instance as ``self.html5lib`` for the other methods to use.
        """
        super(Html5LibParser, self).__init__(content)
        import html5lib
        self.html5lib = html5lib

    def _serialize(self, elem):
        """Return html5lib's serialization of *elem*.

        *elem* is appended to a fresh simpletree ``DocumentFragment`` which
        is then serialized with attribute values always quoted and optional
        tags kept.
        """
        fragment = self.html5lib.treebuilders.simpletree.DocumentFragment()
        fragment.appendChild(elem)
        return self.html5lib.serialize(fragment,
                                       quote_attr_values=True,
                                       omit_optional_tags=False)

    def _find(self, *names):
        """Yield direct children of the parsed fragment named in *names*."""
        for node in self.html.childNodes:
            # NOTE(review): 5 is presumably the simpletree node type for
            # elements -- confirm against the html5lib version in use.
            if node.type == 5 and node.name in names:
                yield node

    @cached_property
    def html(self):
        """Parse ``self.content`` with ``html5lib.parseFragment``.

        Raises:
            ImproperlyConfigured: if an ``ImportError`` is raised.
            ParserError: if any other exception is raised.
        """
        try:
            return self.html5lib.parseFragment(self.content)
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing html5lib: %s" %
                                       err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
Пример #3
0
class BeautifulSoupParser(ParserBase):
    """Parser backend that delegates HTML parsing to BeautifulSoup."""

    @cached_property
    def soup(self):
        """Return ``self.content`` parsed by ``BeautifulSoup``.

        The import happens on first access, inside the ``try``, so a missing
        package is reported as a configuration problem.

        Raises:
            ImproperlyConfigured: if BeautifulSoup cannot be imported.
            ParserError: if any other exception is raised while parsing.
        """
        try:
            from BeautifulSoup import BeautifulSoup
            return BeautifulSoup(self.content)
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured(
                "Error while importing BeautifulSoup: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
Пример #4
0
 def __init__(self, content):
     """Set up the parser state and immediately parse *content*.

     Args:
         content: the markup that is stored on ``self.content`` and fed
             to the parser.

     Raises:
         ParserError: if feeding or closing the parser raises anything.
     """
     HTMLParser.__init__(self)
     self.content = content
     self._css_elems = []
     self._js_elems = []
     self._current_tag = None
     try:
         self.feed(self.content)
         self.close()
     # ``except X as err`` is valid on Python 2.6+ and Python 3; the
     # comma form is a SyntaxError on Python 3.
     except Exception as err:
         raise ParserError("Error while initializing HtmlParser: %s" % err)
Пример #5
0
 def soup(self):
     """Return ``self.content`` parsed by BeautifulSoup.

     ``bs4`` is used when ``six.PY3`` is true, the ``BeautifulSoup``
     package otherwise.

     Raises:
         ImproperlyConfigured: if the selected package cannot be imported.
         ParserError: if any other exception is raised along the way.
     """
     try:
         # Pick the BeautifulSoup flavour matching the running interpreter.
         if not six.PY3:
             from BeautifulSoup import BeautifulSoup
         else:
             from bs4 import BeautifulSoup
         parsed = BeautifulSoup(self.content)
     except ImportError as exc:
         raise ImproperlyConfigured(
             "Error while importing BeautifulSoup: %s" % exc)
     except Exception as exc:
         raise ParserError("Error while initializing Parser: %s" % exc)
     return parsed
Пример #6
0
class LxmlParser(ParserBase):
    """Parser backend that relies on lxml."""

    def __init__(self, content):
        """Import the lxml helpers and keep them on the instance.

        Stores ``fromstring``, ``soupparser`` and ``tostring`` as instance
        attributes.

        NOTE(review): *content* is neither stored nor passed to the base
        initializer in the code visible here -- confirm that is intended.

        Raises:
            ImproperlyConfigured: if lxml (or one of the names) cannot be
                imported.
            ParserError: if any other exception is raised.
        """
        try:
            from lxml.html import fromstring, soupparser
            from lxml.etree import tostring
            self.fromstring = fromstring
            self.soupparser = soupparser
            self.tostring = tostring
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing lxml: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
Пример #7
0
    def __init__(self, content):
        """Load the lxml helpers, then run the base initializer.

        ``fromstring`` and ``tostring`` are stored on the instance before
        *content* is handed to the parent class.

        Raises:
            ImproperlyConfigured: if lxml cannot be imported.
            ParserError: if the import fails with any other exception.
        """
        try:
            from lxml.html import fromstring
            from lxml.etree import tostring
        except ImportError as import_exc:
            raise ImproperlyConfigured(
                "Error while importing lxml: %s" % import_exc)
        except Exception as exc:
            raise ParserError("Error while initializing parser: %s" % exc)

        # Expose the helpers first; the base initializer runs afterwards.
        self.fromstring, self.tostring = fromstring, tostring
        super(LxmlParser, self).__init__(content)
Пример #8
0
 def __init__(self, content):
     """Set up the parser state and immediately parse *content*.

     Args:
         content: the markup that is stored on ``self.content`` and fed
             to the parser.

     Raises:
         ParserError: if feeding or closing the parser raises anything.
             When the underlying error carries a usable line number, the
             offending source line is included in the message.
     """
     six.moves.html_parser.HTMLParser.__init__(self, **HTML_PARSER_ARGS)
     self.content = content
     self._css_elems = []
     self._js_elems = []
     self._current_tag = None
     try:
         self.feed(self.content)
         self.close()
     except Exception as err:
         # Not every exception has ``lineno``; reading it unconditionally
         # would replace the real failure with an AttributeError.
         lineno = getattr(err, "lineno", None)
         line = None
         if isinstance(lineno, int):
             lines = self.content.splitlines()
             # Parser line numbers start at 1 while list indexes start at 0;
             # the bounds check also avoids an IndexError on odd values.
             if 1 <= lineno <= len(lines):
                 line = lines[lineno - 1]
         if line is None:
             raise ParserError(
                 "Error while initializing HtmlParser: %s" % err)
         raise ParserError(
             "Error while initializing HtmlParser: %s (line: %s)"
             % (err, repr(line)))
Пример #9
0
    def __init__(self, content):
        """Load the lxml helpers, then run the base initializer.

        ``fromstring`` and ``tostring`` are required. ``soupparser`` is
        optional: it is only imported when ``six.PY3`` is false, and is set
        to ``None`` when it is skipped or cannot be imported.

        Raises:
            ImproperlyConfigured: if the required lxml names cannot be
                imported.
            ParserError: if any import fails with a non-import exception.
        """
        try:
            from lxml.html import fromstring
            from lxml.etree import tostring
        except ImportError as import_exc:
            raise ImproperlyConfigured(
                "Error while importing lxml: %s" % import_exc)
        except Exception as exc:
            raise ParserError("Error while initializing parser: %s" % exc)

        # Default to "no soupparser"; a failed ``from ... import`` leaves
        # this binding untouched.
        soupparser = None
        if not six.PY3:
            # soupparser uses Beautiful Soup 3 which does not run on python 3.x
            try:
                from lxml.html import soupparser
            except ImportError:
                pass
            except Exception as exc:
                raise ParserError("Error while initializing parser: %s" % exc)

        self.soupparser = soupparser
        self.fromstring, self.tostring = fromstring, tostring
        super(LxmlParser, self).__init__(content)
Пример #10
0
class LxmlParser(ParserBase):
    """Parser backend that relies on lxml."""

    @cached_property
    def tree(self):
        """Return the lxml tree for ``self.content`` wrapped in ``<root>``.

        The content is parsed with ``lxml.html.fromstring``. If serializing
        that tree to text raises ``UnicodeDecodeError``, it is parsed again
        with ``lxml.html.soupparser``. ``tostring`` is also stored on the
        instance as a side effect.

        Raises:
            ImproperlyConfigured: if lxml (or one of the names) cannot be
                imported.
            ParserError: if any other exception is raised.
        """
        content = '<root>%s</root>' % self.content
        # ``unicode`` only exists on Python 2; fall back to ``str`` so the
        # probe below does not die with a NameError on Python 3.
        try:
            text_type = unicode
        except NameError:
            text_type = str
        try:
            from lxml.html import fromstring, soupparser
            from lxml.etree import tostring
            self.tostring = tostring
            tree = fromstring(content)
            try:
                # Serialization is only a probe; its result is not needed.
                tostring(tree, encoding=text_type)
            except UnicodeDecodeError:
                tree = soupparser.fromstring(content)
        # ``except X as err`` is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing lxml: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)
        # Hand the parsed tree back; without this the property would
        # evaluate (and be cached) as None.
        return tree
Пример #11
0
 def tree(self):
     """Import the lxml names this parser relies on.

     NOTE(review): the code visible here only performs the imports and
     returns nothing -- confirm whether more of the body is expected.

     Raises:
         ParserError: if ``lxml.html`` or ``lxml.etree.tostring`` cannot
             be imported.
     """
     try:
         from lxml import html
         from lxml.etree import tostring
     # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma
     # form is a SyntaxError on Python 3.
     except ImportError as e:
         raise ParserError("Error while initializing Parser: %s" % e)
Пример #12
0
 def soup(self):
     """Import the ``BeautifulSoup`` class this parser relies on.

     NOTE(review): the code visible here only performs the import and
     returns nothing -- confirm whether more of the body is expected.

     Raises:
         ParserError: if ``BeautifulSoup`` cannot be imported.
     """
     try:
         from BeautifulSoup import BeautifulSoup
     # ``except X as e`` is valid on Python 2.6+ and Python 3; the comma
     # form is a SyntaxError on Python 3.
     except ImportError as e:
         raise ParserError("Error while initializing Parser: %s" % e)