Exemplo n.º 1
0
def fragment_fromstring(html,
                        create_parent=False,
                        guess_charset=False,
                        parser=None):
    """Parse a string that holds exactly one HTML element and return it.

    It is an error if there is more than one element, or if anything but
    whitespace precedes or follows the element.

    :param html: the markup to parse; must be an instance of ``_strings``.
    :param create_parent: if true, the markup is first wrapped in a parent
        element so that it always forms a single element.  A string value
        is used as the tag name of that parent; any other true value
        (e.g. ``True``) selects ``'div'``.
    :param guess_charset: forwarded unchanged to ``fragments_fromstring``.
    :param parser: forwarded unchanged to ``fragments_fromstring``.
    :raises TypeError: if ``html`` is not a string.
    :raises etree.ParserError: if no element or several elements are found,
        or if the element is followed by non-whitespace text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if create_parent:
        # ``create_parent`` is known to be truthy here, so ``create_parent
        # or 'div'`` could never pick the default and ``True`` produced a
        # literal ``<True>`` wrapper.  Only a string is usable as a tag name.
        if isinstance(create_parent, _strings):
            container = create_parent
        else:
            container = 'div'

        if isinstance(html, bytes) and not isinstance(html, str):
            # Python 3 byte string: %-formatting it into a text template
            # would embed its repr (b'...'), so build the wrapper as bytes.
            if isinstance(container, bytes):
                tag = container
            else:
                tag = container.encode('ascii')
            html = b'<' + tag + b'>' + html + b'</' + tag + b'>'
        else:
            html = '<%s>%s</%s>' % (container, html, container)

    # no_leading_text=True: text before the element is rejected (or dropped
    # when it is only whitespace), so the list holds elements only.
    children = fragments_fromstring(html, True, guess_charset, parser)
    if not children:
        raise etree.ParserError('No elements found')
    if len(children) > 1:
        raise etree.ParserError('Multiple elements found')

    result = children[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Whatever tail is left is whitespace only; discard it.
    result.tail = None
    return result
Exemplo n.º 2
0
def fragments_fromstring(html,
                         no_leading_text=False,
                         guess_charset=False,
                         parser=None):
    """Parse a string holding several HTML elements into a list.

    The markup is parsed as a fragment inside a ``div`` context.  The first
    item of the returned list may be a string (text that precedes the first
    element).  When ``no_leading_text`` is true, non-whitespace leading text
    raises ``etree.ParserError`` and whitespace-only leading text is
    removed, so the result contains only elements.

    If `guess_charset` is `True` and the text was not unicode but a
    bytestring, the `chardet` library will perform charset guessing on the
    string.  The flag is handed to the parser as ``useChardet``.

    Raises ``TypeError`` when ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # Fall back to the module-level parser when the caller gave none.
    active_parser = html_parser if parser is None else parser
    nodes = active_parser.parseFragment(html, 'div', useChardet=guess_charset)

    starts_with_text = bool(nodes) and isinstance(nodes[0], _strings)
    if not (starts_with_text and no_leading_text):
        return nodes

    leading = nodes[0]
    if leading.strip():
        raise etree.ParserError('There is leading text: %r' % leading)
    # Only whitespace came before the first element: drop it.
    del nodes[0]
    return nodes
Exemplo n.º 3
0
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse a string holding several HTML elements into a list.

    The markup is parsed as a fragment inside a ``div`` context.  The first
    item of the returned list may be a string (text that precedes the first
    element).  When ``no_leading_text`` is true, non-whitespace leading text
    raises ``etree.ParserError`` and whitespace-only leading text is
    removed, so the result contains only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.  When it is left as ``None``, byte input is
    parsed with ``useChardet=False`` and for any other input the option is
    not passed to the parser at all.

    Raises ``TypeError`` when ``html`` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept ``useChardet`` when the input is already
    # unicode, so the keyword is only sent when there is a value for it:
    # an explicit caller choice, or ``False`` as the default for bytes.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False
    parse_kwargs = {} if guess_charset is None else {'useChardet': guess_charset}

    nodes = active_parser.parseFragment(html, 'div', **parse_kwargs)

    starts_with_text = bool(nodes) and isinstance(nodes[0], _strings)
    if not (starts_with_text and no_leading_text):
        return nodes

    leading = nodes[0]
    if leading.strip():
        raise etree.ParserError('There is leading text: %r' % leading)
    # Only whitespace came before the first element: drop it.
    del nodes[0]
    return nodes
Exemplo n.º 4
0
    def lxmlize(self,
                url,
                encoding=None,
                user_agent=requests.utils.default_user_agent(),
                cookies=None,
                xml=False):
        """Fetch ``url`` and return its content parsed into an lxml tree.

        :param url: address to fetch with ``self.get``.
        :param encoding: if truthy, overrides ``response.encoding`` before
            the body text is read.
        :param user_agent: stored on ``self.user_agent`` before the request.
        :param cookies: passed through to ``self.get``.
        :param xml: if true, parse with ``etree.fromstring`` and return the
            tree as is; otherwise parse with ``lxml.html.fromstring`` and
            make the links absolute against ``url``.
        :returns: the parsed root element.  If the page carries a
            ``<meta http-equiv="refresh">`` that names a target URL, the
            target is fetched with the same options and returned instead.
        :raises etree.ParserError: if parsing raises ``etree.ParserError``;
            the new message includes ``url``.
        """
        self.user_agent = user_agent

        response = self.get(url, cookies=cookies)
        if encoding:
            response.encoding = encoding

        try:
            text = response.text
            if xml:
                # NOTE(review): presumably removed because lxml rejects text
                # input that carries an encoding declaration - confirm.
                text = text.replace('<?xml version="1.0" encoding="utf-8"?>',
                                    '')  # XXX ca_bc
                page = etree.fromstring(text)
            else:
                page = lxml.html.fromstring(text)
        except etree.ParserError:
            raise etree.ParserError('Document is empty {}'.format(url))

        meta = page.xpath('//meta[@http-equiv="refresh"]')
        if meta:
            # The content looks like "5; url=http://example.com/".  A plain
            # reload such as "300" has no "=" part, which previously crashed
            # on tuple unpacking; such pages are now returned as they are.
            content = meta[0].attrib.get('content', '')
            target = content.partition('=')[2].strip().strip('\'"')
            if target:
                # The target may be relative to the page that declared it.
                target = requests.compat.urljoin(url, target)
                # Keep the caller's options across the redirect instead of
                # silently falling back to the defaults.
                return self.lxmlize(target,
                                    encoding,
                                    user_agent=user_agent,
                                    cookies=cookies,
                                    xml=xml)

        if not xml:
            page.make_links_absolute(url)
        return page
Exemplo n.º 5
0
    def lxmlize(self,
                url,
                encoding=None,
                user_agent=requests.utils.default_user_agent(),
                cookies=None):
        """Fetch ``url`` and return its content parsed into an lxml HTML tree.

        :param url: address to fetch with ``self.get``.
        :param encoding: if truthy, overrides ``response.encoding`` before
            the body text is read.
        :param user_agent: stored on ``self.user_agent`` before the request.
        :param cookies: passed through to ``self.get``.
        :returns: the parsed root element with links made absolute against
            ``url``.  If the page carries a ``<meta http-equiv="refresh">``
            that names a target URL, the target is fetched with the same
            options and returned instead.
        :raises etree.ParserError: if parsing raises ``etree.ParserError``;
            the new message includes ``url``.
        """
        self.user_agent = user_agent

        response = self.get(url, cookies=cookies)
        if encoding:
            response.encoding = encoding

        try:
            text = response.text
            # Give scheme-less Facebook links an explicit https:// scheme.
            text = text.replace(
                '"www.facebook.com/',
                '"https://www.facebook.com/')  # XXX ca_candidates
            # Remove a <script> element (and the one character after it)
            # that directly follows the doctype.
            text = re.sub('(?<=<!DOCTYPE html>)<script .+?</script>.',
                          '',
                          text,
                          flags=re.DOTALL)  # XXX ca_qc_longueuil
            page = lxml.html.fromstring(text)
        except etree.ParserError:
            raise etree.ParserError('Document is empty {}'.format(url))

        meta = page.xpath('//meta[@http-equiv="refresh"]')
        if meta:
            # The content looks like "5; url=http://example.com/".  A plain
            # reload such as "300" has no "=" part, which previously crashed
            # on tuple unpacking; such pages are now returned as they are.
            content = meta[0].attrib.get('content', '')
            target = content.partition('=')[2].strip().strip('\'"')
            if target:
                # The target may be relative to the page that declared it.
                target = requests.compat.urljoin(url, target)
                # Keep the caller's options across the redirect instead of
                # silently falling back to the defaults.
                return self.lxmlize(target,
                                    encoding,
                                    user_agent=user_agent,
                                    cookies=cookies)

        page.make_links_absolute(url)
        return page
Exemplo n.º 6
0
def fragment_fromstring(html,
                        create_parent=False,
                        guess_charset=None,
                        parser=None):
    """Parse a string that holds a single HTML element and return it.

    Without 'create_parent' it is an error if there is more than one
    element, or if anything but whitespace precedes or follows the element.

    If 'create_parent' is true (or is a tag name) a new parent element is
    created to hold everything that was parsed, and that parent is
    returned.  Leading or trailing text is allowed in this case.  Any true
    value that is not a string selects a "div" parent.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.

    Raises ``TypeError`` when ``html`` is not a string and
    ``etree.ParserError`` when the single-element rules are violated.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    # Leading text is only acceptable when it can be stored on a parent.
    wrap_in_parent = bool(create_parent)
    nodes = fragments_fromstring(
        html,
        no_leading_text=not wrap_in_parent,
        guess_charset=guess_charset,
        parser=parser,
    )

    if not create_parent:
        # Strict mode: exactly one element with nothing but whitespace
        # after it.
        if not nodes:
            raise etree.ParserError("No elements found")
        if len(nodes) > 1:
            raise etree.ParserError("Multiple elements found")
        only = nodes[0]
        trailing = only.tail
        if trailing and trailing.strip():
            raise etree.ParserError("Element followed by text: %r" % trailing)
        only.tail = None
        return only

    tag = create_parent if isinstance(create_parent, _strings) else "div"
    parent = Element(tag)
    if nodes:
        # A leading string becomes the text of the new parent element.
        if isinstance(nodes[0], _strings):
            parent.text = nodes.pop(0)
        parent.extend(nodes)
    return parent