Пример #1
0
def parse(src):
    """ Returns an element tree create by `LXML <http://lxml.de/>`_.
       :param src: A readable object such as a :class:`wex.response.Response`.
    """

    if not hasattr(src, 'read'):
        return src

    etree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # Sometimes we get URLs containing characters that aren't
        # acceptable to lxml (e.g. "http:/foo.com/bar?this=array[]").
        # When this happens lxml will quote the whole URL.
        # We don't want to have to check for this so we just always
        # quote it here and then unquote it in the `base_url` function.
        quoted_base_url = quote(src.url) if src.url else src.url
        while True:
            try:
                parser = HTMLParser()
                fp = replace_invalid_ncr(stream)
                etree.parse(fp, parser=parser, base_url=quoted_base_url)
                break
            except UnicodeDecodeError as exc:
                stream.next_encoding()
    except IOError as exc:
        logger = logging.getLogger(__name__)
        logger.warning("IOError parsing %s (%s)", src.url, exc)

    root = etree.getroot()
    if root is None:
        etree._setroot(UNPARSEABLE)

    return etree
    def setUp(self):
        self.text = open("tests/testing_data/texts/sample.xml", "rb")
        self.TEI = MyCapytain.resources.texts.local.capitains.cts.CapitainsCtsText(
            resource=self.text,
            urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2")
        self.treeroot = etree._ElementTree()

        with open("tests/testing_data/texts/text_or_xpath.xml") as f:
            self.text_complex = MyCapytain.resources.texts.local.capitains.cts.CapitainsCtsText(
                resource=f, urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2")

        with open("tests/testing_data/texts/seneca.xml") as f:
            self.seneca = MyCapytain.resources.texts.local.capitains.cts.CapitainsCtsText(
                resource=f)
Пример #3
0
    def setUp(self):
        self.text = open("tests/testing_data/texts/sample.xml", "rb")
        self.TEI = MyCapytain.resources.texts.locals.tei.Text(
            resource=self.text,
            urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2"
        )
        self.treeroot = etree._ElementTree()

        with open("tests/testing_data/texts/text_or_xpath.xml") as f:
            self.text_complex = MyCapytain.resources.texts.locals.tei.Text(
                resource=f,
                urn="urn:cts:latinLit:phi1294.phi002.perseus-lat2"
            )

        with open("tests/testing_data/texts/seneca.xml") as f:
            self.seneca = MyCapytain.resources.texts.locals.tei.Text(
                resource=f
            )
Пример #4
0
def parse(src):
    """ Returns an element tree create by `LXML <http://lxml.de/>`_.
       :param src: A readable object such as a :class:`wex.response.Response`.
    """

    if not hasattr(src, 'read'):
        return src

    etree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # Sometimes we get URLs containing characters that aren't
        # acceptable to lxml (e.g. "http:/foo.com/bar?this=array[]").
        # When this happens lxml will quote the whole URL.
        # We don't want to have to check for this so we just always
        # quote it here and then unquote it in the `base_url` function.
        quoted_base_url = quote_base_url(src.url) if src.url else src.url
        while True:
            try:
                fp = replace_invalid_ncr(stream)
                # fp is a Unicode stream
                # The lxml FAQ tells us that it is inefficient to do this
                # http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode
                # but actually it seems just fine as long as you tell the parser to use 'utf-8'!?
                parser = HTMLParser(encoding='utf-8')
                etree.parse(fp, parser=parser, base_url=quoted_base_url)
                break
            except UnicodeDecodeError as exc:
                stream.next_encoding()
    except IOError as exc:
        logger = logging.getLogger(__name__)
        logger.warning("IOError parsing %s (%s)", src.url, exc)

    root = etree.getroot()
    if root is None:
        etree._setroot(UNPARSEABLE)

    return etree
Пример #5
0
def parse(src):
    """ Returns an element tree create by `LXML <http://lxml.de/>`_.
       :param src: A readable object such as a :class:`wex.response.Response`.
    """

    if not hasattr(src, 'read'):
        return src

    etree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # Sometimes we get URLs containing characters that aren't
        # acceptable to lxml (e.g. "http:/foo.com/bar?this=array[]").
        # When this happens lxml will quote the whole URL.
        # We don't want to have to check for this so we just always
        # quote it here and then unquote it in the `base_url` function.
        quoted_base_url = quote_base_url(src.url) if src.url else src.url
        while True:
            try:
                fp = replace_invalid_ncr(stream)
                # fp is a Unicode stream
                # The lxml FAQ tells us that it is inefficient to do this
                # http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode
                # but actually it seems just fine as long as you tell the parser to use 'utf-8'!?
                parser = HTMLParser(encoding='utf-8')
                etree.parse(fp, parser=parser, base_url=quoted_base_url)
                break
            except UnicodeDecodeError as exc:
                stream.next_encoding()
    except IOError as exc:
        logger = logging.getLogger(__name__)
        logger.warning("IOError parsing %s (%s)", src.url, exc)

    root = etree.getroot()
    if root is None:
        etree._setroot(UNPARSEABLE)

    return etree
Пример #6
0
def ElementTree(element: GenericElement[AnyStr] = None,
                *,
                file: Union[AnyStr, IO[Any]] = None,
                parser: XMLParser = None) -> GenericElementTree[AnyStr]:
    return cast(Any, _ElementTree(cast(Any, element), file=file,
                                  parser=parser))