def test_default_parser_HTML_broken(self):
    """Parse broken HTML through a temporarily replaced default parser.

    ``self.broken_html_str`` must be rejected with ``XMLSyntaxError`` both
    before the default parser is swapped for an ``HTMLParser`` and after
    ``set_default_parser()`` has been called again without arguments.  In
    between, it must parse and serialise (``method="html"``) to
    ``self.html_str``.
    """
    etree = self.etree

    self.assertRaises(etree.XMLSyntaxError,
                      etree.parse, BytesIO(self.broken_html_str))

    etree.set_default_parser(etree.HTMLParser())
    try:
        tree = etree.parse(BytesIO(self.broken_html_str))
        self.assertEqual(etree.tostring(tree.getroot(), method="html"),
                         self.html_str)
    finally:
        # The default parser is state shared beyond this test: undo the
        # override even when the checks above fail, so that a failure here
        # cannot change how later tests parse their input.
        etree.set_default_parser()

    self.assertRaises(etree.XMLSyntaxError,
                      etree.parse, BytesIO(self.broken_html_str))
def test_void_elements(self):
    """Write each listed void element via ``htmlfile`` and check the output.

    Every tag is expected to be serialised as a bare ``<tag>``.
    ``self._file`` is replaced with a fresh ``BytesIO`` after each check.
    """
    void_tags = ("area", "base", "br", "col", "hr",
                 "img", "input", "link", "meta", "param")
    for tag in void_tags:
        element = etree.Element(tag)
        with etree.htmlfile(self._file) as xf:
            xf.write(element)
        self.assertXml('<%s>' % tag)
        self._file = BytesIO()
def test_xslt_encoding_override(self):
    """Write an XSLT result as UTF-16 although the stylesheet says UTF-8.

    The stylesheet declares ``<xsl:output encoding="UTF-8"/>``; the result
    is written with ``encoding='UTF-16'`` and the decoded output (newlines
    removed) is compared against a document carrying a UTF-16 declaration.
    """
    tree = self.parse(
        _bytes('<a><b>\\uF8D2</b><c>\\uF8D2</c></a>').decode("unicode_escape"))
    style = self.parse(
        '<xsl:stylesheet version="1.0"'
        ' xmlns:xsl="http://www.w3.org/1999/XSL/Transform">'
        ' <xsl:output encoding="UTF-8"/>'
        ' <xsl:template match="/">'
        ' <foo><xsl:value-of select="/a/b/text()" /></foo>'
        ' </xsl:template>'
        ' </xsl:stylesheet>')

    st = etree.XSLT(style)
    res = st(tree)
    expected = _bytes(
        "<?xml version='1.0' encoding='UTF-16'?>"
        "<foo>\\uF8D2</foo>").decode("unicode_escape")

    f = BytesIO()
    res.write(f, encoding='UTF-16')
    # Decoding the byte string directly yields text on Python 2 and 3 alike,
    # so no interpreter-version switch is needed.
    result = f.getvalue().decode('UTF-16').replace('\n', '')
    self.assertEqual(expected, result)
def test_parse_encoding_8bit_explicit(self):
    """Parse ISO-8859-1 encoded HTML with an explicitly configured parser.

    The paragraph text read back from the tree must equal the original
    (non-ASCII) text.
    """
    text = _str('Søk på nettet')
    html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')

    parser = self.etree.HTMLParser(encoding="iso-8859-1")
    tree = self.etree.parse(BytesIO(html_latin1), parser)

    p = tree.find("//p")
    self.assertEqual(p.text, text)
def test_parse_encoding_8bit_override(self):
    """Override a wrong ``charset`` declaration with the parser's encoding.

    The document is encoded as ISO-8859-1 but its ``<meta>`` tag claims
    UTF-8.  Parsing it without an explicit parser is expected to raise
    ``ParseError``; parsing it with ``HTMLParser(encoding="iso-8859-1")``
    must return the original paragraph text.
    """
    text = _str('Søk på nettet')
    wrong_head = _str(
        ' <head>'
        ' <meta http-equiv="Content-Type"'
        ' content="text/html; charset=UTF-8" />'
        ' </head>')
    html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>')
                   % (wrong_head, text)).encode('iso-8859-1')

    self.assertRaises(self.etree.ParseError,
                      self.etree.parse, BytesIO(html_latin1))

    parser = self.etree.HTMLParser(encoding="iso-8859-1")
    tree = self.etree.parse(BytesIO(html_latin1), parser)

    p = tree.find("//p")
    self.assertEqual(p.text, text)
def test_parser_based_lookup(self):
    """Check that a per-parser class lookup only affects that parser.

    A ``ParserBasedElementClassLookup`` is installed as the module-level
    lookup.  Elements parsed with a parser that has its own custom lookup
    must expose ``FIND_ME``; elements parsed without that parser must not.
    """
    class TestElement(etree.ElementBase):
        FIND_ME = "parser_based"

    class MyLookup(etree.CustomElementClassLookup):
        def lookup(self, node_type, document, namespace, name):
            # Every node, whatever its kind or name, maps to TestElement.
            return TestElement

    etree.set_element_class_lookup(etree.ParserBasedElementClassLookup())

    custom_parser = etree.XMLParser()
    custom_parser.set_element_class_lookup(MyLookup())

    # Parsed with the customised parser: both levels carry the marker.
    root = etree.parse(BytesIO(xml_str), custom_parser).getroot()
    self.assertEqual(root.FIND_ME, TestElement.FIND_ME)
    self.assertEqual(root[0].FIND_ME, TestElement.FIND_ME)

    # Parsed without it: no marker attribute anywhere.
    root = etree.parse(BytesIO(xml_str)).getroot()
    self.assertFalse(hasattr(root, 'FIND_ME'))
    self.assertFalse(hasattr(root[0], 'FIND_ME'))
def test_html_iterparse_tag(self):
    """Restrict HTML ``iterparse`` events to the ``p`` and ``title`` tags.

    ``iterator.root`` must be ``None`` before iteration and set afterwards;
    the only events expected are the 'end' events of ``root[0][0]`` and
    ``root[1][0]``.
    """
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    iterator = self.etree.iterparse(source, html=True, tag=["p", "title"])
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    expected_events = [('end', root[0][0]), ('end', root[1][0])]
    self.assertEqual(expected_events, events)
def test_html_iterparse_start(self):
    """Collect only 'start' events from HTML ``iterparse``.

    ``iterator.root`` must be ``None`` before iteration and set afterwards.
    The events are expected in document order: root, ``root[0]``,
    ``root[0][0]``, ``root[1]``, ``root[1][0]``.
    """
    iterparse = self.etree.iterparse
    f = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = iterparse(f, html=True, events=('start',))
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(
        [('start', root), ('start', root[0]), ('start', root[0][0]),
         ('start', root[1]), ('start', root[1][0])],
        events)
def test_xmlschema_iterparse_fail(self):
    """Expect ``XMLSyntaxError`` when ``iterparse`` validates a bad document.

    The schema allows ``<a>`` to contain a ``<b>``; the input contains a
    ``<c>`` instead, so exhausting the iterator must raise.
    """
    schema_doc = self.parse(
        ' <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">'
        ' <xsd:element name="a" type="AType"/>'
        ' <xsd:complexType name="AType">'
        ' <xsd:sequence>'
        ' <xsd:element name="b" type="xsd:string" />'
        ' </xsd:sequence>'
        ' </xsd:complexType>'
        ' </xsd:schema> ')
    validator = etree.XMLSchema(schema_doc)

    invalid_source = BytesIO('<a><c></c></a>')
    self.assertRaises(
        etree.XMLSyntaxError,
        list, etree.iterparse(invalid_source, schema=validator))
def test_relaxng_stringio(self):
    """Build a ``RelaxNG`` validator from a file-like object.

    The schema allows ``<a>`` to hold any number of ``<b>`` elements, so a
    tree with ``<b>`` must validate and a tree with ``<c>`` must not.
    """
    tree_valid = self.parse('<a><b></b></a>')
    tree_invalid = self.parse('<a><c></c></a>')
    schema_file = BytesIO(
        '<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">'
        ' <zeroOrMore>'
        ' <element name="b">'
        ' <text />'
        ' </element>'
        ' </zeroOrMore>'
        ' </element> ')

    schema = etree.RelaxNG(file=schema_file)
    self.assertTrue(schema.validate(tree_valid))
    self.assertFalse(schema.validate(tree_invalid))
def test_dtd_invalid_duplicate_id(self):
    """Validate a document that uses the ID value ``id1`` twice.

    Validation must fail, the DTD's error log must be non-empty, and at
    least one logged message must mention ``id1``.
    """
    document = _bytes(' <a><b id="id1"/><b id="id2"/><b id="id1"/></a> ')
    declarations = _bytes(
        ' <!ELEMENT a (b*)>'
        ' <!ATTLIST b id ID #REQUIRED >'
        ' <!ELEMENT b EMPTY> ')

    root = etree.XML(document)
    dtd = etree.DTD(BytesIO(declarations))

    self.assertFalse(dtd.validate(root))
    self.assertTrue(dtd.error_log)

    id1_errors = [entry for entry in dtd.error_log
                  if 'id1' in entry.message]
    self.assertTrue(id1_errors)
def test_void_elements(self):
    """Write void elements via ``htmlfile`` and expect a bare ``<tag>``.

    Tags from the HTML5 void-element list are checked in sorted order,
    except those the FIXME below excludes.  ``self._file`` is replaced with
    a fresh ``BytesIO`` after each check.
    """
    # http://www.w3.org/TR/html5/syntax.html#elements-0
    html5_void = {
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "keygen", "link", "meta", "param", "source", "track", "wbr",
    }

    # FIXME: These don't get serialized as void elements.
    not_yet_void = {'area', 'embed', 'keygen', 'source', 'track', 'wbr'}

    for tag in sorted(html5_void - not_yet_void):
        element = etree.Element(tag)
        with etree.htmlfile(self._file) as xf:
            xf.write(element)
        self.assertXml('<%s>' % tag)
        self._file = BytesIO()
def test_xml_mode_element_inside_html(self):
    """Nest ``.element()`` calls in an ``htmlfile`` and check the output.

    Author's note carried over from the original comment: the htmlfile
    already outputs in xml mode for ``.element`` calls, so this test
    actually illustrates a bug.
    """
    foo_attrib = {'selected': 'bar'}
    with etree.htmlfile(self._file) as xf:
        with xf.element("root"), xf.element('foo', attrib=foo_attrib):
            pass

    # FIXME: the correct output for the inner element in html mode would be
    #        '<foo selected></foo>'
    expected = (
        '<root>'
        '<foo selected="bar"></foo>'
        '</root>')
    self.assertXml(expected)
    self._file = BytesIO()
def test_xmlschema_iterparse_incomplete(self):
    """Take only the first event from a schema-validating ``iterparse``.

    The iterator is deliberately not exhausted; the first event must be
    the 'end' of ``<b>``.
    """
    schema_doc = self.parse(
        ' <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">'
        ' <xsd:element name="a" type="AType"/>'
        ' <xsd:complexType name="AType">'
        ' <xsd:sequence>'
        ' <xsd:element name="b" type="xsd:string" />'
        ' </xsd:sequence>'
        ' </xsd:complexType>'
        ' </xsd:schema> ')
    validator = etree.XMLSchema(schema_doc)

    source = BytesIO('<a><b></b></a>')
    parse_events = iter(etree.iterparse(source, schema=validator))
    event, element = next(parse_events)

    self.assertEqual('end', event)
    self.assertEqual('b', element.tag)
def test_xmlschema_iterparse(self):
    """Run ``iterparse`` with schema validation over a valid document.

    The document matches the schema, so iteration must complete and yield
    the 'end' events of ``<b>`` and then ``<a>``.
    """
    schema = self.parse(
        ' <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">'
        ' <xsd:element name="a" type="AType"/>'
        ' <xsd:complexType name="AType">'
        ' <xsd:sequence>'
        ' <xsd:element name="b" type="xsd:string" />'
        ' </xsd:sequence>'
        ' </xsd:complexType>'
        ' </xsd:schema> ')
    schema = etree.XMLSchema(schema)

    xml = BytesIO('<a><b></b></a>')
    events = [(event, el.tag)
              for (event, el) in etree.iterparse(xml, schema=schema)]

    self.assertEqual([('end', 'b'), ('end', 'a')], events)
def test_html_iterparse_broken(self):
    """Iterate over malformed HTML with ``iterparse(html=True)``.

    The resulting tree is expected to be ``html`` containing ``head`` and
    ``body``, with ``body > p > br``; the 'end' events must arrive for
    ``root[0][0]``, head, br, p, body and finally the root.
    """
    source = BytesIO('<head><title>TEST></head><p>P<br></div>')
    iterator = self.etree.iterparse(source, html=True)
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    # Tree shape produced from the broken input.
    self.assertEqual('html', root.tag)
    self.assertEqual('head', root[0].tag)
    self.assertEqual('body', root[1].tag)
    self.assertEqual('p', root[1][0].tag)
    self.assertEqual('br', root[1][0][0].tag)

    # Order in which the elements were closed.
    closed_in_order = [
        root[0][0],
        root[0],
        root[1][0][0],
        root[1][0],
        root[1],
        root,
    ]
    self.assertEqual([('end', node) for node in closed_in_order], events)
def test_xml_mode_write_inside_html(self):
    """Write one element three ways inside an ``htmlfile`` root element.

    1. default method, 2. ``method='xml'`` with ``text`` being ``None``,
    3. ``method='xml'`` with ``text`` set to the empty string.  The numbers
    match the markers on the expected output below.
    """
    foo = etree.Element("foo", attrib={'selected': 'bar'})

    with etree.htmlfile(self._file) as xf, xf.element("root"):
        xf.write(foo)  # 1

        assert foo.text is None
        xf.write(foo, method='xml')  # 2

        foo.text = ""
        xf.write(foo, method='xml')  # 3

    expected = (
        '<root>'
        '<foo selected></foo>'  # 1
        '<foo selected="bar"/>'  # 2
        '<foo selected="bar"></foo>'  # 3
        '</root>')
    self.assertXml(expected)
    self._file = BytesIO()
def test_xmlschema_stringio(self):
    """Build an ``XMLSchema`` from a file-like object and parse with it.

    A parser configured with the schema must accept ``<a><b/></a>`` and
    raise ``XMLSyntaxError`` for a document containing ``<c>`` instead.
    """
    schema_source = BytesIO(
        ' <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">'
        ' <xsd:element name="a" type="AType"/>'
        ' <xsd:complexType name="AType">'
        ' <xsd:sequence>'
        ' <xsd:element name="b" type="xsd:string" />'
        ' </xsd:sequence>'
        ' </xsd:complexType>'
        ' </xsd:schema> ')
    validating_parser = etree.XMLParser(
        schema=etree.XMLSchema(file=schema_source))

    tree_valid = self.parse('<a><b></b></a>', parser=validating_parser)
    self.assertEqual('a', tree_valid.getroot().tag)

    self.assertRaises(etree.XMLSyntaxError,
                      self.parse, '<a><c></c></a>',
                      parser=validating_parser)
def test_html_iterparse_stop_short(self):
    """Abandon an HTML ``iterparse`` iterator after two events.

    The first two events must be the 'end' of ``title`` and of ``head``;
    ``iterator.root`` must still be ``None`` at each step.  The element and
    the iterator are explicitly deleted, as in the original test.
    """
    source = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')
    iterator = self.etree.iterparse(source, html=True)
    self.assertEqual(None, iterator.root)

    for expected_tag in ('title', 'head'):
        event, element = next(iterator)
        self.assertEqual('end', event)
        self.assertEqual(expected_tag, element.tag)
        self.assertEqual(None, iterator.root)
        del element

    del iterator
def test_write_compressed_text(self):
    """Write a tree with ``method='text'`` and ``compression=9``.

    The output is read back through ``gzip`` and must equal the
    concatenated text and tail content of the tree.
    """
    et = self.etree
    text = _str("qwrtioüöä")

    root = et.Element('root')
    root.text = text
    first_sub = et.SubElement(root, 'sub')
    first_sub.text = 'TEXT'
    first_sub.tail = 'TAIL'
    et.SubElement(root, 'sub').text = text

    out = BytesIO()
    et.ElementTree(root).write(
        out, method='text', encoding='utf8', compression=9)
    out.seek(0)

    with gzip.GzipFile(fileobj=out) as gz:
        result = gz.read().decode('utf8')

    self.assertEqual(text + 'TEXTTAIL' + text, result)
class ETreeXMLSchemaResolversTestCase(HelperTestCase):
    """XML Schema tests whose ``xsd:import`` targets are served by resolvers.

    The "internal" schemas import an "external" schema through the
    placeholder locations ``XXX.xsd`` / ``YYY.xsd``; the resolvers defined
    in the tests answer those requests with ``resolver_schema_ext``.
    """

    # NOTE(review): these two streams are class attributes and therefore
    # shared by every test below; the tests appear to rely on etree.parse()
    # being able to read the same BytesIO more than once - confirm.
    resolver_schema_int = BytesIO(
        '<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"'
        ' xmlns:etype="http://codespeak.net/lxml/test/external"'
        ' targetNamespace="http://codespeak.net/lxml/test/internal">'
        ' <xsd:import namespace="http://codespeak.net/lxml/test/external"'
        ' schemaLocation="XXX.xsd" />'
        ' <xsd:element name="a" type="etype:AType"/>'
        ' </xsd:schema>')

    resolver_schema_int2 = BytesIO(
        '<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"'
        ' xmlns:etype="http://codespeak.net/lxml/test/external"'
        ' targetNamespace="http://codespeak.net/lxml/test/internal">'
        ' <xsd:import namespace="http://codespeak.net/lxml/test/external"'
        ' schemaLocation="YYY.xsd" />'
        ' <xsd:element name="a" type="etype:AType"/>'
        ' </xsd:schema>')

    resolver_schema_ext = (
        '<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"'
        ' targetNamespace="http://codespeak.net/lxml/test/external">'
        ' <xsd:complexType name="AType">'
        ' <xsd:sequence><xsd:element name="b" type="xsd:string"'
        ' minOccurs="0" maxOccurs="unbounded" /></xsd:sequence>'
        ' </xsd:complexType>'
        ' </xsd:schema>')

    class simple_resolver(etree.Resolver):
        """Answer requests for ``XXX.xsd`` with a fixed schema string."""

        def __init__(self, schema):
            self.schema = schema

        def resolve(self, url, id, context):
            assert url == 'XXX.xsd'
            return self.resolve_string(self.schema, context)

    # tests:

    def test_xmlschema_resolvers(self):
        """Test that resolvers work with schema."""
        parser = etree.XMLParser()
        parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext))
        schema_doc = etree.parse(self.resolver_schema_int, parser=parser)
        # Constructing the schema is the check: it must not raise.
        etree.XMLSchema(schema_doc)

    def test_xmlschema_resolvers_root(self):
        """Test that the default resolver will get called if there's no
        specific parser resolver."""
        root_resolver = self.simple_resolver(self.resolver_schema_ext)
        default_resolvers = etree.get_default_parser().resolvers
        default_resolvers.add(root_resolver)
        try:
            schema_doc = etree.parse(self.resolver_schema_int)
            etree.XMLSchema(schema_doc)
        finally:
            # Unregister even on failure so the default parser does not
            # keep this resolver for the tests that run afterwards.
            default_resolvers.remove(root_resolver)

    def test_xmlschema_resolvers_noroot(self):
        """Test that the default resolver will not get called when a more
        specific resolver is registered."""

        class res_root(etree.Resolver):
            def resolve(self, url, id, context):
                assert False
                return None

        root_resolver = res_root()
        default_resolvers = etree.get_default_parser().resolvers
        default_resolvers.add(root_resolver)
        try:
            parser = etree.XMLParser()
            parser.resolvers.add(
                self.simple_resolver(self.resolver_schema_ext))
            schema_doc = etree.parse(self.resolver_schema_int, parser=parser)
            etree.XMLSchema(schema_doc)
        finally:
            # Same clean-up guarantee as in test_xmlschema_resolvers_root.
            default_resolvers.remove(root_resolver)

    def test_xmlschema_nested_resolvers(self):
        """Test that resolvers work in a nested fashion."""
        resolver_schema = self.resolver_schema_ext

        class res_nested(etree.Resolver):
            def __init__(self, ext_schema):
                self.ext_schema = ext_schema

            def resolve(self, url, id, context):
                assert url == 'YYY.xsd'
                return self.resolve_string(self.ext_schema, context)

        class res(etree.Resolver):
            def __init__(self, ext_schema_1, ext_schema_2):
                self.ext_schema_1 = ext_schema_1
                self.ext_schema_2 = ext_schema_2

            def resolve(self, url, id, context):
                assert url == 'XXX.xsd'

                # While resolving the outer import, build a second schema
                # with its own parser and resolver.
                new_parser = etree.XMLParser()
                new_parser.resolvers.add(res_nested(self.ext_schema_2))
                new_schema_doc = etree.parse(self.ext_schema_1,
                                             parser=new_parser)
                etree.XMLSchema(new_schema_doc)

                return self.resolve_string(resolver_schema, context)

        parser = etree.XMLParser()
        parser.resolvers.add(
            res(self.resolver_schema_int2, self.resolver_schema_ext))
        schema_doc = etree.parse(self.resolver_schema_int, parser=parser)
        etree.XMLSchema(schema_doc)
def test_dtd_invalid(self):
    """``assertValid`` must raise ``DocumentInvalid`` for a non-empty ``<b>``.

    The DTD declares ``b`` as EMPTY while the document nests ``<a/>`` in it.
    """
    strict_dtd = etree.DTD(BytesIO("<!ELEMENT b EMPTY>"))
    document = etree.XML("<b><a/></b>")
    self.assertRaises(
        etree.DocumentInvalid,
        strict_dtd.assertValid, document)
def test_dtd_assertValid(self):
    """``assertValid`` must not raise for a document matching its DTD."""
    declarations = BytesIO("<!ELEMENT b (a)><!ELEMENT a EMPTY>")
    document = etree.XML("<b><a/></b>")
    etree.DTD(declarations).assertValid(document)
def test_dtd_broken(self):
    """Constructing a ``DTD`` from a malformed declaration must fail.

    ``HONKEY`` is not a valid content model, so ``DTDParseError`` is
    expected.
    """
    broken_declaration = BytesIO("<!ELEMENT b HONKEY>")
    self.assertRaises(
        etree.DTDParseError,
        etree.DTD, broken_declaration)
def test_dtd_stringio(self):
    """A ``DTD`` built from a file-like object must validate ``<b/>``."""
    document = etree.XML(_bytes("<b/>"))
    declarations = BytesIO("<!ELEMENT b EMPTY>")
    validator = etree.DTD(declarations)
    self.assertTrue(validator.validate(document))
def test_html_iterparse_broken_no_recover(self):
    """HTML ``iterparse`` with ``recover=False`` must reject broken markup.

    Exhausting the iterator over the malformed input is expected to raise
    ``XMLSyntaxError``.
    """
    broken = BytesIO('<p>P<br></div>')
    events = self.etree.iterparse(broken, html=True, recover=False)
    self.assertRaises(self.etree.XMLSyntaxError, list, events)
def test_module_parse_html_error(self):
    """``parse()`` with a non-recovering ``HTMLParser`` must raise.

    The input closes a ``body`` that was never opened; ``XMLSyntaxError``
    is expected.
    """
    etree = self.etree
    strict_parser = etree.HTMLParser(recover=False)
    source = BytesIO("<html></body>")
    self.assertRaises(
        etree.XMLSyntaxError,
        etree.parse, source, strict_parser)
def run_parse():
    """Parse ``xml`` and append the root's first and last child to ``result``.

    ``self``, ``xml`` and ``result`` are taken from the enclosing scope.
    """
    document = self.etree.parse(BytesIO(xml))
    root_element = document.getroot()
    result.append(root_element[0])
    result.append(root_element[-1])
expr = etree.ETXPath(_bytes("/a/{http://nsa/\\uf8d2}b").decode("unicode_escape")) r = expr(x) self.assertEquals(1, len(r)) self.assertEquals(_bytes('{http://nsa/\\uf8d2}b').decode("unicode_escape"), r[0].tag) expr = etree.ETXPath(_bytes("/a/{http://nsb/\\uf8d1}b").decode("unicode_escape")) r = expr(x) self.assertEquals(1, len(r)) self.assertEquals(_bytes('{http://nsb/\\uf8d1}b').decode("unicode_escape"), r[0].tag) SAMPLE_XML = etree.parse(BytesIO(""" <body> <tag>text</tag> <section> <tag>subtext</tag> </section> <tag /> <tag /> </body> """)) def tag(elem): return elem.tag def tag_or_value(elem): return getattr(elem, 'tag', elem) def stringTest(ctxt, s1): return "Hello "+s1 def stringListTest(ctxt, s1):
def test_module_parse_html_norecover(self):
    """``parse()`` must reject ``self.broken_html_str`` without recovery.

    With ``HTMLParser(recover=False)`` the broken document is expected to
    raise ``XMLSyntaxError``.
    """
    etree = self.etree
    strict_parser = etree.HTMLParser(recover=False)
    source = BytesIO(self.broken_html_str)
    self.assertRaises(
        etree.XMLSyntaxError,
        etree.parse, source, strict_parser)