def test_invalid_decl_1(self):
    # Assigning a public ID that contains a non-ASCII character to the
    # docinfo of a freshly created tree is expected to raise ValueError.
    info = etree.Element('test').getroottree().docinfo

    def assign_public_id(public_id):
        info.public_id = public_id

    self.assertRaises(ValueError, assign_public_id, _str('ä'))
    self.assertRaises(ValueError, assign_public_id, _str('qwerty ä asdf'))
def test_parse_encoding_8bit_explicit(self):
    # Parse Latin-1 encoded HTML with a parser that is explicitly told the
    # encoding and check that the paragraph text round-trips.
    text = _str('Søk på nettet')
    html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')

    tree = self.etree.parse(
        BytesIO(html_latin1),
        self.etree.HTMLParser(encoding="iso-8859-1"))
    p = tree.find("//p")
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(p.text, text)
def test_parse_encoding_8bit_explicit(self):
    # Latin-1 encoded HTML, parsed with an HTMLParser that was given the
    # encoding explicitly, must yield the original text in <p>.
    expected = _str('Søk på nettet')
    markup = _str('<p>%s</p>') % expected
    html_latin1 = markup.encode('iso-8859-1')

    document = self.etree.parse(
        BytesIO(html_latin1),
        self.etree.HTMLParser(encoding="iso-8859-1"))
    paragraph = document.find("//p")
    self.assertEqual(paragraph.text, expected)
def test_uniname(self):
    # Element() must refuse these tag names with ValueError, while the
    # final name has to be accepted.
    def build(tag_name):
        return etree.Element(tag_name)

    self.assertRaises(ValueError, build, ':')
    self.assertRaises(ValueError, build, '0a')
    self.assertRaises(ValueError, build, _str('\u203f'))

    # should not Raise
    build(_str('\u0132'))
def test_etree_parse_io_error(self):
    # Parsing a file that does not exist must raise IOError, first inside
    # a temporary directory with an ASCII prefix, then inside one whose
    # prefix is non-ASCII.
    # this is a directory name that contains characters beyond latin-1
    dirnameEN = _str('Directory')
    dirnameRU = _str('Каталог')
    filename = _str('nosuchfile.xml')

    for prefix in (dirnameEN, dirnameRU):
        tmp_dir = tempfile.mkdtemp(prefix=prefix)
        try:
            missing_path = os.path.join(tmp_dir, filename)
            self.assertRaises(IOError, self.etree.parse, missing_path)
        finally:
            # the directory is still empty, so rmdir is sufficient
            os.rmdir(tmp_dir)
def test_parse_encoding_8bit_override(self):
    # The document is Latin-1 encoded but its <meta> tag claims UTF-8.
    # The default parse is expected to fail, while a parser with an
    # explicit encoding must decode the text correctly.
    text = _str('Søk på nettet')
    wrong_head = _str(''' <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> </head>''')
    html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>')
                   % (wrong_head, text)).encode('iso-8859-1')

    self.assertRaises(self.etree.ParseError,
                      self.etree.parse,
                      BytesIO(html_latin1))

    tree = self.etree.parse(
        BytesIO(html_latin1),
        self.etree.HTMLParser(encoding="iso-8859-1"))
    p = tree.find("//p")
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(p.text, text)
def test_unicode_text(self):
    # Setting .text must raise ValueError for the unicode and byte strings
    # listed below and must accept the two "should not Raise" values.
    elem = etree.Element('e')

    def assign_text(value):
        elem.text = value

    invalid_unicode = (
        _str('ab\ufffe'),
        # NOTE(review): '\f' is a form feed followed by 'fff'; '\uffff'
        # may have been intended here - confirm before changing.
        _str('ö\ffff'),
        _str('\u0123\ud800'),
        _str('x\ud8ff'),
        _str('\U00010000\udfff'),
        _str('abd\x00def'),
    )
    for rejected in invalid_unicode:
        self.assertRaises(ValueError, assign_text, rejected)

    # should not Raise
    assign_text(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas'))

    # every code point of the surrogate range, at the end, alone and at
    # the start of the text
    for code_point in range(0xD800, 0xDFFF + 1):
        self.assertRaises(ValueError, assign_text, 'abc' + _chr(code_point))
        self.assertRaises(ValueError, assign_text, _chr(code_point))
        self.assertRaises(ValueError, assign_text, _chr(code_point) + 'abc')

    for raw in ('\xe4', '\x80', '\xff', '\x08', '\x19', '\x20\x00'):
        self.assertRaises(ValueError, assign_text, _bytes(raw))

    # should not Raise
    assign_text(_bytes('\x09\x0A\x0D\x20\x60\x7f'))
def test_unicode_text(self):
    # Exercises Element.text assignment: the values passed to
    # assert_rejected() must raise ValueError, the directly stored ones
    # must be accepted.
    target = etree.Element('e')

    def store(content):
        target.text = content

    def assert_rejected(content):
        self.assertRaises(ValueError, store, content)

    assert_rejected(_str('ab\ufffe'))
    # NOTE(review): '\f' is a form feed followed by 'fff'; '\uffff' may
    # have been intended here - confirm before changing.
    assert_rejected(_str('ö\ffff'))
    assert_rejected(_str('\u0123\ud800'))
    assert_rejected(_str('x\ud8ff'))
    assert_rejected(_str('\U00010000\udfff'))
    assert_rejected(_str('abd\x00def'))

    # should not Raise
    store(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas'))

    first_surrogate, last_surrogate = 0xD800, 0xDFFF
    for char_val in range(first_surrogate, last_surrogate + 1):
        assert_rejected('abc' + _chr(char_val))
        assert_rejected(_chr(char_val))
        assert_rejected(_chr(char_val) + 'abc')

    assert_rejected(_bytes('\xe4'))
    assert_rejected(_bytes('\x80'))
    assert_rejected(_bytes('\xff'))
    assert_rejected(_bytes('\x08'))
    assert_rejected(_bytes('\x19'))
    assert_rejected(_bytes('\x20\x00'))

    # should not Raise
    store(_bytes('\x09\x0A\x0D\x20\x60\x7f'))
def test_parse_encoding_8bit_override(self):
    # Latin-1 bytes with a <meta> tag declaring UTF-8: parsing without an
    # explicit parser is expected to raise ParseError, while an HTMLParser
    # given the real encoding must recover the text.
    expected = _str('Søk på nettet')
    wrong_head = _str(''' <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> </head>''')
    page = _str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, expected)
    html_latin1 = page.encode('iso-8859-1')

    self.assertRaises(
        self.etree.ParseError, self.etree.parse, BytesIO(html_latin1))

    document = self.etree.parse(
        BytesIO(html_latin1),
        self.etree.HTMLParser(encoding="iso-8859-1"))
    paragraph = document.find("//p")
    self.assertEqual(paragraph.text, expected)
def test_parse_utf8_bom(self):
    # Write a UTF-8 document preceded by a byte order mark to a temporary
    # file and check that parsing it by file name yields the right text.
    utext = _str('Søk på nettet')
    uxml = '<?xml version="1.0" encoding="UTF-8"?><p>%s</p>' % utext
    bom = _bytes('\\xEF\\xBB\\xBF').decode(
        "unicode_escape").encode("latin1")
    self.assertEqual(3, len(bom))

    # delete=False plus an explicit close(): the file is re-opened by name
    # below, which does not work on Windows while the temporary file is
    # still open. It is removed manually once parsing is done.
    f = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            f.write(bom)
            f.write(uxml.encode("utf-8"))
        finally:
            f.close()
        tree = self.etree.parse(f.name)
    finally:
        os.unlink(f.name)
    self.assertEqual(utext, tree.getroot().text)
def test_iterparse_utf8_bom(self):
    # Write a UTF-8 document preceded by a byte order mark to a temporary
    # file and check that iterparse() by file name yields one element
    # carrying the right text.
    utext = _str('Søk på nettet')
    uxml = '<?xml version="1.0" encoding="UTF-8"?><p>%s</p>' % utext
    bom = _bytes('\\xEF\\xBB\\xBF').decode(
        "unicode_escape").encode("latin1")
    self.assertEqual(3, len(bom))

    # delete=False plus an explicit close(): the file is re-opened by name
    # below, which does not work on Windows while the temporary file is
    # still open. It is removed manually once parsing is done.
    f = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            f.write(bom)
            f.write(uxml.encode("utf-8"))
        finally:
            f.close()
        elements = [el for _, el in self.etree.iterparse(f.name)]
        self.assertEqual(1, len(elements))
        root = elements[0]
    finally:
        os.unlink(f.name)
    self.assertEqual(utext, root.text)
def test_parse_utf8_bom(self):
    # A UTF-8 file that starts with a byte order mark is written to disk,
    # closed, parsed by name and finally removed again.
    expected = _str('Søk på nettet')
    document = '<?xml version="1.0" encoding="UTF-8"?><p>%s</p>' % expected
    escaped_bom = _bytes('\\xEF\\xBB\\xBF')
    bom = escaped_bom.decode("unicode_escape").encode("latin1")
    self.assertEqual(3, len(bom))

    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            tmp.write(bom)
            tmp.write(document.encode("utf-8"))
        finally:
            # close before parsing so the file can be re-opened by name
            tmp.close()
        parsed = self.etree.parse(tmp.name)
    finally:
        os.unlink(tmp.name)

    self.assertEqual(expected, parsed.getroot().text)
def test_iterparse_utf8_bom(self):
    # A UTF-8 file that starts with a byte order mark is written to disk,
    # closed, run through iterparse() by name and finally removed again.
    expected = _str('Søk på nettet')
    document = '<?xml version="1.0" encoding="UTF-8"?><p>%s</p>' % expected
    escaped_bom = _bytes('\\xEF\\xBB\\xBF')
    bom = escaped_bom.decode("unicode_escape").encode("latin1")
    self.assertEqual(3, len(bom))

    tmp = NamedTemporaryFile(delete=False)
    try:
        try:
            tmp.write(bom)
            tmp.write(document.encode("utf-8"))
        finally:
            # close before parsing so the file can be re-opened by name
            tmp.close()
        seen = [element for _, element in self.etree.iterparse(tmp.name)]
        self.assertEqual(1, len(seen))
        root = seen[0]
    finally:
        os.unlink(tmp.name)

    self.assertEqual(expected, root.text)
def test_iterparse_utf16_bom(self):
    # Write a UTF-16 document (whose encoded form starts with a byte order
    # mark) to a temporary file and check that iterparse() by file name
    # yields one element carrying the right text.
    utext = _str('Søk på nettet')
    uxml = '<?xml version="1.0" encoding="UTF-16"?><p>%s</p>' % utext
    # both possible UTF-16 BOMs, separated by a single space
    boms = _bytes('\\xFE\\xFF \\xFF\\xFE').decode(
        "unicode_escape").encode("latin1")
    self.assertEqual(5, len(boms))
    xml = uxml.encode("utf-16")
    self.assertTrue(xml[:2] in boms, repr(xml[:2]))

    # delete=False plus an explicit close(): the file is re-opened by name
    # below, which does not work on Windows while the temporary file is
    # still open. It is removed manually once parsing is done.
    f = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            f.write(xml)
        finally:
            f.close()
        elements = [el for _, el in self.etree.iterparse(f.name)]
        self.assertEqual(1, len(elements))
        root = elements[0]
    finally:
        os.unlink(f.name)
    self.assertEqual(utext, root.text)
def test_iterparse_utf16_bom(self):
    # A UTF-16 encoded document is written to disk, closed, run through
    # iterparse() by name and finally removed again.
    expected = _str('Søk på nettet')
    document = '<?xml version="1.0" encoding="UTF-16"?><p>%s</p>' % expected
    # both possible UTF-16 BOMs, separated by a single space
    escaped_boms = _bytes('\\xFE\\xFF \\xFF\\xFE')
    boms = escaped_boms.decode("unicode_escape").encode("latin1")
    self.assertEqual(5, len(boms))

    encoded = document.encode("utf-16")
    leading_bytes = encoded[:2]
    self.assertTrue(leading_bytes in boms, repr(leading_bytes))

    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            tmp.write(encoded)
        finally:
            # close before parsing so the file can be re-opened by name
            tmp.close()
        seen = [element for _, element in self.etree.iterparse(tmp.name)]
        self.assertEqual(1, len(seen))
        root = seen[0]
    finally:
        os.unlink(tmp.name)

    self.assertEqual(expected, root.text)
def test_write_compressed_text(self):
    # Serialise a small tree with method='text' and compression=9 into a
    # buffer, gunzip it and compare against the concatenated text content.
    etree_mod = self.etree
    text = _str("qwrtioüöä")

    root = etree_mod.Element('root')
    root.text = text
    first = etree_mod.SubElement(root, 'sub')
    first.text = 'TEXT'
    first.tail = 'TAIL'
    etree_mod.SubElement(root, 'sub').text = text

    buf = BytesIO()
    etree_mod.ElementTree(root).write(
        buf, method='text', encoding='utf8', compression=9)
    buf.seek(0)

    unzipper = gzip.GzipFile(fileobj=buf)
    try:
        decoded = unzipper.read().decode('utf8')
    finally:
        unzipper.close()

    self.assertEqual(text+'TEXTTAIL'+text, decoded)
def test_tree_io_latin1(self):
    # Write a tree as ISO-8859-1, read it back, write it again to a second
    # file and check that both files contain exactly the same bytes.
    Element = self.etree.Element
    ElementTree = self.etree.ElementTree

    element = Element('top')
    element.text = _str("qwrtioüöäßá")
    tree = ElementTree(element)
    self.buildNodes(element, 10, 3)

    # Each file is closed in a finally clause so that no handle leaks when
    # a write, parse or read step raises.
    f = open(self.getTestFilePath('testdump.xml'), 'wb')
    try:
        tree.write(f, encoding='iso-8859-1')
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump.xml'), 'rb')
    try:
        tree = ElementTree(file=f)
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump2.xml'), 'wb')
    try:
        tree.write(f, encoding='iso-8859-1')
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump.xml'), 'rb')
    try:
        data1 = f.read()
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump2.xml'), 'rb')
    try:
        data2 = f.read()
    finally:
        f.close()

    self.assertEqual(data1, data2)
def test_write_compressed_text(self):
    # The gzip-compressed text serialisation of the tree, once
    # decompressed and decoded, must equal the text of root, child, tail
    # and second child in document order.
    make_element = self.etree.Element
    make_child = self.etree.SubElement
    make_tree = self.etree.ElementTree

    text = _str("qwrtioüöä")

    top = make_element('root')
    top.text = text
    sub = make_child(top, 'sub')
    sub.text = 'TEXT'
    sub.tail = 'TAIL'
    make_child(top, 'sub').text = text

    document = make_tree(top)
    compressed = BytesIO()
    document.write(compressed, method='text', encoding='utf8',
                   compression=9)
    compressed.seek(0)

    reader = gzip.GzipFile(fileobj=compressed)
    try:
        payload = reader.read()
        result = payload.decode('utf8')
    finally:
        reader.close()

    self.assertEqual(text + 'TEXTTAIL' + text, result)
def test_tree_io_latin1(self):
    # Write a tree as ISO-8859-1, read it back, write it again to a second
    # file and check that both files contain exactly the same bytes.
    Element = self.etree.Element
    ElementTree = self.etree.ElementTree

    element = Element('top')
    element.text = _str("qwrtioüöäßá")
    tree = ElementTree(element)
    self.buildNodes(element, 10, 3)

    # Each file is closed in a finally clause so that no handle leaks when
    # a write, parse or read step raises.
    f = open(self.getTestFilePath('testdump.xml'), 'wb')
    try:
        tree.write(f, encoding='iso-8859-1')
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump.xml'), 'rb')
    try:
        tree = ElementTree(file=f)
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump2.xml'), 'wb')
    try:
        tree.write(f, encoding='iso-8859-1')
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump.xml'), 'rb')
    try:
        data1 = f.read()
    finally:
        f.close()

    f = open(self.getTestFilePath('testdump2.xml'), 'rb')
    try:
        data2 = f.read()
    finally:
        f.close()

    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(data1, data2)
def test_unicode_nstag_invalid(self):
    # A namespaced tag whose local name is invalid_tag must be rejected.
    # sadly, Klingon is not well-formed
    namespaced_tag = _str("{http://abc/}%s") % invalid_tag
    self.assertRaises(
        ValueError,
        etree.Element, namespaced_tag)
def test_unicode_ns_invalid(self):
    # A tag whose namespace URI embeds the non-ASCII string `uni` must be
    # rejected.
    # namespace URIs must conform to RFC 3986
    namespaced_tag = _str("{http://%s/}abc") % uni
    self.assertRaises(
        ValueError,
        etree.Element, namespaced_tag)
def test_unicode_nstag(self):
    # An element created with a non-ASCII local name in a namespace must
    # report exactly that tag.
    tag = _str("{http://abc/}%s") % uni
    el = etree.Element(tag)
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(tag, el.tag)
def test_unicode_repr4(self):
    # repr() of an Entity with a non-ASCII name only has to complete
    # without raising; its result is not inspected.
    entity = etree.Entity(_str('ä'))
    repr(entity)
def test_unicode_nstag(self):
    # An element created with `uni` as both namespace and local name must
    # report exactly that tag.
    tag = _str("{%s}%s") % (uni, uni)
    el = etree.Element(tag)
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(tag, el.tag)
class HtmlParserTestCase(HelperTestCase):
    """HTML parser test cases
    """
    etree = etree

    # Fixtures shared by the tests below: a well-formed page, its
    # pretty-printed serialisation, a broken variant and a unicode variant.
    html_str = _bytes(
        "<html><head><title>test</title></head><body><h1>page title</h1></body></html>"
    )
    html_str_pretty = _bytes("""\
<html>
<head><title>test</title></head>
<body><h1>page title</h1></body>
</html>
""")
    broken_html_str = _bytes(
        "<html><head><title>test<body><h1>page title</h3></p></html>")
    uhtml_str = _str(
        "<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>"
    )

    def tearDown(self):
        super(HtmlParserTestCase, self).tearDown()
        # reset the default parser, which some tests replace
        self.etree.set_default_parser()

    def test_module_HTML(self):
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element, method="html"),
                         self.html_str)

    def test_module_HTML_unicode(self):
        element = self.etree.HTML(self.uhtml_str)
        self.assertEqual(
            unicode(
                self.etree.tostring(element, method="html", encoding='UTF8'),
                'UTF8'),
            unicode(self.uhtml_str.encode('UTF8'), 'UTF8'))

    def test_module_HTML_pretty_print(self):
        element = self.etree.HTML(self.html_str)
        self.assertEqual(
            self.etree.tostring(element, method="html", pretty_print=True),
            self.html_str_pretty)

    def test_module_parse_html_error(self):
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = BytesIO("<html></body>")
        self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser)

    def test_html_element_name_empty(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        el = Element('name')
        self.assertRaises(ValueError, Element, '{}')
        self.assertRaises(ValueError, setattr, el, 'tag', '{}')

        self.assertRaises(ValueError, Element, '{test}')
        self.assertRaises(ValueError, setattr, el, 'tag', '{test}')

    def test_html_element_name_colon(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        pname = Element('p:name')
        self.assertEqual(pname.tag, 'p:name')

        pname = Element('{test}p:name')
        self.assertEqual(pname.tag, '{test}p:name')

        pname = Element('name')
        pname.tag = 'p:name'
        self.assertEqual(pname.tag, 'p:name')

    def test_html_element_name_quote(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        self.assertRaises(ValueError, Element, 'p"name')
        self.assertRaises(ValueError, Element, "na'me")
        self.assertRaises(ValueError, Element, '{test}"name')
        self.assertRaises(ValueError, Element, "{test}name'")

        el = Element('name')
        self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
        self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
        # the failed assignments must leave the tag untouched
        self.assertEqual(el.tag, "name")

    def test_html_element_name_space(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        self.assertRaises(ValueError, Element, ' name ')
        self.assertRaises(ValueError, Element, 'na me')
        self.assertRaises(ValueError, Element, '{test} name')

        el = Element('name')
        self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
        # the failed assignment must leave the tag untouched
        self.assertEqual(el.tag, "name")

    def test_html_subelement_name_empty(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, '{}')
        self.assertRaises(ValueError, SubElement, el, '{test}')

    def test_html_subelement_name_colon(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        pname = SubElement(el, 'p:name')
        self.assertEqual(pname.tag, 'p:name')

        pname = SubElement(el, '{test}p:name')
        self.assertEqual(pname.tag, '{test}p:name')

    def test_html_subelement_name_quote(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, "name'")
        self.assertRaises(ValueError, SubElement, el, 'na"me')
        self.assertRaises(ValueError, SubElement, el, "{test}na'me")
        self.assertRaises(ValueError, SubElement, el, '{test}"name')

    def test_html_subelement_name_space(self):
        parser = self.etree.HTMLParser()
        Element = parser.makeelement

        SubElement = self.etree.SubElement

        el = Element('name')
        self.assertRaises(ValueError, SubElement, el, ' name ')
        self.assertRaises(ValueError, SubElement, el, 'na me')
        self.assertRaises(ValueError, SubElement, el, '{test} name')

    def test_module_parse_html_norecover(self):
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = BytesIO(self.broken_html_str)
        self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser)

    def test_parse_encoding_8bit_explicit(self):
        text = _str('Søk på nettet')
        html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')

        tree = self.etree.parse(BytesIO(html_latin1),
                                self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_parse_encoding_8bit_override(self):
        text = _str('Søk på nettet')
        # the <meta> tag claims UTF-8 although the bytes are Latin-1
        wrong_head = _str(''' <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> </head>''')
        html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>')
                       % (wrong_head, text)).encode('iso-8859-1')

        self.assertRaises(self.etree.ParseError,
                          self.etree.parse,
                          BytesIO(html_latin1))

        tree = self.etree.parse(BytesIO(html_latin1),
                                self.etree.HTMLParser(encoding="iso-8859-1"))
        p = tree.find("//p")
        self.assertEqual(p.text, text)

    def test_module_HTML_broken(self):
        element = self.etree.HTML(self.broken_html_str)
        self.assertEqual(self.etree.tostring(element, method="html"),
                         self.html_str)

    def test_module_HTML_cdata(self):
        # by default, libxml2 generates CDATA nodes for <script> content
        html = _bytes('<html><head><style>foo</style></head></html>')
        element = self.etree.HTML(html)
        self.assertEqual(element[0][0].text, "foo")

    def test_module_HTML_access(self):
        element = self.etree.HTML(self.html_str)
        self.assertEqual(element[0][0].tag, 'title')

    def test_module_parse_html(self):
        parser = self.etree.HTMLParser()
        filename = tempfile.mktemp(suffix=".html")
        write_to_file(filename, self.html_str, 'wb')
        try:
            f = open(filename, 'rb')
            try:
                tree = self.etree.parse(f, parser)
            finally:
                # close even if parsing fails, so the handle does not leak
                # and the file can be removed below
                f.close()
            self.assertEqual(
                self.etree.tostring(tree.getroot(), method="html"),
                self.html_str)
        finally:
            os.remove(filename)

    def test_module_parse_html_filelike(self):
        parser = self.etree.HTMLParser()
        f = SillyFileLike(self.html_str)
        tree = self.etree.parse(f, parser)
        html = self.etree.tostring(tree.getroot(),
                                   method="html", encoding='UTF-8')
        self.assertEqual(html, self.html_str)

##     def test_module_parse_html_filelike_unicode(self):
##         parser = self.etree.HTMLParser()
##         f = SillyFileLike(self.uhtml_str)
##         tree = self.etree.parse(f, parser)
##         html = self.etree.tostring(tree.getroot(), encoding='UTF-8')
##         self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)

    def test_html_file_error(self):
        parser = self.etree.HTMLParser()
        parse = self.etree.parse
        self.assertRaises(IOError,
                          parse, "__some_hopefully_nonexisting_file__.html",
                          parser)

    def test_default_parser_HTML_broken(self):
        # with the standard default parser the broken input must fail
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, BytesIO(self.broken_html_str))

        self.etree.set_default_parser(self.etree.HTMLParser())

        tree = self.etree.parse(BytesIO(self.broken_html_str))
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)

        self.etree.set_default_parser()

        # after the reset the broken input must fail again
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, BytesIO(self.broken_html_str))

    def test_html_iterparse(self):
        iterparse = self.etree.iterparse
        f = BytesIO(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>')

        iterator = iterparse(f, html=True)
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(
            [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
             ('end', root[1]), ('end', root)],
            events)

    def test_html_iterparse_file(self):
        iterparse = self.etree.iterparse
        iterator = iterparse(fileInTestDir("css_shakespear.html"),
                             html=True)

        self.assertEqual(None, iterator.root)
        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(249, len(events))
        # only 'end' events are expected
        self.assertEqual(
            [],
            [event for (event, element) in events if event != 'end'])

    def test_html_iterparse_start(self):
        iterparse = self.etree.iterparse
        f = BytesIO(
            '<html><head><title>TITLE</title><body><p>P</p></body></html>')

        iterator = iterparse(f, html=True, events=('start', ))
        self.assertEqual(None, iterator.root)

        events = list(iterator)
        root = iterator.root
        self.assertTrue(root is not None)
        self.assertEqual(
            [('start', root), ('start', root[0]), ('start', root[0][0]),
             ('start', root[1]), ('start', root[1][0])],
            events)
def test_unicode_repr1(self):
    # repr() of an element with a non-ASCII tag name; the result itself is
    # not inspected.
    element = etree.Element(_str('å'))
    # must not raise UnicodeEncodeError
    repr(element)
def test_unicode_xml_declared_unknown_fails(self):
    # A unicode document whose XML declaration names the encoding
    # "unknown" is expected to raise XMLSyntaxError when parsed.
    declaration = _str('<?xml version="1.0" encoding="unknown"?>')
    body = _str('<p>%s</p>') % uni
    document = declaration + body
    self.assertRaises(etree.XMLSyntaxError, etree.XML, document)
def test_unicode_repr2(self):
    # repr() of a Comment with non-ASCII content only has to complete
    # without raising; its result is not inspected.
    comment = etree.Comment(_str('ö'))
    repr(comment)
def test_unicode_xml(self):
    # Hand a <p> document that wraps `uni` to the shared _assert_unicode
    # check.
    document = _str('<p>%s</p>') % uni
    self._assert_unicode(document)
def test_unicode_repr3(self):
    # repr() of a ProcessingInstruction with a non-ASCII target and text
    # only has to complete without raising; its result is not inspected.
    target = _str('Å')
    content = _str('\u0131')
    instruction = etree.ProcessingInstruction(target, content)
    repr(instruction)
def test_unicode_qname(self):
    # A QName built from `uni` as namespace and local name must expose the
    # "{ns}name" form both via .text and via unicode().
    qname = etree.QName(uni, uni)
    tag = _str("{%s}%s") % (uni, uni)
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(qname.text, tag)
    self.assertEqual(unicode(qname), tag)
def test_unicode_parse_stringio(self):
    # Parsing a unicode document from a StringIO must preserve the
    # non-ASCII text of the root element.
    el = etree.parse(StringIO(_str('<p>%s</p>') % uni)).getroot()
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(uni, el.text)
# -*- coding: utf-8 -*- import unittest, doctest, sys, os.path this_dir = os.path.dirname(__file__) if this_dir not in sys.path: sys.path.insert(0, this_dir) # needed for Py3 from common_imports import StringIO, etree, SillyFileLike, HelperTestCase from common_imports import _str, _bytes try: unicode except NameError: unicode = str ascii_uni = _str('a') klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names invalid_tag = _str("test") + klingon uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>" ).decode("unicode_escape") class UnicodeTestCase(HelperTestCase): def test_unicode_xml(self): tree = etree.XML(_str('<p>%s</p>') % uni) self.assertEquals(uni, tree.text)
def test__str(self):
    # test the testing framework, namely _str from common_imports
    # Each pair spells the same character with two different escapes.
    short_hex = _str('\x10')
    self.assertEqual(short_hex, _str('\u0010'))

    short_hex = _str('\x10')
    self.assertEqual(short_hex, _str('\U00000010'))

    four_digit = _str('\u1234')
    self.assertEqual(four_digit, _str('\U00001234'))
def test_unicode_xml(self):
    # Parsing a unicode string must preserve the non-ASCII text.
    tree = etree.XML(_str('<p>%s</p>') % uni)
    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+.
    self.assertEqual(uni, tree.text)
def test_unicode_xml_broken(self):
    # A unicode string that carries an XML declaration with an explicit
    # encoding is expected to be rejected with ValueError.
    declaration = _str('<?xml version="1.0" encoding="UTF-8"?>')
    body = _str('<p>%s</p>') % uni
    document = declaration + body
    self.assertRaises(ValueError, etree.XML, document)
def test_unicode_xml_declared_latin1_works(self):
    # Hand a unicode document that declares encoding="latin1" to the
    # shared _assert_unicode check.
    declaration = _str('<?xml version="1.0" encoding="latin1"?>')
    body = _str('<p>%s</p>') % uni
    self._assert_unicode(declaration + body)