def test_sniffer():
    """Run ``sniff_encoding`` over a table of documents and expected encodings.

    Each case must return the expected encoding name; a ``ValidationError``
    raised for any case fails the test. Failure messages carry the index of
    the offending case within the combined table.
    """
    # Inputs whose result is expected to be the declared encoding (or the
    # "UTF-8" fallback for the empty document).
    honoured = [
        (r"", "UTF-8"),
        (r"""<?xml version='1.0' encoding='ISO-8859-1' ?>""", "ISO-8859-1"),
        (r"""<?xml version="1.0" encoding='ISO-8859-1' ?>""", "ISO-8859-1"),
        (r"""<?xml version="1.0" encoding='ISO-8859-1' standalone='no'?>""", "ISO-8859-1"),
        (r"""<?xml version='1.1' encoding="ISO-8859-1" standalone="yes" ?>""", "ISO-8859-1"),
        (r"""<?xml version="1.0" encoding="EBCDIC-some-cp" ?>""".encode("cp037"), "EBCDIC-some-cp"),
    ]

    # and now the really viciously pedantic refusals...
    # Each of these is expected to be sniffed as "UTF-8".
    refused = [
        r""" <?xml version="1.0" encoding="ISO-8859-1" ?>""",  # bad: space before decl
        r"""<?xml version=1.0 encoding="ISO-8859-1" ?>""",  # bad: no quotes around version value
        r"""<?xml encoding="ISO-8859-1" version="1.0" ?>""",  # bad: wrong order for attributes
        r"""<?xml version="1.0" encoding="ISO-8859-1" standalone=no ?>""",  # bad: no quotes around standalone value
        r"""<?xml version=" 1.0" encoding="ISO-8859-1" ?>""",  # bad: whitespace before version value
        r"""<?xml version="1.0 " encoding="ISO-8859-1" ?>""",  # bad: whitespace after version value
        r"""<?xml version="1.0" encoding=" ISO-8859-1" ?>""",  # bad: whitespace before encoding value
        r"""<?xml version="1.0" encoding="ISO-8859-1 " ?>""",  # bad: whitespace after encoding value
        r"""<?xml version="1.0" encoding=Big5 ?>""",  # bad: no quotes around encoding value
    ]

    # Keep the original ordering so reported indices stay meaningful.
    cases = honoured + [(document, "UTF-8") for document in refused]

    for index, case in enumerate(cases):
        document, expected = case
        try:
            detected = sniff_encoding(document)
        except ValidationError as err:
            assert False, (err, index)
        assert detected == expected, (detected, index)
def test_sniffer_exc():
    """Check the error raised when the BOM and the XML declaration disagree.

    The document is encoded with "utf-8-sig" while its declaration names
    "Cp037"; ``sniff_encoding`` must raise ``ValidationError`` with the
    exact message below. Returning normally fails the test.
    """
    payload = six.u('<?xml version="1.0" encoding="Cp037" ?>').encode("utf-8-sig")
    expected_message = (
        r"Multiply-specified encoding (BOM: utf_8_sig, XML decl: Cp037) at line 1, column 1 (char 1)"
    )

    try:
        result = sniff_encoding(payload)
    except ValidationError as err:
        assert str(err) == expected_message, err
        return

    # No exception was raised: report whatever was sniffed instead.
    assert False, result
def test_fix_xmldecl():
    """Feed ``fix_xmldecl`` output back through ``sniff_encoding`` per codec.

    Iterates over the codec names in ``encodings.aliases.aliases``, skipping
    a fixed list of codecs and any name ``str.encode`` does not recognise.
    For each remaining codec the fixed-up declaration must sniff back to the
    same codec name, and for UTF-16 LE/BE names it must start with the
    matching byte-order mark.
    """
    # Slow compared to the other tests, but still only a few seconds.
    skipped = (
        "rot_13",
        "quopri_codec",
        "zlib_codec",
        "base64_codec",
        "uu_codec",
        "tactis",
        "hex_codec",
        "bz2_codec",
    )

    for encoding in encodings.aliases.aliases.values():
        if encoding in skipped:
            continue
        try:
            "".encode(encoding)
        except LookupError:
            # not trying to handle unknown encodings yet
            continue

        lowered = encoding.lower()
        is_utf16 = lowered.startswith("utf") and "16" in encoding

        # The identical fix-then-sniff check is performed twice per encoding.
        for _ in range(2):
            xmldecl = fix_xmldecl(six.u(" <?xml>").encode(encoding), encoding, add_encoding=True)
            if is_utf16:
                if "le" in lowered:
                    assert xmldecl.startswith(codecs.BOM_UTF16_LE)
                if "be" in lowered:
                    assert xmldecl.startswith(codecs.BOM_UTF16_BE)
            sniffed = sniff_encoding(xmldecl)
            assert sniffed == encoding, (xmldecl, encoding, sniffed)