def test_xml_encoding(self):
    """The encoding named in the XML declaration is listed first.

    The document has no ``<meta>`` tag and the header carries no charset, so
    the XML declaration is the only source of the leading ``ascii`` entry.
    """
    document = b"""
     <?xml version="1.0" encoding="ascii"?>
     <html>
     </html>
     """
    candidates = list(_get_html_media_encodings(document, "text/html"))
    expected = ["ascii", "utf-8", "cp1252"]
    self.assertEqual(candidates, expected)
 def test_unknown_invalid(self):
     """An unknown or invalid charset name does not appear in the result."""
     # Both the <meta> tag and the Content-Type header name the same bogus
     # charset; neither is expected to show up in the candidate list.
     document = b"""
     <html>
     <head><meta charset="invalid">
     </head>
     </html>
     """
     content_type = 'text/html; charset="invalid"'
     candidates = list(_get_html_media_encodings(document, content_type))
     self.assertEqual(candidates, ["utf-8", "cp1252"])
 def test_meta_charset_underscores(self):
     """A charset name containing an underscore is picked up from the meta tag."""
     document = b"""
     <html>
     <head><meta charset="Shift_JIS">
     </head>
     </html>
     """
     candidates = list(_get_html_media_encodings(document, "text/html"))
     # The name is expected back lower-cased, with the underscore intact.
     expected = ["shift_jis", "utf-8", "cp1252"]
     self.assertEqual(candidates, expected)
 def test_duplicates(self):
     """Different spellings of one encoding are only listed once."""
     # The XML declaration ("utf8"), the <meta> tag ("UTF-8") and the header
     # ("UTF_8") each spell the encoding differently.
     document = b"""
     <?xml version="1.0" encoding="utf8"?>
     <html>
     <head><meta charset="UTF-8">
     </head>
     </html>
     """
     content_type = 'text/html; charset="UTF_8"'
     candidates = list(_get_html_media_encodings(document, content_type))
     self.assertEqual(candidates, ["utf-8", "cp1252"])
 # Example no. 5
 # 0
 def test_meta_xml_encoding(self) -> None:
     """The meta tag's charset is listed ahead of the XML declaration's."""
     # The XML declaration says "ascii" while the <meta> tag says "UTF-16".
     document = b"""
     <?xml version="1.0" encoding="ascii"?>
     <html>
     <head><meta charset="UTF-16">
     </head>
     </html>
     """
     candidates = list(_get_html_media_encodings(document, "text/html"))
     expected = ["utf-16", "ascii", "utf-8", "cp1252"]
     self.assertEqual(candidates, expected)
 def test_content_type(self):
     """A character encoding is found via the Content-Type header.

     Several spellings of the ``charset`` parameter are exercised: quoted,
     unquoted, with unbalanced quotes, and with or without a trailing
     semicolon. Each is expected to yield ``ascii`` as the first encoding.
     """
     # Test a few variations of the header.
     headers = (
         'text/html; charset="ascii";',
         "text/html;charset=ascii;",
         'text/html;  charset="ascii"',
         "text/html; charset=ascii",
         'text/html; charset="ascii;',
         'text/html; charset=ascii";',
     )
     expected = ["ascii", "utf-8", "cp1252"]
     for header in headers:
         encodings = _get_html_media_encodings(b"", header)
         # Pass the header as the failure message so that a failing variation
         # can be told apart from the others.
         self.assertEqual(list(encodings), expected, header)
    def test_meta_charset(self):
        """A ``<meta charset>`` declaration in the body supplies the first encoding."""
        well_formed = b"""
        <html>
        <head><meta charset="ascii">
        </head>
        </html>
        """
        # Same declaration, but with stray spaces and no quotes around the value.
        sloppy = b"""
        <html>
        <head>< meta charset = ascii>
        </head>
        </html>
        """
        expected = ["ascii", "utf-8", "cp1252"]
        for document in (well_formed, sloppy):
            found = _get_html_media_encodings(document, "text/html")
            self.assertEqual(list(found), expected)
 def test_fallback(self):
     """With no charset in the body or header, only utf-8 and cp1252 are listed."""
     empty_body = b""
     candidates = list(_get_html_media_encodings(empty_body, "text/html"))
     self.assertEqual(candidates, ["utf-8", "cp1252"])