def test_xml_encoding(self) -> None:
    """A character encoding is found via the XML declaration.

    The body carries no meta tag and the Content-Type header carries no
    charset, so the encoding named in ``<?xml ... encoding="..."?>`` is
    expected first, ahead of utf-8 and cp1252.
    """
    body = b""" <?xml version="1.0" encoding="ascii"?> <html> </html> """
    encodings = _get_html_media_encodings(body, "text/html")
    self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_unknown_invalid(self) -> None:
    """An unknown or invalid character encoding is ignored.

    Both the meta tag and the Content-Type header name the encoding
    "invalid"; neither occurrence is expected in the result.
    """
    body = b""" <html> <head><meta charset="invalid"> </head> </html> """
    content_type = 'text/html; charset="invalid"'
    encodings = _get_html_media_encodings(body, content_type)
    self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_meta_charset_underscores(self) -> None:
    """A meta tag charset whose name contains an underscore is found.

    "Shift_JIS" is expected to be reported in lower case, with its
    underscore intact, ahead of utf-8 and cp1252.
    """
    body = b""" <html> <head><meta charset="Shift_JIS"> </head> </html> """
    encodings = _get_html_media_encodings(body, "text/html")
    self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
def test_duplicates(self) -> None:
    """Ensure each encoding is only attempted once.

    The XML declaration ("utf8"), the meta tag ("UTF-8") and the
    Content-Type header ("UTF_8") spell the same encoding three ways; the
    result is expected to list "utf-8" a single time.
    """
    body = b""" <?xml version="1.0" encoding="utf8"?> <html> <head><meta charset="UTF-8"> </head> </html> """
    content_type = 'text/html; charset="UTF_8"'
    encodings = _get_html_media_encodings(body, content_type)
    self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_meta_xml_encoding(self) -> None:
    """The meta tag charset is listed before the XML declaration encoding."""
    # The body declares "ascii" in the XML prolog and "UTF-16" in a meta tag.
    html = b""" <?xml version="1.0" encoding="ascii"?> <html> <head><meta charset="UTF-16"> </head> </html> """
    expected = ["utf-16", "ascii", "utf-8", "cp1252"]

    found = list(_get_html_media_encodings(html, "text/html"))

    self.assertEqual(found, expected)
def test_content_type(self) -> None:
    """A character encoding is found via the Content-Type header.

    Every header variant below is expected to yield "ascii" first,
    followed by utf-8 and cp1252.
    """
    # Test a few variations of the header: quoted and unquoted values, with
    # and without a space or trailing semicolon, and unbalanced quotes.
    headers = (
        'text/html; charset="ascii";',
        "text/html;charset=ascii;",
        'text/html; charset="ascii"',
        "text/html; charset=ascii",
        'text/html; charset="ascii;',
        'text/html; charset=ascii";',
    )
    expected = ["ascii", "utf-8", "cp1252"]
    for header in headers:
        encodings = _get_html_media_encodings(b"", header)
        # Name the header in the failure message; without it a failing
        # variant cannot be told apart from the others in the loop.
        self.assertEqual(
            list(encodings), expected, msg=f"Content-Type: {header!r}"
        )
def test_meta_charset(self) -> None:
    """A character encoding is found via the meta tag.

    Both a well-formed meta tag and a less well-formed one (extra spaces,
    unquoted value) are expected to yield "ascii" first, followed by utf-8
    and cp1252.
    """
    bodies = (
        b""" <html> <head><meta charset="ascii"> </head> </html> """,
        # A less well-formed version.
        b""" <html> <head>< meta charset = ascii> </head> </html> """,
    )
    expected = ["ascii", "utf-8", "cp1252"]
    for body in bodies:
        encodings = _get_html_media_encodings(body, "text/html")
        # Include the body so a failure identifies which variant broke.
        self.assertEqual(list(encodings), expected, msg=f"body: {body!r}")
def test_fallback(self) -> None:
    """A character encoding cannot be found in the body or header.

    With an empty body and no charset in the Content-Type header, only
    utf-8 and cp1252 are expected.
    """
    encodings = _get_html_media_encodings(b"", "text/html")
    self.assertEqual(list(encodings), ["utf-8", "cp1252"])