def test_with_header_encoding(self):
    """Test with Content-Type declared in an HTTP response header."""
    # The fixture is the same for every encoding, so it is built once.
    expected = multiline_string(u'''
        <html>
        <body>
        <p>\u00E1</p>
        </body>
        </html>
        ''')
    for charset in ('utf-8', 'utf-16', 'iso-8859-1', 'windows-1252'):
        # Only the HTTP header announces the charset; the document itself
        # carries no declaration.
        response_headers = {'Content-Type': 'text/html; charset=' + charset}
        decoded = decode_html(expected.encode(charset), response_headers)
        self.assertEqual(expected, decoded)
def test_just_ascii(self):
    """Test without any Content-Type declaration, with ASCII-only HTML.

    The document is encoded with each candidate encoding and handed to
    ``decode_html`` without HTTP headers; the result must equal the
    original text and be a text (unicode) string.
    """
    # The fixture does not depend on the encoding, so build it once.
    html = multiline_string(u'''
        <html>
        <body>
        <p>Hello ASCII!</p>
        </body>
        </html>
        ''')
    for encoding in ['utf-8', 'utf-16', 'iso-8859-1', 'windows-1252']:
        decoded_html = decode_html(html.encode(encoding))
        self.assertEqual(html, decoded_html)
        # assertIsInstance reports the offending type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(decoded_html, six.text_type)
def test_ascii_without_declaration(self):
    """ASCII-only HTML with no charset declaration is returned unchanged."""
    expected = multiline_string(u'''
        <html>
        <head>
        <title>Half</title>
        </head>
        <body>
        <p>ASCII only!</p>
        </body>
        </html>
        ''')
    # Wrap the ASCII-encoded document in a fake urlopen-style response.
    fake_response = self._make_urlopen_response(expected, 'ascii')
    actual = get_response_html(fake_response)
    self.assertEqual(actual, expected)
def test_with_meta_equiv_tag(self):
    """Test with Content-Type declared in a <meta http-equiv=...> HTML tag.

    The HTTP header only says ``text/html`` (no charset), so the charset
    has to come from the document's own <meta> declaration.
    """
    # NOTE: the charset value is intentionally NOT wrapped in its own pair
    # of double quotes; doing so (charset="utf-8" inside content="...")
    # terminates the content attribute early and yields malformed HTML.
    html_template = multiline_string(u'''
        <html>
        <head>
        <meta http-equiv="Content-Type" content="text/html; charset={charset}">
        </head>
        <body>
        <p>\u00E1</p>
        </body>
        </html>
        ''')
    for encoding in ['utf-8', 'utf-16', 'iso-8859-1', 'windows-1252']:
        html = html_template.format(charset=encoding)
        decoded_html = decode_html(
            html.encode(encoding),
            http_headers={'Content-Type': 'text/html'},
        )
        self.assertEqual(html, decoded_html)
def test_ascii_without_declaration(self):
    """ASCII-only HTML with no charset declaration is returned unchanged."""
    expected = multiline_string(u'''
        <html>
        <head>
        <title>Half</title>
        </head>
        <body>
        <p>ASCII only!</p>
        </body>
        </html>
        ''')

    # Hand-build a requests response carrying the ASCII-encoded document;
    # no Content-Type header is set on it.
    fake_response = requests.models.Response()
    fake_response._content = expected.encode('ascii')
    fake_response.status_code = 200

    actual = get_response_html(fake_response)
    self.assertEqual(actual, expected)
def test_inline_vs_header_charsets(self):
    """Check decoding when the inline and HTTP-header charsets may differ.

    For every document encoding, the page declares that encoding in its
    <meta> tag and is served with each candidate charset in the
    Content-Type header (or with no charset at all).  In every combination
    ``response.text`` must equal the original document.
    """
    # NOTE: the charset value must not be wrapped in its own double quotes
    # inside content="..."; that would end the attribute early and produce
    # malformed HTML.
    html_template = multiline_string(u'''
        <html>
        <head>
        <title>Half</title>
        <meta http-equiv="Content-Type" content="text/html; charset={charset}">
        </head>
        <body>
        <p>{content}</p>
        </body>
        </html>
        ''')
    header_encodings = ['utf-8', 'utf-16', 'latin-1', 'windows-1252', None]

    for encoding in ['utf-8', 'utf-16', 'latin-1', 'windows-1252']:
        # The document depends only on its own encoding, so build and
        # encode it once per outer iteration rather than per header value.
        chars = encoding2chars.get(encoding,
                                   windows1252_chars | latin1_chars)
        html = html_template.format(
            charset=encoding,
            content=html_escape(''.join(sorted(chars))),
        )
        body = html.encode(encoding)

        for http_header_encoding in header_encodings:
            if http_header_encoding is not None:
                content_type = 'text/html; charset={charset}'.format(
                    charset=http_header_encoding)
            else:
                content_type = 'text/html'
            headers = {'Content-Type': content_type}

            httpretty.register_uri(httpretty.GET,
                                   'http://www.example.com/',
                                   body=body,
                                   adding_headers=headers)

            # Use the session as a context manager so it is closed even
            # when the assertion fails.
            with requests.Session() as session:
                session.hooks['response'].append(request_hook)
                response = session.get('http://www.example.com/')
                self.assertEqual(
                    response.text, html,
                    msg="encoding={}, http_header_encoding={}".format(
                        encoding, http_header_encoding),
                )
def test_with_xhtml_doctype_encoding(self):
    """Test with the encoding declared in the XML <?xml ...> declaration."""
    # lxml.html raises ValueError if given unicode
    # and the document has an XML encoding declaration
    template = multiline_string(u'''
        <?xml version="1.0" encoding="{charset}"?>
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
        <head>
        <title>Half</title>
        </head>
        <body>
        <p>\u00BD</p>
        </body>
        </html>
        ''')
    for charset in ('utf-8', 'utf-16', 'iso-8859-1', 'windows-1252'):
        expected = template.format(charset=charset)
        # No HTTP headers are passed; only the document itself names the
        # encoding.
        decoded = decode_html(expected.encode(charset))
        self.assertEqual(expected, decoded)
def test_with_doctype_encoding(self):
    """Parse UTF-8 bytes of an XHTML page that has an XML encoding declaration."""
    # lxml.html (used by lxmlHTMLParser) raises ValueError if given unicode
    # and the document has an XML encoding declaration
    document = multiline_string(u'''
        <?xml version="1.0" encoding="utf-8"?>
        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
        <head>
        <title>Money!</title>
        </head>
        <body>
        <p>\u20AA</p>
        </body>
        </html>
        ''')
    tree = make_lxml_html(document.encode("utf-8"))
    paragraph_texts = tree.xpath('//p/text()')
    # The non-ASCII character must survive the bytes -> tree round trip.
    self.assertEqual(u'\u20AA', paragraph_texts[0])
def test_inline_vs_header_charsets(self):
    """Check decoding when the inline and HTTP-header charsets may differ.

    For every document encoding, the page declares that encoding in its
    <meta> tag and is wrapped in a fake urlopen-style response whose
    Content-Type header names each candidate charset (or none).  In every
    combination ``get_response_html`` must return the original document.
    """
    # NOTE: the charset value must not be wrapped in its own double quotes
    # inside content="..."; that would end the attribute early and produce
    # malformed HTML.
    html_template = multiline_string(u'''
        <html>
        <head>
        <title>Half</title>
        <meta http-equiv="Content-Type" content="text/html; charset={charset}">
        </head>
        <body>
        <p>{content}</p>
        </body>
        </html>
        ''')
    header_encodings = ['utf-8', 'utf-16', 'latin-1', 'windows-1252', None]

    for encoding in ['utf-8', 'utf-16', 'latin-1', 'windows-1252']:
        # The document depends only on its own encoding, so build it once
        # per outer iteration rather than once per header value.
        chars = encoding2chars.get(encoding,
                                   windows1252_chars | latin1_chars)
        html = html_template.format(
            charset=encoding,
            content=html_escape(''.join(sorted(chars))),
        )

        for http_header_encoding in header_encodings:
            if http_header_encoding is not None:
                content_type = 'text/html; charset={charset}'.format(
                    charset=http_header_encoding)
            else:
                content_type = 'text/html'
            headers = {'Content-Type': content_type}

            response = self._make_urlopen_response(html, encoding,
                                                   headers=headers)
            result_html = get_response_html(response)
            self.assertEqual(
                result_html, html,
                msg="encoding={}, http_header_encoding={}".format(
                    encoding, http_header_encoding),
            )
def test_inline_vs_header_charsets(self):
    """Check decoding when the inline and HTTP-header charsets may differ.

    For every document encoding, the page declares that encoding in its
    <meta> tag and is placed in a hand-built ``requests`` response whose
    Content-Type header names each candidate charset (or none).  In every
    combination ``get_response_html`` must return the original document.
    """
    # NOTE: the charset value must not be wrapped in its own double quotes
    # inside content="..."; that would end the attribute early and produce
    # malformed HTML.
    html_template = multiline_string(u'''
        <html>
        <head>
        <title>Half</title>
        <meta http-equiv="Content-Type" content="text/html; charset={charset}">
        </head>
        <body>
        <p>{content}</p>
        </body>
        </html>
        ''')
    header_encodings = ['utf-8', 'utf-16', 'latin-1', 'windows-1252', None]

    for encoding in ['utf-8', 'utf-16', 'latin-1', 'windows-1252']:
        # The document and its encoded bytes depend only on the document
        # encoding, so compute them once per outer iteration.
        chars = encoding2chars.get(encoding,
                                   windows1252_chars | latin1_chars)
        html = html_template.format(
            charset=encoding,
            content=html_escape(''.join(sorted(chars))),
        )
        body = html.encode(encoding)

        for http_header_encoding in header_encodings:
            if http_header_encoding is not None:
                content_type = 'text/html; charset={charset}'.format(
                    charset=http_header_encoding)
            else:
                content_type = 'text/html'

            # A fresh response per combination keeps iterations isolated.
            response = requests.models.Response()
            response.status_code = 200
            response._content = body
            response.headers['Content-Type'] = content_type

            result_html = get_response_html(response)
            self.assertEqual(
                result_html, html,
                msg="encoding={}, http_header_encoding={}".format(
                    encoding, http_header_encoding),
            )