Exemplo n.º 1
0
 def test_with_header_encoding(self):
     """Round-trip HTML whose charset is named in the HTTP response header.

     For every encoding in the list, the document is encoded and passed to
     ``decode_html`` together with a ``Content-Type`` header carrying that
     charset; the decoded text must equal the original document.
     """
     for charset in ('utf-8', 'utf-16', 'iso-8859-1', 'windows-1252'):
         expected = multiline_string(u'''
             <html>
                 <body>
                     <p>\u00E1</p>
                 </body>
             </html>
             ''')
         content_type = 'text/html; charset=' + charset
         payload = expected.encode(charset)
         decoded = decode_html(payload, {'Content-Type': content_type})
         self.assertEqual(expected, decoded)
Exemplo n.º 2
0
 def test_just_ascii(self):
     """Decode ASCII-only HTML that carries no Content-Type declaration.

     The same document is encoded with each encoding in turn and passed to
     ``decode_html`` without headers. The result must equal the original
     text and be an instance of ``six.text_type``.
     """
     for charset in ('utf-8', 'utf-16', 'iso-8859-1', 'windows-1252'):
         expected = multiline_string(u'''
             <html>
                 <body>
                     <p>Hello ASCII!</p>
                 </body>
             </html>
             ''')
         payload = expected.encode(charset)
         actual = decode_html(payload)
         self.assertEqual(expected, actual)
         is_text = isinstance(actual, six.text_type)
         self.assertTrue(is_text)
Exemplo n.º 3
0
 def test_ascii_without_declaration(self):
     """Read back ASCII-only HTML that contains no charset declaration.

     The response object comes from ``self._make_urlopen_response`` using
     the 'ascii' encoding; ``get_response_html`` must return the original
     document text.
     """
     expected = multiline_string(u'''
         <html>
         <head>
             <title>Half</title>
         </head>
         <body>
             <p>ASCII only!</p>
         </body>
         </html>
         ''')
     fake_response = self._make_urlopen_response(expected, 'ascii')
     actual = get_response_html(fake_response)
     self.assertEqual(actual, expected)
Exemplo n.º 4
0
 def test_with_meta_equiv_tag(self):
     """Round-trip HTML whose charset is given in a <meta http-equiv> tag.

     The HTTP header handed to ``decode_html`` names only the media type,
     so the charset appears solely inside the document itself.
     """
     # NOTE(review): the template writes charset="{charset}" inside the
     # content attribute, i.e. with an extra double quote before the value.
     # Confirm whether this malformed markup is intentional.
     for charset in ('utf-8', 'utf-16', 'iso-8859-1', 'windows-1252'):
         template = multiline_string(u'''
             <html>
                 <head>
                     <meta http-equiv="Content-Type" content="text/html; charset="{charset}">
                 </head>
                 <body>
                     <p>\u00E1</p>
                 </body>
             </html>
             ''')
         expected = template.format(charset=charset)
         payload = expected.encode(charset)
         decoded = decode_html(payload,
                               http_headers={'Content-Type': 'text/html'})
         self.assertEqual(expected, decoded)
Exemplo n.º 5
0
    def test_ascii_without_declaration(self):
        """Read back ASCII-only HTML that contains no charset declaration.

        A ``requests.models.Response`` is filled in by hand with status 200
        and the ASCII-encoded document; ``get_response_html`` must return
        the original document text.
        """
        expected = multiline_string(u'''
            <html>
            <head>
                <title>Half</title>
            </head>
            <body>
                <p>ASCII only!</p>
            </body>
            </html>
            ''')

        reply = requests.models.Response()
        reply._content = expected.encode('ascii')
        reply.status_code = 200

        self.assertEqual(get_response_html(reply), expected)
Exemplo n.º 6
0
    def test_inline_vs_header_charsets(self):
        """Compare the charset in the document with the one in the header.

        Every document encoding is paired with every HTTP header charset
        (including no charset in the header at all). The page is registered
        with httpretty, fetched through a session that has ``request_hook``
        installed, and ``response.text`` must equal the original document.
        """
        # NOTE(review): the template writes charset="{charset}" inside the
        # content attribute, i.e. with an extra double quote before the
        # value. Confirm whether this malformed markup is intentional.
        template = multiline_string(u'''
            <html>
            <head>
                <title>Half</title>
                <meta http-equiv="Content-Type" content="text/html; charset="{charset}">
            </head>
            <body>
                <p>{content}</p>
            </body>
            </html>
            ''')
        url = 'http://www.example.com/'
        body_charsets = ['utf-8', 'utf-16', 'latin-1', 'windows-1252']
        header_charsets = ['utf-8', 'utf-16', 'latin-1', 'windows-1252', None]
        for body_charset in body_charsets:
            for header_charset in header_charsets:
                # Encodings without their own entry fall back to the union
                # of the windows-1252 and latin-1 character sets.
                fallback_chars = windows1252_chars | latin1_chars
                chars = encoding2chars.get(body_charset, fallback_chars)
                escaped = html_escape(''.join(sorted(chars)))
                html = template.format(charset=body_charset, content=escaped)

                session = requests.Session()
                session.hooks['response'].append(request_hook)

                if header_charset is None:
                    content_type = 'text/html'
                else:
                    content_type = 'text/html; charset={charset}'.format(
                        charset=header_charset)

                httpretty.register_uri(
                    httpretty.GET,
                    url,
                    body=html.encode(body_charset),
                    adding_headers={'Content-Type': content_type},
                )
                response = session.get(url)
                failure_note = "encoding={}, http_header_encoding={}".format(
                    body_charset, header_charset)
                self.assertEqual(response.text, html, msg=failure_note)
Exemplo n.º 7
0
 def test_with_xhtml_doctype_encoding(self):
     """Round-trip XHTML whose encoding is named in the <?xml ...?> line.

     No HTTP headers are passed to ``decode_html``; the decoded text must
     equal the original document for every encoding in the list.
     """
     for charset in ('utf-8', 'utf-16', 'iso-8859-1', 'windows-1252'):
         # Note kept from the original author: lxml.html raises ValueError
         # when given unicode for a document that has an XML encoding
         # declaration.
         template = multiline_string(u'''
             <?xml version="1.0" encoding="{charset}"?>
             <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
               "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 
             <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
                 <head>
                     <title>Half</title>
                 </head>
                 <body>
                     <p>\u00BD</p>
                 </body>
             </html>
             ''')
         expected = template.format(charset=charset)
         decoded = decode_html(expected.encode(charset))
         self.assertEqual(expected, decoded)
Exemplo n.º 8
0
    def test_with_doctype_encoding(self):
        """Parse UTF-8 XHTML bytes that carry an XML encoding declaration.

        The document is encoded to UTF-8, parsed with ``make_lxml_html``,
        and the first text node under ``<p>`` must be U+20AA.
        """
        # Note kept from the original author: lxml.html (used by
        # lxmlHTMLParser) raises ValueError when given unicode for a document
        # that has an XML encoding declaration, hence the bytes input below.
        document = multiline_string(u'''
            <?xml version="1.0" encoding="utf-8"?>
            <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
              "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

            <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
            <head>
                <title>Money!</title>
            </head>
            <body>
                <p>\u20AA</p>
            </body>
            </html>
            ''')

        tree = make_lxml_html(document.encode("utf-8"))
        paragraph_texts = tree.xpath('//p/text()')
        self.assertEqual(u'\u20AA', paragraph_texts[0])
Exemplo n.º 9
0
 def test_inline_vs_header_charsets(self):
     """Compare the charset in the document with the one in the header.

     Every document encoding is paired with every HTTP header charset
     (including no charset in the header at all). The response is built by
     ``self._make_urlopen_response`` and ``get_response_html`` must return
     the original document text.
     """
     # NOTE(review): the template writes charset="{charset}" inside the
     # content attribute, i.e. with an extra double quote before the value.
     # Confirm whether this malformed markup is intentional.
     template = multiline_string(u'''
         <html>
         <head>
             <title>Half</title>
             <meta http-equiv="Content-Type" content="text/html; charset="{charset}">
         </head>
         <body>
             <p>{content}</p>
         </body>
         </html>
         ''')
     body_charsets = ['utf-8', 'utf-16', 'latin-1', 'windows-1252']
     header_charsets = ['utf-8', 'utf-16', 'latin-1', 'windows-1252', None]
     for body_charset in body_charsets:
         for header_charset in header_charsets:
             # Encodings without their own entry fall back to the union of
             # the windows-1252 and latin-1 character sets.
             fallback_chars = windows1252_chars | latin1_chars
             chars = encoding2chars.get(body_charset, fallback_chars)
             escaped = html_escape(''.join(sorted(chars)))
             html = template.format(charset=body_charset, content=escaped)

             if header_charset is None:
                 content_type = 'text/html'
             else:
                 content_type = 'text/html; charset={charset}'.format(
                     charset=header_charset)

             fake_response = self._make_urlopen_response(
                 html,
                 body_charset,
                 headers={'Content-Type': content_type},
             )
             actual = get_response_html(fake_response)
             failure_note = "encoding={}, http_header_encoding={}".format(
                 body_charset, header_charset)
             self.assertEqual(actual, html, msg=failure_note)
Exemplo n.º 10
0
 def test_inline_vs_header_charsets(self):
     """Compare the charset in the document with the one in the header.

     Every document encoding is paired with every HTTP header charset
     (including no charset in the header at all). A
     ``requests.models.Response`` is filled in by hand and
     ``get_response_html`` must return the original document text.
     """
     # NOTE(review): the template writes charset="{charset}" inside the
     # content attribute, i.e. with an extra double quote before the value.
     # Confirm whether this malformed markup is intentional.
     template = multiline_string(u'''
         <html>
         <head>
             <title>Half</title>
             <meta http-equiv="Content-Type" content="text/html; charset="{charset}">
         </head>
         <body>
             <p>{content}</p>
         </body>
         </html>
         ''')
     body_charsets = ['utf-8', 'utf-16', 'latin-1', 'windows-1252']
     header_charsets = ['utf-8', 'utf-16', 'latin-1', 'windows-1252', None]
     for body_charset in body_charsets:
         for header_charset in header_charsets:
             # Encodings without their own entry fall back to the union of
             # the windows-1252 and latin-1 character sets.
             fallback_chars = windows1252_chars | latin1_chars
             chars = encoding2chars.get(body_charset, fallback_chars)
             escaped = html_escape(''.join(sorted(chars)))
             html = template.format(charset=body_charset, content=escaped)

             reply = requests.models.Response()
             reply.status_code = 200
             reply._content = html.encode(body_charset)
             reply.headers['Content-Type'] = (
                 'text/html'
                 if header_charset is None
                 else 'text/html; charset={charset}'.format(
                     charset=header_charset)
             )

             actual = get_response_html(reply)
             failure_note = "encoding={}, http_header_encoding={}".format(
                 body_charset, header_charset)
             self.assertEqual(actual, html, msg=failure_note)