예제 #1
0
 def test_html_body_declared_encoding(self):
     """Check html_body_declared_encoding against declaring and non-declaring input.

     Every snippet in the first group must be reported as 'utf-8'; every
     input in the second group must be reported as None.
     """
     utf8_snippets = (
         # Content-Type as meta http-equiv
         """<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
         """\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
         """<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
         """ bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
         # html5 meta charset
         """<meta charset="utf-8">""",
         # xml encoding
         """<?xml version="1.0" encoding="utf-8"?>""",
     )
     for snippet in utf8_snippets:
         detected = html_body_declared_encoding(snippet)
         self.assertEqual(detected, 'utf-8', snippet)

     undeclared_inputs = (
         # plain text, nothing declared
         "something else",
         # the meta tag sits after <body>, where it is expected to be ignored
         """
         <head></head><body>
         this isn't searched
         <meta charset="utf-8">
     """,
         # the http-equiv value is not Content-Type
         """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
     )
     for snippet in undeclared_inputs:
         self.assertEqual(None, html_body_declared_encoding(snippet))
예제 #2
0
 def test_html_body_declared_encoding(self):
     """Check html_body_declared_encoding on byte-string bodies.

     Every entry of ``self.utf8_fragments`` must be reported as 'utf-8';
     the remaining inputs must be reported as None.
     """
     for snippet in self.utf8_fragments:
         self.assertEqual(html_body_declared_encoding(snippet), 'utf-8', snippet)

     undeclared_bodies = (
         # plain text, nothing declared
         b"something else",
         # the meta tag sits after <body>, where it is expected to be ignored
         b"""
         <head></head><body>
         this isn't searched
         <meta charset="utf-8">
     """,
         # the http-equiv value is not Content-Type
         b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
     )
     for body in undeclared_bodies:
         self.assertEqual(None, html_body_declared_encoding(body))
예제 #3
0
 def test_html_body_declared_encoding(self):
     """Verify encoding detection for byte-string input.

     Fragments from ``self.utf8_fragments`` are expected to yield 'utf-8';
     the three trailing inputs are expected to yield None.
     """
     for raw in self.utf8_fragments:
         found = html_body_declared_encoding(raw)
         self.assertEqual(found, 'utf-8', raw)

     # Plain text carries no declaration at all.
     plain_text = b"something else"
     self.assertEqual(None, html_body_declared_encoding(plain_text))

     # A meta charset placed after <body> is expected to be ignored.
     meta_after_body = b"""
         <head></head><body>
         this isn't searched
         <meta charset="utf-8">
     """
     self.assertEqual(None, html_body_declared_encoding(meta_after_body))

     # An http-equiv value other than Content-Type must not match.
     fake_header = b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
     self.assertEqual(None, html_body_declared_encoding(fake_header))
예제 #4
0
    def test_html_body_declared_encoding_unicode(self):
        """html_body_declared_encoding should work when a unicode body is passed.

        Each entry of ``self.utf8_fragments`` is decoded to unicode before
        being checked; the other inputs are unicode literals.
        """
        plain_text = u"something else"
        self.assertEqual(None, html_body_declared_encoding(plain_text))

        for raw in self.utf8_fragments:
            as_unicode = raw.decode('utf8')
            self.assertEqual(html_body_declared_encoding(as_unicode), 'utf-8', raw)

        # A meta charset placed after <body> is expected to be ignored.
        meta_after_body = u"""
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
        """
        self.assertEqual(None, html_body_declared_encoding(meta_after_body))

        # An http-equiv value other than Content-Type must not match.
        fake_header = u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
        self.assertEqual(None, html_body_declared_encoding(fake_header))
예제 #5
0
    def test_html_body_declared_encoding_unicode(self):
        """Verify encoding detection when the body is a unicode string.

        The byte fragments in ``self.utf8_fragments`` are decoded as UTF-8
        first; all other inputs are unicode literals.
        """
        # html_body_declared_encoding should work when unicode body is passed
        self.assertEqual(None, html_body_declared_encoding(u"something else"))

        for raw_fragment in self.utf8_fragments:
            detected = html_body_declared_encoding(raw_fragment.decode('utf8'))
            self.assertEqual(detected, 'utf-8', raw_fragment)

        remaining_inputs = (
            # the meta tag sits after <body>, where it is expected to be ignored
            u"""
            <head></head><body>
            this isn't searched
            <meta charset="utf-8">
        """,
            # the http-equiv value is not Content-Type
            u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
        )
        for body in remaining_inputs:
            self.assertEqual(None, html_body_declared_encoding(body))
예제 #6
0
def root_node(text, encoding=None, base_url=None):
    """Parse ``text`` with a recovering HTML parser and return the result.

    :param text: markup as ``str``, as a non-``str`` value (presumably
        bytes), or as an object exposing ``read()`` that yields one of those
    :param encoding: passed to the parser only when ``text`` is not ``str``
    :param base_url: forwarded unchanged to ``etree.fromstring``
    :return: whatever ``etree.fromstring`` returns for the parsed markup
    :raises ValueError: re-raised unless it is the "encoding declaration"
        error handled below
    """
    # Drain file-like objects; anything without ``read`` is used as-is.
    try:
        text = text.read()
    except AttributeError:
        pass

    parser_options = {'recover': True}
    if not isinstance(text, str):
        parser_options['encoding'] = encoding
    parser = html.HTMLParser(**parser_options)

    try:
        return etree.fromstring(text, parser=parser, base_url=base_url)
    except ValueError as error:
        if 'Unicode strings with encoding declaration are not supported' not in str(error):
            raise

        # The str input carries its own encoding declaration: encode it
        # with that encoding and parse the result instead.
        # html_body_declared_encoding may return None, hence the fallback.
        declared = html_body_declared_encoding(text) or 'utf-8'
        return etree.fromstring(
            text.encode(declared),
            parser=parser,
            base_url=base_url,
        )
예제 #7
0
 def test_html_body_declared_encoding(self):
     """Verify which markup snippets produce a declared encoding.

     The snippets listed first must be detected as 'utf-8'; the three
     inputs checked afterwards must produce None.
     """
     declaring_snippets = (
         # Content-Type as meta http-equiv
         """<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
         """\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
         """<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
         """ bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
         # html5 meta charset
         """<meta charset="utf-8">""",
         # xml encoding
         """<?xml version="1.0" encoding="utf-8"?>""",
     )
     for markup in declaring_snippets:
         self.assertEqual(html_body_declared_encoding(markup), 'utf-8', markup)

     # Plain text carries no declaration at all.
     plain_text = "something else"
     self.assertEqual(None, html_body_declared_encoding(plain_text))

     # A meta charset placed after <body> is expected to be ignored.
     meta_after_body = """
         <head></head><body>
         this isn't searched
         <meta charset="utf-8">
     """
     self.assertEqual(None, html_body_declared_encoding(meta_after_body))

     # An http-equiv value other than Content-Type must not match.
     fake_header = """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""
     self.assertEqual(None, html_body_declared_encoding(fake_header))
예제 #8
0
def _encode_or_decode_string(html, method, default):
    """Apply ``method(html, encoding)`` with the first encoding that works.

    Candidate encodings are tried in this order:

    1. ``default`` -- a single encoding name or an iterable of names; when
       it is falsy, the encoding declared inside ``html`` (if any) is used
       in its place;
    2. every entry of ``ENCODINGS``.

    If all candidates fail, the encoding guessed by ``chardet`` is used and
    any error raised by that final attempt propagates to the caller.

    :param html: value handed to ``method`` and to the encoding detectors
    :param method: callable invoked as ``method(html, encoding)``
    :param default: falsy value, an encoding name, or an iterable of names
    :return: the result of the first ``method`` call that does not fail
    """
    if not default:
        declared = html_body_declared_encoding(html)
        default = [declared] if declared else []
    elif isinstance(default, six.string_types):
        default = [default]

    for encoding in itertools.chain(default, ENCODINGS):
        try:
            return method(html, encoding)
        except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
            # LookupError: the declared or supplied codec name is unknown
            # to Python. UnicodeEncodeError: ``method`` is presumably an
            # encoder and this codec cannot represent the text. In both
            # cases fall through to the next candidate instead of aborting.
            continue

    guessed = chardet.detect(html).get('encoding')
    return method(html, guessed)
예제 #9
0
파일: utils.py 프로젝트: fakegit/portia
def _encode_or_decode_string(html, method, default):
    """Run ``method(html, encoding)`` with the first usable encoding.

    ``default`` may be falsy, one encoding name, or an iterable of names.
    When it is falsy, the encoding declared in ``html`` (if any) is tried
    first. The entries of ``ENCODINGS`` follow, and the ``chardet`` guess
    is the last resort.

    :return: the result of ``method``, or ``html`` unchanged when ``method``
        raises ``AttributeError``
    """
    if default:
        if isinstance(default, six.string_types):
            default = [default]
    else:
        declared = html_body_declared_encoding(html)
        default = [declared] if declared else []

    for candidate in itertools.chain(default, ENCODINGS):
        try:
            converted = method(html, candidate)
        except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
            # This candidate cannot be used; try the next one.
            continue
        except AttributeError:
            # ``method`` is not applicable to ``html``; hand it back as-is.
            return html
        return converted

    guessed = chardet.detect(html).get('encoding')
    return method(html, guessed)
예제 #10
0
파일: util.py 프로젝트: yz599/ArchiveBox
def download_url(url: str, timeout: int=None) -> str:
    """Fetch ``url`` with ``requests`` and return the response body as text.

    The encoding named in the Content-Type header is preferred. If the
    header names none, the encoding declared in the body is used. When
    either is found it replaces the encoding ``requests`` chose.

    :param url: address to download
    :param timeout: request timeout; a falsy value selects the configured
        ``TIMEOUT``
    :return: the text of the response
    """
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT

    request_options = {
        'headers': {'User-Agent': WGET_USER_AGENT},
        'verify': CHECK_SSL_VALIDITY,
        'timeout': timeout or TIMEOUT,
    }
    response = requests.get(url, **request_options)

    header_value = response.headers.get('Content-Type', '')
    detected = (
        http_content_type_encoding(header_value)
        or html_body_declared_encoding(response.text)
    )
    if detected is not None:
        response.encoding = detected

    return response.text
def find_response_encoding(response):
    """
    Return the character encoding of ``response.body``.

    A charset declared inside the body takes precedence. When none is
    declared, the body is fed line by line to a ``UniversalDetector`` and
    its estimate of the page's encoding is returned.
    """
    body = response.body
    declared = html_body_declared_encoding(body)
    if declared:
        return declared

    detector = UniversalDetector()
    for line in cStringIO.StringIO(body):
        detector.feed(line)
        # Stop as soon as the detector reports it is done.
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']
예제 #12
0
파일: text.py 프로젝트: wusy1209/scrapy
 def _body_declared_encoding(self):
     """Return what html_body_declared_encoding reports for ``self.body``."""
     body = self.body
     return html_body_declared_encoding(body)
예제 #13
0
 def _body_declared_encoding(cls, response):
     """
         Determine the encoding type by inspecting the response body.
         The automatic encoding detection approach was borrowed from scrapy.
     """
     body = response.body
     return html_body_declared_encoding(body)
예제 #14
0
 def _body_declared_encoding(self):
     """Look up the encoding declared inside ``self.body``."""
     declared = html_body_declared_encoding(self.body)
     return declared
예제 #15
0
 def _body_declared_encoding(cls, response):
     """
         Inspect ``response.body`` to find out its encoding type.
         The automatic encoding detection idea was learned from scrapy.
     """
     declared = html_body_declared_encoding(response.body)
     return declared
예제 #16
0
    def _body_declared_encoding(self):
        """
        Get the encoding from a declaration such as <meta charset="...">
        found in html, xml, etc. content held in ``self.content``.
        """
        content = self.content
        return html_body_declared_encoding(content)