def test_html_body_declared_encoding(self):
    """Fragments declaring utf-8 must yield 'utf-8'; undeclared ones None."""
    utf8_declarations = (
        # Content-Type as meta http-equiv
        """<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
        """\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
        """<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
        """ bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
        # html5 meta charset
        """<meta charset="utf-8">""",
        # xml encoding
        """<?xml version="1.0" encoding="utf-8"?>""",
    )
    for markup in utf8_declarations:
        # The fragment doubles as the failure message to identify the case.
        self.assertEqual(html_body_declared_encoding(markup), 'utf-8', markup)

    undeclared = (
        # no declaration at all
        "something else",
        # a meta tag placed after <body> is expected to be ignored
        """ <head></head><body> this isn't searched <meta charset="utf-8"> """,
        # http-equiv value that merely resembles Content-Type
        """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
    )
    for markup in undeclared:
        self.assertEqual(None, html_body_declared_encoding(markup))
def test_html_body_declared_encoding(self):
    """Byte fragments declaring utf-8 must yield 'utf-8'; others None."""
    for markup in self.utf8_fragments:
        # The fragment doubles as the failure message to identify the case.
        self.assertEqual(html_body_declared_encoding(markup), 'utf-8', markup)

    undeclared = (
        # no declaration at all
        b"something else",
        # a meta tag placed after <body> is expected to be ignored
        b""" <head></head><body> this isn't searched <meta charset="utf-8"> """,
        # http-equiv value that merely resembles Content-Type
        b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
    )
    for markup in undeclared:
        self.assertEqual(None, html_body_declared_encoding(markup))
def test_html_body_declared_encoding(self):
    """Byte fragments declaring utf-8 must yield 'utf-8'; others None."""
    for markup in self.utf8_fragments:
        # The fragment doubles as the failure message to identify the case.
        self.assertEqual(html_body_declared_encoding(markup), 'utf-8', markup)

    undeclared = (
        # no declaration at all
        b"something else",
        # a meta tag placed after <body> is expected to be ignored
        b""" <head></head><body> this isn't searched <meta charset="utf-8"> """,
        # http-equiv value that merely resembles Content-Type
        b"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
    )
    for markup in undeclared:
        self.assertEqual(None, html_body_declared_encoding(markup))
def test_html_body_declared_encoding_unicode(self):
    """Same expectations as the bytes test, but feeding unicode bodies."""
    # html_body_declared_encoding should work when unicode body is passed
    self.assertEqual(None, html_body_declared_encoding(u"something else"))

    for raw in self.utf8_fragments:
        decoded = raw.decode('utf8')
        # The original (undecoded) fragment is used as the failure message.
        self.assertEqual(html_body_declared_encoding(decoded), 'utf-8', raw)

    undeclared = (
        # a meta tag placed after <body> is expected to be ignored
        u""" <head></head><body> this isn't searched <meta charset="utf-8"> """,
        # http-equiv value that merely resembles Content-Type
        u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
    )
    for markup in undeclared:
        self.assertEqual(None, html_body_declared_encoding(markup))
def test_html_body_declared_encoding_unicode(self):
    """Same expectations as the bytes test, but feeding unicode bodies."""
    # html_body_declared_encoding should work when unicode body is passed
    self.assertEqual(None, html_body_declared_encoding(u"something else"))

    for raw in self.utf8_fragments:
        decoded = raw.decode('utf8')
        # The original (undecoded) fragment is used as the failure message.
        self.assertEqual(html_body_declared_encoding(decoded), 'utf-8', raw)

    undeclared = (
        # a meta tag placed after <body> is expected to be ignored
        u""" <head></head><body> this isn't searched <meta charset="utf-8"> """,
        # http-equiv value that merely resembles Content-Type
        u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
    )
    for markup in undeclared:
        self.assertEqual(None, html_body_declared_encoding(markup))
def root_node(text, encoding=None, base_url=None):
    """Parse ``text`` with a recovering ``html.HTMLParser`` and return the
    result of ``etree.fromstring``.

    ``text`` may be a string-like value or any object exposing ``read()``.
    ``encoding`` is handed to the parser only when the (possibly read)
    input is not a ``str``. ``base_url`` is forwarded to ``etree.fromstring``.

    If parsing fails with the "Unicode strings with encoding declaration
    are not supported" ``ValueError``, the text is encoded with its declared
    body encoding (``utf-8`` when none is found) and parsed again. Any other
    ``ValueError`` propagates unchanged.
    """
    # Accept file-like objects; anything without ``read`` is used as-is.
    try:
        text = text.read()
    except AttributeError:
        pass

    parser_options = {'recover': True}
    if not isinstance(text, str):
        # Only non-``str`` input gets the caller-supplied encoding.
        parser_options['encoding'] = encoding
    parser = html.HTMLParser(**parser_options)

    try:
        return etree.fromstring(text, parser=parser, base_url=base_url)
    except ValueError as exc:
        if 'Unicode strings with encoding declaration are not supported' not in str(exc):
            raise
        # html_body_declared_encoding may return None
        declared = html_body_declared_encoding(text) or 'utf-8'
        return etree.fromstring(
            text.encode(declared),
            parser=parser,
            base_url=base_url,
        )
def test_html_body_declared_encoding(self):
    """Fragments declaring utf-8 must yield 'utf-8'; undeclared ones None."""
    utf8_declarations = (
        # Content-Type as meta http-equiv
        """<meta http-equiv="content-type" content="text/html;charset=UTF-8" />""",
        """\n<meta http-equiv="Content-Type"\ncontent="text/html; charset=utf-8">""",
        """<meta content="text/html; charset=utf-8"\n http-equiv='Content-Type'>""",
        """ bad html still supported < meta http-equiv='Content-Type'\n content="text/html; charset=utf-8">""",
        # html5 meta charset
        """<meta charset="utf-8">""",
        # xml encoding
        """<?xml version="1.0" encoding="utf-8"?>""",
    )
    for markup in utf8_declarations:
        # The fragment doubles as the failure message to identify the case.
        self.assertEqual(html_body_declared_encoding(markup), 'utf-8', markup)

    undeclared = (
        # no declaration at all
        "something else",
        # a meta tag placed after <body> is expected to be ignored
        """ <head></head><body> this isn't searched <meta charset="utf-8"> """,
        # http-equiv value that merely resembles Content-Type
        """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">""",
    )
    for markup in undeclared:
        self.assertEqual(None, html_body_declared_encoding(markup))
def _encode_or_decode_string(html, method, default):
    """Apply ``method(html, encoding)`` using the first encoding that works.

    Candidate encodings are tried in this order:

    1. ``default`` -- a single encoding name or an iterable of names. When it
       is falsy, the encoding declared in ``html`` (if any) is used instead.
    2. every entry of the module-level ``ENCODINGS``.
    3. whatever ``chardet.detect`` reports for ``html`` (last resort; errors
       raised by this final attempt propagate to the caller).

    :param html: the markup to convert.
    :param method: callable taking ``(html, encoding)`` and returning the
        converted value (an encode or a decode operation).
    :param default: preferred encoding name(s), or a falsy value.
    :returns: the result of the first ``method`` call that succeeds.
    """
    if not default:
        encoding = html_body_declared_encoding(html)
        default = [encoding] if encoding else []
    elif isinstance(default, six.string_types):
        default = [default]

    for encoding in itertools.chain(default, ENCODINGS):
        try:
            return method(html, encoding)
        except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
            # This helper serves both directions, so an *encode* failure
            # must fall through to the next candidate just like a decode
            # failure does. LookupError covers codec names Python does not
            # know, which should not abort the whole fallback chain.
            continue

    encoding = chardet.detect(html).get('encoding')
    return method(html, encoding)
def _encode_or_decode_string(html, method, default):
    """Apply ``method(html, encoding)`` using the first encoding that works.

    Candidates are ``default`` (a name or an iterable of names; when falsy,
    the encoding declared in ``html`` if there is one), followed by the
    module-level ``ENCODINGS``. Unicode and unknown-codec errors move on to
    the next candidate. An ``AttributeError`` from ``method`` returns
    ``html`` unchanged. If every candidate fails, the encoding reported by
    ``chardet.detect`` is used for one final, unguarded attempt.
    """
    if not default:
        declared = html_body_declared_encoding(html)
        candidates = [declared] if declared else []
    elif isinstance(default, six.string_types):
        candidates = [default]
    else:
        candidates = default

    for candidate in itertools.chain(candidates, ENCODINGS):
        try:
            converted = method(html, candidate)
        except (UnicodeDecodeError, UnicodeEncodeError, LookupError):
            continue
        except AttributeError:
            return html
        else:
            return converted

    detected = chardet.detect(html).get('encoding')
    return method(html, detected)
def download_url(url: str, timeout: int=None) -> str:
    """Download the contents of a remote url and return the text"""
    from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT

    request_options = {
        'headers': {'User-Agent': WGET_USER_AGENT},
        'verify': CHECK_SSL_VALIDITY,
        # A falsy timeout falls back to the configured TIMEOUT.
        'timeout': timeout or TIMEOUT,
    }
    response = requests.get(url, **request_options)

    # Prefer the charset from the Content-Type header; only inspect the
    # body when the header yields nothing.
    header_value = response.headers.get('Content-Type', '')
    encoding = http_content_type_encoding(header_value)
    if not encoding:
        encoding = html_body_declared_encoding(response.text)

    if encoding is not None:
        response.encoding = encoding
    return response.text
def find_response_encoding(response):
    """
    Return the charset declared in the HTML body when there is one.
    Otherwise use chardet to estimate the character encoding of the page.
    """
    body = response.body
    declared = html_body_declared_encoding(body)
    if declared:
        return declared

    # No declaration: feed the body to the detector line by line and stop
    # as soon as it reports that it is done.
    detector = UniversalDetector()
    for line in cStringIO.StringIO(body):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']
def _body_declared_encoding(self):
    """Return what ``html_body_declared_encoding`` reports for ``self.body``."""
    body = self.body
    return html_body_declared_encoding(body)
def _body_declared_encoding(cls, response):
    """
    Determine the encoding from the response body.

    The automatic encoding detection approach was borrowed from scrapy.
    """
    body = response.body
    return html_body_declared_encoding(body)
def _body_declared_encoding(self):
    """Return what ``html_body_declared_encoding`` reports for ``self.body``."""
    body = self.body
    return html_body_declared_encoding(body)
def _body_declared_encoding(cls, response):
    """
    Determine the encoding from the response body.

    The automatic encoding detection approach was borrowed from scrapy.
    """
    body = response.body
    return html_body_declared_encoding(body)
def _body_declared_encoding(self):
    """
    Get the encoding declared via <meta charset="..."> in html, xml, etc.
    """
    content = self.content
    return html_body_declared_encoding(content)