def _auto_detect_fun(self, text):
    """Pick the first candidate encoding that can decode ``text``.

    The candidates are tried in order: ``self._DEFAULT_ENCODING``,
    ``'utf-8'`` and ``'cp1252'``.

    Returns:
        ``resolve_encoding(candidate)`` for the first candidate whose
        ``text.decode(candidate)`` does not raise ``UnicodeError``, or
        ``None`` when every candidate fails.
    """
    candidates = (self._DEFAULT_ENCODING, 'utf-8', 'cp1252')
    for candidate in candidates:
        try:
            text.decode(candidate)
        except UnicodeError:
            # This candidate cannot decode the payload; try the next one.
            pass
        else:
            return resolve_encoding(candidate)
    return None
def _auto_detect_fun(self, text):
    """Return the resolved name of the first encoding that decodes ``text``.

    Tries ``self._DEFAULT_ENCODING``, then ``'utf-8'``, then ``'cp1252'``.
    Falls through to an implicit-``None`` result when none of them can
    decode ``text``.
    """
    def _decodes_with(name):
        # True when ``text`` decodes under ``name`` without a UnicodeError.
        try:
            text.decode(name)
        except UnicodeError:
            return False
        return True

    for name in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
        if _decodes_with(name):
            # NOTE(review): an earlier note here questioned why the value
            # returned by resolve_encoding() differs from the detected
            # name -- intent unclear, confirm against resolve_encoding.
            return resolve_encoding(name)
    return None
def _detect_encoding(bytestring, default_encoding='utf-8'):
    """Guess the encoding of ``bytestring`` using chardet.

    Args:
        bytestring: Raw bytes handed to ``chardet.detect``.
        default_encoding: Value returned when no usable encoding name
            can be determined.

    Returns:
        The result of ``resolve_encoding`` on chardet's guess, or
        ``default_encoding`` when chardet reports no encoding or
        ``resolve_encoding`` does not recognise the reported name.
    """
    # NOTE: alternatively `UnicodeDammit(x).originalEncoding`
    # NOTE: alternatively use scrapy.http.TextResponse().text
    guessed = chardet.detect(bytestring).get('encoding')
    if not guessed:
        return default_encoding
    # resolve_encoding() yields None for names it cannot map; without the
    # fallback the caller would receive None despite supplying a default.
    return resolve_encoding(guessed) or default_encoding
def _auto_detect_fun(self, text):
    """Detect an encoding able to decode ``text``.

    First tries a fixed candidate list (``self._DEFAULT_ENCODING``,
    ``'utf-8'``, ``'ascii'``, ``'GB18030'``); if none of them decodes
    ``text``, falls back to chardet's guess.

    Returns:
        ``resolve_encoding(enc)`` for the first encoding that decodes
        ``text``, or ``None`` when no candidate works, chardet reports no
        encoding, or chardet's guess cannot decode ``text``.
    """
    for enc in (self._DEFAULT_ENCODING, 'utf-8', 'ascii', 'GB18030'):
        try:
            text.decode(enc)
        except UnicodeError:
            continue
        return resolve_encoding(enc)

    # Fallback: let chardet guess the encoding.
    guess = chardet.detect(text)
    if guess is None:
        return None
    enc = guess.get('encoding', None)
    if enc is None:
        return None
    try:
        text.decode(enc)
    except (UnicodeError, LookupError):
        # UnicodeError: the guess cannot decode the payload.
        # LookupError: the guessed name is not a codec known to Python.
        # Anything else (e.g. KeyboardInterrupt) must propagate.
        return None
    return resolve_encoding(enc)
def test_process_response_no_content_type_header(self):
    """Response with ``Content-Encoding: identity`` and no Content-Type.

    Checks that the middleware returns a response of the same class with
    the body unchanged and with the encoding equal to
    ``resolve_encoding('gb2312')`` (the charset named in the body's meta
    tag).
    """
    url = "http://www.example.com/index"
    plainbody = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
    headers = {'Content-Encoding': 'identity'}

    # Let responsetypes choose the response class for these arguments.
    respcls = responsetypes.from_args(url=url, headers=headers, body=plainbody)
    response = respcls(url, headers=headers, body=plainbody)
    request = Request(url)

    newresponse = self.mw.process_response(request, response, self.spider)

    assert isinstance(newresponse, respcls)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
def test_process_response_force_recalculate_encoding(self):
    """Gzipped ``HtmlResponse`` whose charset is declared only in the body.

    The Content-Type header carries no charset; after the middleware
    decompresses the body, the response must still be an ``HtmlResponse``,
    expose the plain body, and report the encoding
    ``resolve_encoding('gb2312')``.
    """
    headers = {
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
    }
    plainbody = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""

    # Closing the GzipFile flushes the gzip trailer into ``f`` but leaves
    # the underlying BytesIO open, so getvalue() is still valid afterwards.
    f = BytesIO()
    with GzipFile(fileobj=f, mode='wb') as zf:
        zf.write(plainbody)

    # The URL previously read "http;//" (semicolon typo).
    response = HtmlResponse("http://www.example.com/page.html", headers=headers, body=f.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)

    # assertIsInstance is not stripped under ``python -O`` like a bare assert.
    self.assertIsInstance(newresponse, HtmlResponse)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
def test_process_response_encoding_inside_body(self):
    """Gzipped plain ``Response`` whose charset is declared only in the body.

    After processing, the result must be an ``HtmlResponse`` with the
    plain body and the encoding ``resolve_encoding('gb2312')``, and the
    ``httpcompression/*`` stats must record one response of 104 bytes
    (the length of ``plainbody``).
    """
    headers = {
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
    }
    plainbody = (
        b'<html><head><title>Some page</title>'
        b'<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'
    )

    # Closing the GzipFile flushes the gzip trailer into ``f`` but leaves
    # the underlying BytesIO open, so getvalue() is still valid afterwards.
    f = BytesIO()
    with GzipFile(fileobj=f, mode='wb') as zf:
        zf.write(plainbody)

    # The URL previously read "http;//" (semicolon typo).
    response = Response("http://www.example.com/", headers=headers, body=f.getvalue())
    request = Request("http://www.example.com/")

    newresponse = self.mw.process_response(request, response, self.spider)

    # assertIsInstance is not stripped under ``python -O`` like a bare assert.
    self.assertIsInstance(newresponse, HtmlResponse)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
    self.assertStatsEqual('httpcompression/response_count', 1)
    self.assertStatsEqual('httpcompression/response_bytes', 104)
def _assert_response_encoding(self, response, encoding):
    """Assert ``response.encoding`` equals ``resolve_encoding(encoding)``."""
    actual = response.encoding
    expected = resolve_encoding(encoding)
    self.assertEqual(actual, expected)
def test_resolve_encoding(self):
    """Check ``resolve_encoding`` against a table of (input, expected) pairs."""
    expectations = (
        ('latin1', 'cp1252'),
        (' Latin-1', 'cp1252'),
        ('gb_2312-80', 'gb18030'),
        ('unknown encoding', None),
    )
    # Plain sequential assertions: the first mismatch fails the test.
    for label, canonical in expectations:
        self.assertEqual(resolve_encoding(label), canonical)
def test_resolve_encoding(self):
    """Verify the names ``resolve_encoding`` returns for sample labels.

    An unrecognised label is expected to give ``None``.
    """
    expected_by_label = {
        "latin1": "cp1252",
        " Latin-1": "cp1252",
        "gb_2312-80": "gb18030",
        "unknown encoding": None,
    }
    # dicts preserve insertion order, so the checks run in the listed order.
    for label, expected in expected_by_label.items():
        resolved = resolve_encoding(label)
        self.assertEqual(resolved, expected)