def _unpack_content(self, raw_data): """Extract data from the body of a REST response object. :param raw_data: Data to be processed. This could be a requests.Response object, in which case the json content will be be returned. """ if raw_data and isinstance(raw_data, bytes): data = raw_data.decode( encoding=chardet.detect(raw_data)['encoding']) else: data = raw_data if hasattr(raw_data, 'content'): if not raw_data.content: return None if isinstance(raw_data.content, bytes): encoding = chardet.detect(raw_data.content)["encoding"] data = raw_data.content.decode(encoding=encoding) else: data = raw_data.content try: return json.loads(data) except (ValueError, TypeError): return data return data
def transferencode():
    # Read a previously saved CMS page (the long file name is a flattened
    # URL) and print chardet's guess before writing a re-encoded copy.
    with open(config.http_repo_path+'---awtrc-ict-ac-cn-index-php-mact=News,cntnt01,detail,0&cntnt01articleid=217&cntnt01detailtemplate=custom_detail&cntnt01lang=zh_CN&cntnt01returnid=79.html') as f:
        html=f.read()
    print chardet.detect(html)
    # NOTE(review): `html` is a Py2 byte string here, so .encode('gb2312')
    # first decodes it implicitly as ascii and will raise on non-ascii
    # bytes — presumably only worked for this particular page; confirm.
    html=html.encode('gb2312')
    with open('utf.html','w') as f:
        f.write(html)
def encode():
    # Demo: detect a byte-string literal, then re-encode GB2312 -> utf-8.
    # NOTE(review): assumes the source file (and thus the literal) is saved
    # as GB2312 — confirm; otherwise the decode below raises.
    x = '天气'
    print chardet.detect(x), type(x), x
    y = x.decode('GB2312').encode('utf-8')
    # y = xx.decode('').encode('utf-8')
    # z = x.encode(encoding='utf-8')
    print chardet.detect(y), type(y), y
def dotask(self):
    # Ping self.host 3 times (-n on Windows, -c elsewhere), capture the
    # output, and print stdout/stderr decoded via chardet when they are
    # not already unicode.
    self.cmd = 'ping'
    if sys.platform == "win32":
        self.cmd += ' -n 3 '
    else:
        self.cmd += ' -c 3 '
    self.cmd += self.host
    # os.popen(self.cmd, 'r', self.result)
    print "default coding type: {0}".format(sys.getdefaultencoding())
    # Py2-only hack: reload(sys) restores the hidden setdefaultencoding.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    args = shlex.split(self.cmd)
    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (tmpoutput, tmpouterr) = p.communicate()
    self.retcode = p.returncode
    self.output = "".join(tmpoutput)
    self.outerr = "".join(tmpouterr)
    print "return code: %d" % self.retcode
    print "stdout:"
    if self.output:
        if not isinstance(self.output, unicode):
            self.output = self.output.decode(chardet.detect(self.output)['encoding'])
        print self.output
    print "stderr:"
    if self.outerr:
        if not isinstance(self.outerr, unicode):
            self.outerr = self.outerr.decode(chardet.detect(self.outerr)['encoding'])
        print self.outerr
# NOTE(review): dangling triple-quote below opens a string/comment that
# continues beyond this chunk — left exactly as found.
'''
def get_str_charset(self, the_str):
    # Guess the charset of a byte string: prefer an explicit `charset=`
    # declaration within the first 400 bytes; otherwise grow the chardet
    # sample in 400-byte steps until confidence >= 0.9, falling back to
    # utf-8 when confidence never gets there.
    need_len = 400
    test_str = the_str[:need_len]
    test_str = test_str.decode('ascii', 'ignore').lower()
    cs = 'charset='
    if cs in test_str:
        idx_start = test_str.find(cs) + len(cs)
        idx_end = idx_start + 16
        # take up to 16 chars after 'charset=' and cut at the first quote
        st = test_str[idx_start:idx_end].split('\'')[0].split('\"')[0]
        if len(st) > 0:
            return st
    if len(the_str) < need_len:
        charset_info = chardet.detect(the_str)
    else:
        charset_info = chardet.detect(the_str[:need_len])
    while charset_info['confidence'] < 0.9:
        need_len += 400
        charset_info = chardet.detect(the_str[:need_len])
        if need_len > len(the_str):
            # whole string consumed — stop growing the sample
            break
    if charset_info['confidence'] >= 0.9:
        return charset_info['encoding']
    else:
        return 'utf-8'
def __init__(self, extractor='DefaultExtractor', **kwargs):
    """Load text from a url or raw html and run a boilerpipe extractor.

    :param extractor: boilerpipe extractor class name (de.l3s.boilerpipe.extractors.*)
    :param kwargs: exactly one of `url` or `html` must be provided
    :raises Exception: when neither `url` nor `html` is given
    """
    if kwargs.get('url'):
        request = urllib2.urlopen(kwargs['url'])
        self.data = request.read()
        encoding = request.headers['content-type'].lower().split('charset=')[-1]
        if encoding.lower() == 'text/html':
            # No charset in the Content-Type header — sniff the bytes.
            encoding = chardet.detect(self.data)['encoding']
        self.data = unicode(self.data, encoding)
    elif kwargs.get('html'):
        self.data = kwargs['html']
        if not isinstance(self.data, unicode):
            self.data = unicode(self.data, chardet.detect(self.data)['encoding'])
    else:
        raise Exception('No text or url provided')
    # make it thread-safe: attach this thread to the JVM if needed
    if threading.activeCount() > 1:
        if jpype.isThreadAttachedToJVM() == False:
            jpype.attachThreadToJVM()
    # BUG FIX: acquire the lock OUTSIDE the try block — in the original,
    # an exception raised before acquire() would still run the finally
    # clause and release a lock this thread never held.
    lock.acquire()
    try:
        self.extractor = jpype.JClass(
            "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
    finally:
        lock.release()
    reader = StringReader(self.data)
    self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
    self.extractor.process(self.source)
def writeData(self):
    # Build a multi-row INSERT statement string from self.fields/self.values,
    # starting from self.prefix.  String values are charset-detected,
    # decoded, and single-quote-escaped before being quoted.
    query = self.prefix
    for i in range(0,len(self.values)):
        if i > 0:
            query += ", "
        query += "("
        for j in range(0,len(self.fields)):
            if j > 0:
                query += ", "
            if not isinstance(self.values[i][j], (str, unicode)):
                # Is not string: quoted str() form.
                # NOTE(review): the "NULL" / GeomFromText checks below are
                # only reached for STRING values — confirm that is intended.
                query += "'" + str(self.values[i][j]) + "'"
            elif self.values[i][j] == "NULL":
                query += "NULL"
            elif self.values[i][j][0:12] == "GeomFromText":
                # geometry expression: inserted unquoted
                query += self.values[i][j]
            else:
                try:
                    if self.values[i][j] == '':
                        value = u''
                    else:
                        charset = chardet.detect(self.values[i][j])['encoding']
                        value = unicode(self.values[i][j].decode(charset).replace(u"'",u"\\'"))
                    query += u"'" + value + u"'"
                except Exception, e:
                    # dump the half-built query and the offending row, then bail
                    print query
                    print self.values[i]
                    print chardet.detect(self.values[i][j])
                    print e
                    sys.exit(1)
        query += ")"
def SavePhoto(aid, pid, purl, ptitle): global g_download_success_count global g_download_fail_count global g_download_fail_list dirname = os.path.join("data" , aid) if not os.path.exists(dirname): os.makedirs(dirname) download_success = False for i in range(0,3): try: print purl photo_content = urllib2.urlopen(purl).read() download_success = True break except Exception: print "download fail, sleep 3 seconds" time.sleep(3) if not download_success: g_download_fail_count += 1 g_download_fail_list.append(purl) return filename = os.path.join(dirname.decode("utf-8"), pid+ ".jpg") f = open(filename, 'w') f.write(photo_content) f.close() filename = os.path.join(dirname, pid + '.txt') f = open(filename, 'w') print ptitle print chardet.detect(ptitle) f.write(ptitle) f.close() g_download_success_count +=1
def html_to_txt(): """将多个html文件合并为一个txt文件,统一编码为utf-8 or ascii """ ft = open(YAHOO_TXT, 'w') start = 1 while 1: filename = YAHOO_DIR+ str(start) + '.html' if os.path.isfile(filename): fp = open(filename, 'r') htmltxt = ''.join(fp.readlines()) if not htmltxt or not len(htmltxt): continue fp.close() codedetect = chardet.detect(htmltxt)["encoding"] #检测得到修改之前的编码方式 print codedetect if not codedetect in ['utf-8', 'ascii']: htmltxt = unicode(htmltxt, codedetect).encode('utf-8') codedetect = chardet.detect(htmltxt)["encoding"] #检测得到修改之后的编码方式 print 'change', codedetect ft.write(html2txt(htmltxt)) print 'Success change html to txt %s' % start start += 1 else: break ft.close()
def decode_by_charset(content):
    # type: (bytes) -> Text
    r"""
    Detect the charset encoding of a string and decodes to unicode strings.

    >>> decode_by_charset(u'\u4e2d\u6587'.encode('UTF-8'))
    '\u4e2d\u6587'
    >>> decode_by_charset(u'\u4e2d\u6587'.encode('HZ-GB-2312'))
    '\u4e2d\u6587'
    """
    encoding = chardet.detect(content)['encoding']
    # Sometimes, the content is well encoded but the last few bytes. This is
    # common in the files downloaded by old versions of OSD Lyrics. In this
    # case, chardet may fail to determine what the encoding it is. So we take
    # half of the content of it and try again.
    if not encoding and len(content) > DETECT_CHARSET_GUESS_MIN_LEN:
        logging.warning('Failed to detect encoding, try to decode a part of it')
        content_half = len(content) // 2
        slice_end = min(max(DETECT_CHARSET_GUESS_MIN_LEN, content_half),
                        DETECT_CHARSET_GUESS_MAX_LEN)
        encoding = chardet.detect(content[:slice_end])['encoding']
        # BUG FIX: the original concatenated `'...' + encoding`, which raised
        # TypeError when this second attempt also returned None.  Lazy %s
        # formatting handles None and avoids building the message eagerly.
        logging.warning('guess encoding from part: %s', encoding)
    if not encoding:
        logging.warning('Failed to detect encoding, use utf-8 as fallback')
        encoding = 'utf-8'
    # When we take half of the content to determine the encoding, chardet may
    # think it be encoded with ascii, however the full content is probably
    # encoded with utf-8. As ascii is a subset of utf-8, decoding an ascii
    # string with utf-8 will always be right.
    if encoding == 'ascii':
        encoding = 'utf-8'
    return content.decode(encoding, 'replace')
def _try_decode_bytes_(raw_bytes: bytes) -> str:
    """helper function for decode_byte, try to decode the raw bytes

    :param raw_bytes: the bytes you get and want to decode to string
    :return: A decoded string
    """
    # Detect the encoding with only the first couple of bytes
    encoding_detect = chardet.detect(
        raw_bytes[:constants.MIN_ENCODING_DETECT])
    # get the encoding
    encoding_type = encoding_detect['encoding']
    if encoding_type is None:
        # the short prefix was not enough — analyse everything
        encoding_detect = chardet.detect(raw_bytes)
        encoding_type = encoding_detect['encoding']
    if encoding_type is None:
        # BUG FIX: chardet can return None even for the full input, and
        # bytes.decode(None) raises TypeError; fall back to utf-8.
        encoding_type = 'utf-8'
    try:
        # try to decode the string using the encoding we get
        decoded_string = raw_bytes.decode(encoding_type)
    except UnicodeDecodeError:
        # if decoding failed, we use all the bytes to detect encoding
        encoding_detect = chardet.detect(raw_bytes)
        encoding_type = encoding_detect['encoding'] or 'utf-8'
        decoded_string = raw_bytes.decode(encoding_type)
    return decoded_string
def setTotals(self):
    # Collect summary statistics (counts, lengths, encodings, file sizes)
    # for the ST.25 txt input file and the cleaned ST.26 xml output file.
    rows = self.esc.generalInformationRows + self.esc.sequenceRows
    self.totals[cu.FILE] = os.path.basename(self.inFilePath)
    self.totals[cu.QUANTITY] = self.esc.seql.generalInformation.quantity
    self.totals[cu.SEQUENCES_NUC] = self.esc.seql.quantity_nuc
    self.totals[cu.SEQUENCES_PRT] = self.esc.seql.quantity_prt
    self.totals[cu.MIXED_MODE] = self.esc.seql.quantity_mix
    # columns 2..5 of each row hold the per-element length figures
    self.totals[cu.ELEMENT_ST25_LENGTH] = sum([r[2] for r in rows])
    self.totals[cu.VALUE_LENGTH] = sum([r[3] for r in rows])
    self.totals[cu.TAG_ST26_LENGTH] = sum([r[4] for r in rows])
    self.totals[cu.ELEMENT_ST26_LENGTH] = sum([r[5] for r in rows])
    with open(self.inFilePath, 'r') as inf:
        s_txt = inf.read()
        self.totals[cu.CHARS_TXT_FILE] = len(s_txt)
        self.totals[cu.ENCODING_TXT] = chardet.detect(s_txt)['encoding']
        self.totals[cu.FILE_SIZE_TXT] = os.path.getsize(self.inFilePath)
    with open(self.cleanXmlFilePath, 'r') as f:
        s_xml = f.read()
        self.totals[cu.CHARS_XML_CLEAN_FILE] = len(s_xml)
        self.totals[cu.ENCODING_XML] = chardet.detect(s_xml)['encoding']
        self.totals[cu.FILE_SIZE_XML_CLEAN] = os.path.getsize(self.cleanXmlFilePath)
    print self.inFilePath
    print 'encoding:', self.esc.seql.charEncoding
def convertor(test, encoding=""):
    """
    convert zhpy source (Chinese) to Python Source

    >>> convertor("印出 'hello'")
    "print 'hello'"

    >>> convertor("印出 'hello'", encoding="utf8")
    "print 'hello'"

    more keyword test cases are in /tests folder.
    """
    # Replace every zhpy keyword with its Python equivalent first.
    for k, v in replacedict.items():
        test = test.replace(k,v)
    if encoding:
        utest = test.decode(encoding)
    else:
        try:
            #detect encoding
            det = chardet.detect(test)
            if det['confidence'] >= 0.8:
                encoding = chardet.detect(test)['encoding']
            else :
                #print 'low confidence encoding detection, use utf8 encoding'
                encoding = 'utf8'
            utest = test.decode(encoding)
        except UnicodeDecodeError, e:
            print "can't recognize your language, set to utf-8"
            utest = test.decode('utf8')
        except ImportError, e:
            #no chardet mode
            utest = test.decode('utf8')
    # NOTE(review): no return statement is visible — this chunk appears
    # truncated here; the longer convertor() variant below suggests more
    # processing follows.
def getajax(url):
    # Load `url` in the shared selenium browser (JS executed), extract the
    # visible text with BeautifulSoup and utf-8-encode it.  A timed-out page
    # still has its partial source extracted, prefixed with 'TIMEDOUT'.
    if not pattern.match(url):
        url = 'http://' + url
    try:
        browser.get(url)
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            # text not directly utf-8-encodable: sniff and transcode
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
    except TimeoutException:
        # page load timed out — work with whatever source we already have
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
        n = 'TIMEDOUT'+n
    except WebDriverException as error:
        if 'MALFORMED_URI' in error.msg:
            n = 'MALFORMED_URI'
        else:
            raise error
    except Exception, error:
        raise error
    # NOTE(review): no return of `n` is visible in this chunk — presumably
    # truncated; confirm against the full source.
def adb(self, *args):
    """Entry point for running an adb command.

    :param args: adb sub-command and its arguments, joined into one shell line
    :return: stdout lines, minus empty lines and the adb daemon startup banner
    """
    if self.__serial:
        # target a specific device when a serial number is configured
        cmd = " ".join([self.__adb_name, '-s', self.__serial] + list(args))
    else:
        cmd = " ".join([self.__adb_name] + list(args))
    stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    if self.__debug:
        print cmd
        print stdout.strip().decode(self.__output_coding)
        if stderr:
            try:
                print stderr.decode(self.__output_coding)
            except Exception as e:
                # configured codec failed — dump raw bytes plus chardet's guess
                print e
                print '-'*100
                print stderr
                import chardet
                print chardet.detect(stderr)
    # filter out empty lines and the adb daemon startup message
    return [i for i in stdout.splitlines() if i and not i.startswith("* daemon")]
def process_detail_info(detail_page):
    """
    Process a car listing detail page: extract pay type, brand and model.

    :param detail_page: html of the detail page (parsed with pyquery)
    :return: tab-separated "pay_type\tcar_brand\tcar_type" string
    """
    title = pq(detail_page)(".tb-detail-hd h1").text()
    if isinstance(title, str) and chardet.detect(title)["encoding"]:
        # byte-string title with a detectable encoding: promote to unicode
        title = unicode(title, chardet.detect(title)["encoding"])
    pay_type = u""
    if pay_types["total_pay"] in title:
        pay_type = pay_types["total_pay"]
    elif pay_types["bargin_pay"] in title:
        pay_type = pay_types["bargin_pay"]
    elif pay_types["down_pay"] in title:
        pay_type = pay_types["down_pay"]
    else:
        pay_type = pay_types["other"]
    # car_price = pq(detail_page)('.tm-price-panel .tm-price').text()  # price is lazy-loaded, unavailable here
    car_brand = pq(detail_page)("#J_attrBrandName").text()
    # NOTE(review): str() on a unicode value containing Chinese raises
    # UnicodeEncodeError on Py2 — presumably these values are byte strings
    # here; confirm.
    car_brand = str(car_brand).replace(u"品牌:", u"")
    car_type = pq(detail_page)("#J_AttrUL li").eq(1).text()
    if str(car_type).__contains__(u"车型:"):
        car_type = str(car_type).replace(u"车型:", u"")
    if str(car_type).__contains__(u"车系,"):
        car_type = str(car_type).replace(u"车系,", u"")
    # detail_quantity = pq(detail_page)('.tm-ind-panel .tm-ind-sellCount').text()  # monthly sales, currently unobtainable
    # if not detail_quantity:
    #     detail_quantity = -1
    entity = "%s\t%s\t%s" % (pay_type, car_brand, car_type)
    # print entity
    return entity
def convertor(text, verbose=False, encoding="", outcoding=""): """ convert zhpy source (Chinese) to Python Source. annotator will be called automatically. Accept args: test: source to be converted verbose: show detail message, default: False encoding: codec for encoding outcoding: codec for output encoding #annotator() >>> convertor("印出 'hello'") "print 'hello'" >>> convertor("印出 'hello'", encoding="utf8") "print 'hello'" >>> convertor('測試_範例') 'test_p_7bc4_4f8b_v' more keyword test cases are in /tests folder. """ # annotate if necessary annotator(force=False) #Use the provided encoding, if not exist select utf-8 as default. if encoding and encoding.lower() != 'utf-8': utext = text.decode(encoding) else: if has_chardet: try: #detect encoding det = chardet.detect(text) if verbose: print "chardet", det if det['confidence'] >= 0.8: encoding = chardet.detect(text)['encoding'] else: if verbose: print """low confidence encoding detection, use utf8 encoding""" encoding = 'utf8' #prepare for unicode type support if isinstance(text, unicode): utext = text else: utext = text.decode(encoding) except UnicodeDecodeError, e: print "can't recognize your language, \ set to sys.stdout.encoding" utext = text.decode('utf8') except ImportError, e: if verbose: print "proceed no chardet mode" utext = text.decode('utf8')
def detect(self, line, num=450): try: l = len(line) if l < 1200: return chardet.detect(line)['encoding'] else: #first res1 = chardet.detect(line[: num]) #second str2 = line[l/2: l/2 + num] start = str2.find(' ') if start == -1: start = 0 res2 = chardet.detect(str2[start:]) if res1['encoding'] != res2['encoding']: if res1['encoding'] == 'ascii': return res2['encoding'] else: str3 = line[l/3: l/3 + num] start = str2.find(' ') if start == -1: start = 0 #third res3 = chardet.detect(str3[start:]) if res3['encoding'] == res2['encoding']: return res2['encoding'] else: return res1['encoding'] else: return res1['encoding'] except: print "detect error, return None" return None
def decode(self,content,url):
    # Decode `content` using this host's cached encoding when available,
    # falling back to chardet detection; successful detections are cached
    # on self.encoding and persisted to redis keyed by netloc.
    result = content
    if not ALWAYS_CHAR_DETECT and self.encoding:
        # an encoding was cached from a previous page of this host
        try:
            result = content.decode(self.encoding)
        except UnicodeDecodeError:
            # cached encoding failed — auto-detect instead
            encoding = chardet.detect(content)['encoding']
            try:
                # if garbled output appears, try: export LANG="en_US.UTF-8"
                result = content.decode(encoding,'ignore')
            except UnicodeDecodeError:
                # still failing — give up and return the raw bytes
                self.encoding = None
                result = content
            except TypeError:
                # chardet returned None for the encoding
                self.encoding = None
                result = content
            else:
                # remember the working encoding for next time
                self.encoding = encoding
                # persist to redis
                netloc = urlparse.urlsplit(url)[1]
                r.set(netloc,encoding)
    else:
        # no cached encoding yet — detect and decode
        netloc = urlparse.urlsplit(url)[1]
        self.encoding = chardet.detect(content)['encoding']
        try:
            result = content.decode(self.encoding)
        except UnicodeDecodeError:
            # detection failed too — return raw bytes unchanged
            result = content
        else:
            # persist to redis
            r.set(netloc,self.encoding)
    return result
def get_text_content(self):
    # Return the item's content as unicode text: text/* types are decoded
    # via chardet; application/pdf is extracted with slate.  Returns the
    # sentinel string 'NOT_A_TEXT_FILE' for undetectable text, and None
    # for unsupported types or failed pdf extraction.
    if self.content_type.lower().startswith('text/'):
        file_contents = self.get_content()
        # unicode(self.get_content(), errors='xmlcharrefreplace')
        result = chardet.detect(file_contents)
        if result['encoding'] != None:
            try:
                return unicode(file_contents, result['encoding'])
            except:
                # decoding blew up — surface the error as the content
                return unicode('Error: ' + str(sys.exc_info()[1]))
        else:
            return 'NOT_A_TEXT_FILE'
    elif self.content_type.lower().startswith('application/pdf'):
        try:
            f = slate.PDF(open(self.content_path))
            file_contents = ''
            for page in f:
                file_contents += page
            result = chardet.detect(file_contents)
            return unicode(file_contents, result['encoding'])
        except:
            # best-effort: any pdf failure falls through to None
            # traceback.print_exc()
            pass
    return None
def merge_subtitles(in_filename1, in_filename2, out_filename):
    """Merge two subtitle files into a single ASS file: track 1 uses the
    default style, track 2 the 'Alternate' style, offset vertically so the
    two do not overlap."""
    # Detect file encodings.  Read as bytes (what chardet expects) and use
    # with-blocks — the original leaked both file handles.
    with open(in_filename1, 'rb') as f:
        encoding1 = chardet.detect(f.read())['encoding']
    with open(in_filename2, 'rb') as f:
        encoding2 = chardet.detect(f.read())['encoding']
    # create aeidon projects and load both tracks
    project1 = aeidon.Project()
    project2 = aeidon.Project()
    project1.open_main(in_filename1, encoding1)
    project2.open_main(in_filename2, encoding2)
    # setup output format
    out_format = aeidon.files.new(aeidon.formats.ASS, out_filename, "utf_8")
    out_format.header = header
    # derive track 1's vertical margin from the template header so it sits
    # above the Alternate style's text
    header_lines = header.split('\n')
    defalut_margin_v = int(header_lines[6].split(',')[-1])
    alternate_fontsize = int(header_lines[7].split(',')[2])
    event_margin_v = defalut_margin_v + alternate_fontsize
    # modify event entries
    for subtitle in project1.subtitles:
        subtitle.main_text = subtitle.main_text.replace('\n', ' ')
        subtitle.ssa.margin_v = event_margin_v
    for subtitle in project2.subtitles:
        subtitle.main_text = subtitle.main_text.replace('\n', ' ')
        subtitle.ssa.style = 'Alternate'
    project1.subtitles.extend(project2.subtitles)
    project1.save_main(out_format)
def check_enc_fixed(url):
    # Fetch `url`, reconcile bs4's sniffed encoding with the page's declared
    # encoding (preferring the declared one when chardet's confidence is
    # low), then print the first 200 chars of the page's visible text.
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print ("Origignal encoding: {} vs declared_html_encoding: {}"
               "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}". format(chardet.detect(r.content))
    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of an encoding: when sniffed and declared
    # disagree and chardet merely echoes the sniff with low confidence,
    # trust the declaration instead
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if the page contains any characters that differ from the main
    # encoding we will ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)
    text = re.sub('\s+', ' ', text)
    print text[:200]
def html_to_txt(): ft = open(BAIDU_TXT,'w') start = 1 while 1: filename = BAIDU_DIR+str(start)+'.html' if os.path.isfile(filename): fp = open(filename, 'r') htmltxt = ''.join(fp.readlines()) if not htmltxt or not len(htmltxt): continue fp.close() codedetect = chardet.detect(htmltxt)["encoding"] print codedetext if not codedetect in ['utf-8', 'ascii']: htmltxt = unicode(htmltxt, codedetect).encode('utf-8') codedetect = chardet.detect(htmltxt)["encoding"] print 'change', codedetect ft.write(html2txt(htmltxt)) print 'Success change html to txt %s' % start start+=1 else: break ft.close()
def charpick(files):
    # Load each raw file from a fixed Windows directory, transcode it from
    # ISO-8859-2 to utf-8, and insert it into the url.urlsoft2 table.
    path = r"D:\testdata\testsoft1\raw"
    path = path.replace('\\', '/')
    conn = mysql.connector.connect(host='127.0.0.1', user='******', password='******', port='3306', database='url', use_unicode=True)
    cursor = conn.cursor()
    for filename in files:
        #print chardet.detect(filename)
        with open(path + '/' + filename) as f1:
            data0 = f1.read()
        #print chardet.detect(data)
        # NOTE(review): the encoding is hard-coded rather than detected —
        # the chardet calls are only for display/diagnostics.
        data1=data0.decode("ISO-8859-2")
        data=data1.encode("utf-8")
        print chardet.detect(data)
        # parameterised insert — values are escaped by the driver
        cursor.execute("insert into urlsoft2 (url) values (%s)",[data])
        conn.commit()
    cursor.close()
    conn.close()
def getURL(url):
    # Fetch `url` with cookie support, re-encode the body from its detected
    # encoding to the filesystem encoding, and print cookies plus content.
    cJar = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cJar)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    # NOTE(review): duplicate empty keys collapse to a single empty header
    # entry — presumably placeholders; confirm intent.
    headers = {"": "", "": ""}
    data = {}
    getdata = urllib.urlencode(data)
    req = urllib2.Request(
        url=url,
        # data=getdata,
        headers=headers,
    )
    result = urllib2.urlopen(req).read()
    typeEncode = sys.getfilesystemencoding()  # system default encoding
    print chardet.detect(result)
    # auto-detect the page encoding with chardet
    infoencode = chardet.detect(result).get("encoding", "utf-8")
    if infoencode:
        # decode to unicode first, then re-encode to the system encoding
        result = result.decode(infoencode, "ignore").encode(typeEncode)
    for ind, cookie in enumerate(cJar):
        print "%d - %s" % (ind, cookie)
    print "HTML Content:"
    print result
def win_ping(self,inter='test',host=None,num=2,times=2,expect=1):
    # Ping `host` from Windows up to `times` rounds of `num` packets each;
    # success (stat=1) is the first round showing 0% loss in either the
    # English or Chinese ping output.  `expect` (1/0) is compared with the
    # outcome and an ExpectError raised on mismatch.
    print("run keyword:%s"%(sys._getframe().f_code.co_name))
    inter=_unicode_to_utf(inter)
    host=_unicode_to_utf(host)
    num=_unicode_to_utf(num)
    times=_unicode_to_utf(times)
    expect=_unicode_to_utf(expect)
    msgs=[]
    stat=0
    for i in range(int(times)) :
        tmp_msgs=[]
        cmd='ping '+str(host)+' -n '+str(num)
        print("%s"%cmd)
        p=subprocess.Popen(cmd,stdin=subprocess.PIPE,stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
        p.wait()
        tmp_msgs.append(p.stdout.read())
        tmp_msgs.append(p.stderr.read())
        p.terminate()
        msgs.extend(tmp_msgs)
        # matches "(0% loss)" in both the English and Chinese ping output
        reobj=re.compile('\(0% loss\)|\(0% 丢失\)')
        tmp_msgs=' '.join(tmp_msgs)
        # console output encoding varies (e.g. GBK on Chinese Windows) —
        # normalise to utf-8 before matching
        tmp_msgs=unicode(tmp_msgs,chardet.detect(tmp_msgs)['encoding']).encode('utf-8')
        if reobj.search(tmp_msgs) :
            stat = 1
            break
    print("%s"%unicode('\n'.join(msgs),chardet.detect('\n'.join(msgs))['encoding']))
    if expect != 'None' and expect != None :
        if int(stat) == int(expect) :
            print("Expect is %s, actually %s"%('success' if int(expect) == 1 else 'fail','success' if int(expect) == 1 else 'fail'))
        else:
            raise ExpectError(message="Expect is %s, actually %s"%("Success" if int(expect) == 1 else "failed","success" if int(stat) == 1 else "failed"))
def open(fn, write, codecname = 'mskanji', autodetect = 'safedetect'):
    """Open `fn` for reading or writing with optional charset autodetection.

    :param fn: path to open
    :param write: True opens for writing (no detection), False for reading
    :param codecname: default/fallback codec
    :param autodetect: one of 'no', 'chardet', 'mskanji', 'safedetect'
    :return: a text-mode file object
    :raises LookupError: for an unknown autodetect mode
    """
    if write or (autodetect == 'no'):
        return io.open(fn, {True: 'w', False: 'r'}[write], encoding=codecname)
    elif autodetect == 'chardet' or autodetect == 'mskanji' or autodetect == 'safedetect':
        with io.open(fn, 'rb') as f:
            buf = f.read()
        try:
            if autodetect == 'safedetect':
                # read only BOM and early characters.
                result = chardet.detect(buf[:10])
            else:
                result = chardet.detect(buf)
            confidence = result.get('confidence', 0.0)
            encoding = result.get('encoding', None)
            # `is None` replaces the original `== None` (same effect,
            # correct identity idiom).
            if encoding is None or confidence < 0.1:
                encoding = codecname
            elif encoding.upper().startswith('UTF-8'):
                encoding = 'utf-8-sig'  # UTF-8 -> UTF-8-SIG (BOM)
            elif encoding.upper().startswith('UTF-16'):
                encoding = 'UTF-16'  # UTF-16LE -> UTF-16 (BOM)
            elif autodetect == 'safedetect':
                # in safe mode trust only UTF verdicts; anything else keeps
                # the configured codec
                if not encoding.upper().startswith('UTF'):
                    encoding = codecname
            elif autodetect == 'mskanji' and encoding.upper() == 'SHIFT_JIS':
                encoding = 'mskanji'
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; any detection failure falls back
            encoding = codecname
        # wrap the already-read bytes so the caller gets a text stream
        file = io.BytesIO(buf)
        setattr(file, 'name', fn)
        return io.TextIOWrapper(file, encoding=encoding)
    else:
        raise LookupError(u'unknown autodetect mode: %s' % autodetect)
def printlog(key,out): print "=================== Order by " + key + " =======================" print "[.] Start to output!" resList = sorted(resultList,key = lambda e:e.__getitem__(key)) xlwtwork = xlwt.Workbook(encoding='utf-8') ws = xlwtwork.add_sheet('info') ws.write(0, 0, '网站状态') ws.write(0, 1, '网站地址') ws.write(0, 2, '允许HTTP请求方法') ws.write(0, 3, '网站标题') order = 1 for res in resList: ws.write(order, 0, res['status']) ws.write(order, 1, res['target']) ws.write(order, 2, res['head_allow']) if chardet.detect(res['title'])['encoding'] == 'GB2312': ws.write(order, 3, res['title'].decode('GB2312').encode('utf-8')) elif chardet.detect(res['title'])['encoding'] == 'ascii': ws.write(order, 3, res['title'].decode('ascii').encode('utf-8')) elif chardet.detect(res['title'])['encoding'] == 'utf-8': ws.write(order, 3, res['title']) else: print '[-](手工补录)不支持 ' + res['target'] + ' title编码格式:' + chardet.detect(res['title'])['encoding'] + ' 输出到表格' order = order + 1 xlwtwork.save(out) print "[.] End output!" print "======================================================="
def whatisthis(s):
    # Print which kind of string `s` is (Py2 str vs unicode) along with
    # chardet's verdict; chardet only accepts byte strings, hence the
    # explanatory message for unicode input.
    if isinstance(s,str):
        print "Ordinary string : ", chardet.detect(s)
    elif isinstance(s,unicode):
        print "Unicode string, chardet Expected a bytes object, not a unicode object"
    else:
        # non-string input is passed straight to chardet (may raise)
        print "Not a string : ",chardet.detect(s)
def check(dir): count = 0 for file in os.listdir(dir): checkfile = open(os.path.join(dir,file),"r") content = checkfile.read() print str(count)+' '+str(chardet.detect(content)['confidence'])+' '+str(chardet.detect(content)['encoding']) count+=1
#!/usr/bin/env python # _*_ coding:utf-8 _*_ str1 = "我们" print(str1) print(type(str1)) str1 = "我们" str_utf8 = str1.encode('utf-8') print(str_utf8) print(type(str_utf8)) str_decode = str1.encode('utf-8').decode('utf-8') print(str_decode) print(type(str_decode)) import chardet str_gbk = "我们".encode('gbk') print(chardet.detect(str_gbk)) # str_unicode_decode = "我们".decode() # str_utf8 = "我们".encode('utf-8') # str_gbk = str_utf8.encode('gbk') str_utf8 = "我们".encode('utf-8') str_gbk = str_utf8.decode('utf-8').encode('gbk') print(str_gbk)
def code_detecter(self, data):
    """Best-effort charset detection for `data`; any failure at all falls
    back to windows-1251."""
    try:
        guess = chardet.detect(data)
        return guess['encoding']
    except:
        return "windows-1251"
def get_encoding_type(self, file):
    """Return the chardet-detected encoding of `file` (read as raw bytes)."""
    print_blue(file)
    with open(file, 'rb') as handle:
        raw = handle.read()
    return detect(raw)['encoding']
    # NOTE(review): this `return` is the tail of a function whose definition
    # lies outside this chunk — left exactly as found.
    return extract(lda, count_feature_names, no_top_words)

# Survey 1: read responses (column 1) and cluster them with LDA.
with open(SURVEY_FILE_NAME) as content_file:
    responses = []
    reader = csv.reader(content_file)
    for row in reader:
        if row[1]:
            responses.append(row[1])
    LDA_clusters = LDA(responses)

# Create CSV and json
# NOTE(review): `probabilities` is not defined anywhere in this chunk —
# presumably produced by LDA() as a side effect or defined earlier; verify.
with open('PROB_'+SURVEY_FILE_NAME,'w+') as out_file, open('TOPICS_'+SURVEY_FILE_NAME.replace('csv','json'),'w+') as jf:
    writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
    for idx, a_resp in enumerate(responses):
        writer.writerow([a_resp]+probabilities[idx])
    json.dump(LDA_clusters,jf, indent=4)

#-----------------------------------------------------------------------------------------------------------------------------
# Survey 2: same pipeline, but the file encoding is sniffed first.
with open(SURVEY2_FILE_NAME,'rb') as f:
    result = chardet.detect(f.read())
with open(SURVEY2_FILE_NAME, encoding=result['encoding']) as content_file:
    responses = []
    reader = csv.reader(content_file)
    for row in reader:
        if row[1]:
            responses.append(row[1])
    LDA_clusters = LDA(responses)

# Create CSV
with open('PROB_'+SURVEY2_FILE_NAME,'w+') as out_file, open('TOPICS_'+SURVEY2_FILE_NAME.replace('csv','json'),'w+') as jf:
    writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
    for idx, a_resp in enumerate(responses):
        writer.writerow([a_resp]+probabilities[idx])
    json.dump(LDA_clusters,jf, indent=4)
    # NOTE(review): this chunk begins mid-function, inside an unseen `try`
    # of a page-fetching routine; indentation below is reconstructed
    # best-effort from the surrounding branch structure.
        except Exception,e:
            # print ('unknow error')
            logger.error('unknow error %s: request fail url:%s'% (str(e), url))
    elif detect_flag == "web_detect":
        # render the page with PhantomJS so JS-injected content is present
        driver = webdriver.PhantomJS(executable_path=phantomjs_dir)
        driver.get(url)
        time.sleep(5)
        html_src = driver.page_source
        # print html_src
        # print 'type:%s' % type(html_src)
        driver.close()
        # page_source is unicode after dynamic loading; convert to utf-8
        html = html_src.encode('utf-8')
        page_html = html
    if page_html:
        html_srccode = chardet.detect(page_html)['encoding']
        # page_html = html_encode(page_html)
        page_html = page_html.decode(encoding=html_srccode, errors='replace')
        # strip newlines and spaces
        page_html = page_html.replace('\n', '').replace(' ','')
        # print page_html
    return page_html

def structtime_to_timestamp(time_sourse):
    # Convert 'Y-m-d H:M:S' (or bare 'Y-m-d') into a unix-timestamp string.
    try:
        time_stamp_soure = time.mktime(time.strptime(time_sourse, '%Y-%m-%d %H:%M:%S'))
    except ValueError:
        # fall back to date-only input
        time_stamp_soure = time.mktime(time.strptime(time_sourse, '%Y-%m-%d'))
    return str(int(time_stamp_soure))

def html_encode(html_1):
    # NOTE(review): truncated in this chunk — only the chardet call is
    # visible; the rest of the body lies beyond this view.
    encoding_dict = chardet.detect(html_1)
def jiexi(self, name):
    """Parse a BloodHound JSON export (one JSON document per line) and dump
    a readable report of every object in it.

    Which section is written depends on the export file's name:
    computers/domains/gpos/groups/ous/users each go to their own report
    file (self.name .. self.name6).
    """
    # Sniff the file encoding from the raw bytes; with-block closes the
    # handle (the original leaked it).
    with open(name, 'rb') as raw:
        end = chardet.detect(raw.read())
    enc = end['encoding']
    # section-key-in-filename -> report path.  Replaces six near-identical
    # copy-pasted branches; the originals for 'ous' and 'users' also
    # printed the WRONG destination (self.name4) in their status message.
    sections = (
        ('computers', self.name),
        ('domains', self.name2),
        ('gpos', self.name3),
        ('groups', self.name4),
        ('ous', self.name5),
        ('users', self.name6),
    )
    with open(name, 'r', encoding=enc) as dk:
        for line in dk:
            data = json.loads(line.strip('\n'))
            for key, out_path in sections:
                if key in name:
                    print('[+] wait write file:{}'.format(out_path))
                    self._write_report(data[key], data['meta'], out_path, enc)
                    break

def _write_report(self, entries, meta, out_path, enc):
    """Append one report section to `out_path`: each entry's identifier,
    ACEs and properties, followed by the export-level metadata.  A single
    file handle is used (the original opened a new one per printed line)."""
    with open(out_path, 'a', encoding=enc) as out:
        for entry in entries:
            print('=' * 90, file=out)
            print('{}:{}'.format('ObjectIdentifier', entry['ObjectIdentifier']), file=out)
            for ace in entry['Aces']:
                for field in ace:
                    print('{}:{}'.format(field, ace[field]), file=out)
                print('', file=out)
            props = entry['Properties']
            for p in props:
                print('{}:{}'.format(p, props[p]), file=out)
            print('', file=out)
        for m in meta:
            print('{}:{}'.format(m, meta[m]), file=out)
import chardet

# Detect the encoding of a plain ASCII byte string.
ascii_sample = b'Hello World'
print(chardet.detect(ascii_sample))

# Detect the encoding of UTF-8-encoded Chinese text.
utf8_sample = '中文'.encode('utf-8')
print(chardet.detect(utf8_sample))
def encoding_file():
    """Prompt for a file name and sniff its encoding.

    Returns a ``(file_name, encoding)`` tuple; the encoding is whatever
    chardet reports for the file's raw bytes (may be ``None``).
    """
    path = input_file_name()
    with open(path, 'rb') as handle:
        detection = chardet.detect(handle.read())
    return path, detection['encoding']
import os
import glob
import chardet

# Report chardet's detected encoding for every CSV file in a
# user-supplied directory.
path = input('input a csv_file path:')
os.chdir(path)
for csv_file in glob.glob('*.csv'):
    # Context manager guarantees the handle is closed; the original
    # open(...).read() leaked a file descriptor per CSV file.
    with open(csv_file, "rb") as f:
        raw = f.read()
    print(csv_file, chardet.detect(raw))
# -*- coding: utf-8 -*-
'''
Download a page with urllib.request and auto-detect its encoding.
'''
from urllib import request
import chardet

if __name__ == '__main__':
    url = 'http://finance.eastmoney.com/news/1345,20180923951189228.html'
    # The response object is a context manager; close it deterministically.
    with request.urlopen(url) as rsp:
        html = rsp.read()
    # Let chardet guess the page encoding from the raw bytes.
    cs = chardet.detect(html)
    print(type(cs))
    print(cs)
    # chardet always includes an 'encoding' key, but its value may be
    # None for undetectable input; dict.get's default is NOT applied in
    # that case, so the original `cs.get("encoding", "utf-8")` could
    # pass None to decode() and raise TypeError. Fall back explicitly.
    html = html.decode(cs.get("encoding") or "utf-8")
    print(html)
# -*- coding: utf-8 -*-
# @File : Spider.py
# @Author: Zhuozhuo.Geng
# @Date : 2018/2/12
# @Desc :
from urllib import request
import chardet
import re

if __name__ == '__main__':
    # Fetch the page and close the response deterministically.
    with request.urlopen('http://fanyi.baidu.com') as response:
        htmlPage = response.read()
    # Detect the page's character encoding from the raw bytes.
    detection = chardet.detect(htmlPage)
    # chardet reports None for undetectable input, and the original
    # fallback of -1 would make bytes.decode() raise TypeError either
    # way — fall back to utf-8 instead.
    encoding = detection.get('encoding') or 'utf-8'
    htmlText = htmlPage.decode(encoding)
    # Collect every href="..." target on the page.
    REGX = r'href="(.*?)"'
    urlSet = re.findall(REGX, htmlText, re.S)
    print(urlSet)
def chardet_dammit(s):
    """Return chardet's best-guess encoding name for byte string *s*."""
    detection = chardet.detect(s)
    return detection['encoding']
def __init__(s, text, fromFile=0, stop_words=('-')):
    """Build a word-adjacency directed graph from *text* (Python 2).

    :param s: the graph instance itself (``self``; extends nx.DiGraph).
    :param text: raw text, or a file path when ``fromFile`` is truthy.
    :param fromFile: when truthy, ``text`` names a file to read.
    :param stop_words: forwarded to TextWords.
        NOTE(review): ``('-')`` is just the string ``'-'``, not a
        1-tuple — presumably ``('-',)`` was intended; confirm callers.
    """
    nx.DiGraph.__init__(s)
    if (fromFile):
        with open(text) as f:
            text = f.read()
    # Sample the head, middle and tail of the text so chardet sees a
    # representative slice without scanning the whole document.
    # A TypeError (e.g. already-unicode input) falls back to utf-8.
    try:
        encoding = chardet.detect(text[:289] + ' ' +
                                  text[len(text) / 2 - 144:len(text) / 2 +
                                       144] + ' ' + text[-289:])['encoding']
    except (TypeError):
        encoding = 'utf-8'
        print('Error: Encoding not detected, utf-8 selected\n')
    # Tokenise the decoded text; chardet may report None, in which case
    # decode as utf-8 via the Python 2 unicode() constructor.
    if (encoding):
        ss = TextWords(text.decode(encoding), stop_words=stop_words)
    else:
        ss = TextWords(unicode(text, 'utf-8'), stop_words=stop_words)
    s.node_list = dict()
    s.node_property = dict()
    s.edge_list = dict()
    s.edge_property = dict()
    # Seed the graph with the first token; empty input raises
    # IndexError on ss[0] and the constructor bails out early.
    _S = None
    try:
        s.add_node(ss[0].lower())
        _S = ss[0].lower()
    except (IndexError):
        return None
    # NOTE(review): these four dicts are re-initialised a second time,
    # discarding the assignments above — looks redundant; confirm.
    s.node_list = dict()
    s.node_property = dict()
    s.edge_list = dict()
    s.edge_property = dict()
    # Declare the default node/edge attributes this graph tracks.
    s.add_properties('node',
                     weight=1,
                     textPositionFirst=-1,
                     textPositionLast=-1,
                     textPositionAvg=0,
                     edges_in=0,
                     edges_out=0,
                     sweight=1.0)
    s.add_properties('edge',
                     weight=1,
                     textPositionFirst=-1,
                     textPositionLast=-1,
                     textPositionAvg=0,
                     sweight=1.0,
                     is_tree_edge=False)
    s.node[_S]['weight'] = 1
    s.node[_S]['textPositionFirst'] = 1
    s.node[_S]['textPositionAvg'] = 1
    s.node[_S]['textPositionLast'] = 1
    # Walk consecutive token pairs, accumulating node/edge weights and
    # first/last/summed text positions (1-based); averages are derived
    # from the sums after the loop.
    t = 0
    for tt in ss:
        t += 1
        try:
            ss[t] = ss[t]
        except (IndexError):
            break
        s.add_node(ss[t])
        s._add_node(ss[t], 'weight', 1, 1)
        if not (s.node[ss[t]].get('textPositionFirst')):
            s.node[ss[t]]['textPositionFirst'] = t + 1
        s._add_node(ss[t], 'textPositionAvg', t + 1, t + 1)
        s.node[ss[t]]['textPositionLast'] = t + 1
        s.add_edge(ss[t - 1], ss[t])
        s._add_edge(ss[t - 1], ss[t])
        if not (s.edge[ss[t - 1]][ss[t]].get('textPositionFirst')):
            s.edge[ss[t - 1]][ss[t]]['textPositionFirst'] = t
        s._add_edge(ss[t - 1], ss[t], 'textPositionAvg', 1 + t, t + 1)
        s.edge[ss[t - 1]][ss[t]]['textPositionLast'] = t
    # Convert accumulated position sums to averages, count in-degrees,
    # and derive inverse weights.
    for v in s.node.keys():
        s.node[v]['textPositionAvg'] /= float(s.node[v]['weight'])
        s.node[v]['edges_out'] = len(s.edge[v])
        s.node[v]['edges_in'] = 0
        s.node[v]['sweight'] = 1.0 / s.node[v]['weight']
    for e in s.edge:
        for ee in s.edge[e]:
            s.edge[e][ee]['textPositionAvg'] /= float(
                s.edge[e][ee]['weight'])
            s.node[ee]['edges_in'] += 1
            # An edge counts as a "tree" edge when it points from an
            # earlier-first-seen word to a later one.
            s.edge[e][ee]['is_tree_edge'] = s.node[e][
                'textPositionFirst'] < s.node[ee]['textPositionFirst']
            s.edge[e][ee]['sweight'] = 1.0 / s.edge[e][ee]['weight']
    s.sort_edge()
    s.sort_node()
print('''---------------------chardet--------------------- ''') # 字符串编码一直是令人非常头疼的问题,尤其是我们在处理一些不规范的第三方网页的时候。 # chardet这个第三方库正好就派上了用场。用它来检测编码,简单易用。 # # *********** 安装chardet *********** # # 如果安装了Anaconda,chardet就已经可用了。否则,需要在命令行下通过pip安装: # # $ pip install chardet # 如果遇到Permission denied安装失败,请加上sudo重试。 # *********** 使用chardet *********** # 当我们拿到一个bytes时,就可以对其检测编码。用chardet检测编码,只需要一行代码: print("chardet.detect(b'Hello, world!')=\t", chardet.detect(b'Hello, world!')) # 我们来试试检测GBK编码的中文: data = '离离原上草,一岁一枯荣'.encode('gbk') print("chardet.detect(data)=\t", chardet.detect(data)) # 对UTF-8编码进行检测: data = '离离原上草,一岁一枯荣'.encode('utf-8') print("chardet.detect(data)=\t", chardet.detect(data)) # 我们再试试对日文进行检测: data = '最新の主要ニュース'.encode('euc-jp') print("chardet.detect(data)=\t", chardet.detect(data)) # 可见,用chardet检测编码,使用简单。获取到编码后,再转换为str,就可以方便后续处理。 # 使用chardet检测编码非常容易,chardet支持检测中文、日文、韩文等多种语言。
def get_encoding_type(file):
    """Return chardet's guessed encoding for the file at path *file*."""
    with open(file, 'rb') as fh:
        return detect(fh.read())['encoding']
def get_encode(filepath):
    """Guess the encoding of the file at *filepath*.

    Only the first 1024 bytes are sampled, which keeps detection fast
    but may misjudge files whose distinctive bytes appear later.

    :param filepath: path of the file to probe.
    :return: encoding name reported by chardet (may be ``None``).
    """
    # A context manager closes the handle even if read() raises; the
    # original open()/close() pair leaked the descriptor on error.
    with open(filepath, 'rb') as f:
        sample = f.read(1024)
    return chardet.detect(sample)['encoding']
raw_input(u'按回车键退出……'.encode(sys.stdin.encoding)) sys.exit() if not os.path.isfile(list_file_code): print list_file, u'文件不存在。' raw_input(u'按回车键退出……'.encode(sys.stdin.encoding)) sys.exit() if path_src_code == path_dst_code: print u'源路径和目标路径相同,请检查后重新输入。' raw_input(u'按回车键退出……'.encode(sys.stdin.encoding)) sys.exit() # print u'list文件:',list_file f_list = open(list_file_code, 'r') fcode = chardet.detect(f_list.read())['encoding'] # print 'File encoding:',fcode if fcode != 'utf-8': fcode = 'gbk' f_list.seek(0) list_all = [] for list_line in f_list: list_line = list_line.strip(' \n\r') list_line_encode = list_line.decode(fcode).encode(sys.stdin.encoding) if not list_line: continue if list_line not in os.listdir(path_src_code): print u'源路径 ', path_src, u'中没有文件', list_line_encode raw_input(u'按回车键退出……'.encode(sys.stdin.encoding)) sys.exit() if list_line not in os.listdir(path_dst_code):
# chardet demo: detect the encoding of several byte strings.
import chardet

s1 = '国破山河在,城春草木深'
s3 = '最新の主要ニュース'

# The same four detections as before, expressed as a data-driven loop:
# plain ASCII, GBK Chinese, UTF-8 Chinese, EUC-JP Japanese.
samples = [
    b'hello catface!',
    s1.encode('gbk'),
    s1.encode('utf-8'),
    s3.encode('euc-jp'),
]
for sample in samples:
    print('chardet:', chardet.detect(sample))
#!/usr/bin/env python2 # -*- encoding: utf-8 -*- import chardet import re import os for n in os.listdir('.'): encoding = chardet.detect(n)['encoding'] if re.match(r"ascii|utf", encoding): continue print '%s: %s (%s)' % (n, chardet.detect(n)['encoding'], chardet.detect(n)['confidence'])
"".join(extended_event_descriptor_multi)).strip() if not (extended_event_descriptor): extended_event_descriptor = short_event_descriptor extended_event_codepage = short_event_codepage if name_event_descriptor: try: if name_event_codepage: if name_event_codepage != 'utf-8': name_event_descriptor = name_event_descriptor.decode( name_event_codepage).encode("utf-8") else: name_event_descriptor.decode('utf-8') else: encdata = chardet.detect(name_event_descriptor) enc = encdata['encoding'].lower() confidence = str(encdata['confidence']) emcDebugOut( "[META] Detected name_event encoding-type: " + enc + " (" + confidence + ")") if enc == "utf-8": name_event_descriptor.decode(enc) else: name_event_descriptor = name_event_descriptor.decode( enc).encode('utf-8') except (UnicodeDecodeError, AttributeError), e: emcDebugOut("[META] Exception in readEitFile: " + str(e)) self.eit['name'] = name_event_descriptor
time.sleep(1) html = page.read() page.close() return html argv0_list = sys.argv[0].split("\\") script_name = argv0_list[len(argv0_list) - 1] script_name = script_name[0:-3] tmp_record = script_name + '@' + str(os.getpid()) + '.txt' try: for url in urls: content = load_html(url) #encoding = extract(str(content).lower(), 'charset=', '"') encoding = chardet.detect(content)['encoding'] #print('-'*50) #print( "Encoding type = %s" % encoding ) #print('-'*50) if encoding: # note that Python3 does not read the html code as string # but as html code bytearray, convert to string with content = content.decode(encoding, 'ignore').replace(u'\xa9', u'') else: print("Debug: Encoding type not found!") match = re.search("\'(\w{8,}-\w{4,}-\w{4,}-\w{4,}-\w{12,})\'", str(content)) if match is None and urlparse(url).path.startswith('/m/'): try: from selenium import webdriver dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
def GetEncodeString(str):
    """Best-effort re-encode a byte string to UTF-8.

    Detects the input's encoding with chardet and transcodes to UTF-8;
    on any failure the input is returned unchanged.

    NOTE(review): the parameter shadows the builtin ``str``; renaming it
    would break keyword-argument callers, so it is kept as-is.
    """
    try:
        str = str.decode(chardet.detect(str)["encoding"]).encode("utf-8")
    except Exception:
        # The original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; `Exception` keeps the best-effort
        # behaviour without hiding interpreter-level signals.
        pass
    return str
def _detect_encoding(self, file_name: str):
    """Return chardet's guessed encoding for the file at *file_name*."""
    with open(file_name, 'rb') as stream:
        raw = stream.read()
    result = chardet.detect(raw)
    return result['encoding']
def get_encoding_type(file):
    """Detect the encoding codec of the file at path *file*."""
    with open(file, 'rb') as source:
        payload = source.read()
    detection = detect(payload)
    return detection['encoding']
except: title_num = trans_int(title_s[0]) finally: if title_num >= 100: title_num = str(title_num) elif title_num >= 10: title_num = '0' + str(title_num) elif title_num >= 1: title_num = '00' + str(title_num) title = re.sub('第.*章', '第' + title_num + '章', title) article = str(soup.find(id="contents")) with open('D:/soft/test/' + title + '.htm', 'w', encoding='utf-8') as file: file.write(article) print('%s done' % title) if __name__ == '__main__': origin_url = 'https://www.aszw.org/book/35/35222/' response = requests.get(origin_url) response.encoding = chardet.detect(response.content)['encoding'] html = response.text lists = re.findall('<td.*?href="(.*?)">(.*?)</a></td>', html, re.S) contents = dict( zip([lists[i][1] for i in range(len(lists))], [lists[i][0] for i in range(len(lists))])) p = Pool(processes=5) for key in contents.keys(): p.apply_async(get_article, args=(origin_url + contents[key], )) p.close() p.join()
def clean_cetc(institution, title):
    """Normalise a CETC institute name (Python 2).

    Converts Chinese-numeral institute numbers to digits, e.g.
    u'中国电子科技集团第十四研究所' -> u'电子科技集团14所'.

    :param institution: raw institution string (byte string; its
        encoding is sniffed with chardet before decoding).
    :param title: title prefix that, when present in *institution*,
        is kept at the front of the result.
    :return: the normalised name, or the (possibly pre-processed)
        input unchanged if decoding/processing raises.
    """
    new_name = ''
    if_no_title_name = ''
    flag = False
    flag2 = False
    index = 0
    temp = ''
    no_title_name_flag = False
    # set separator ('所' marks the end of the institute designation)
    separator = '所'
    institution = institution.split(';')[0]
    if separator in institution:
        '''
        e.x.
        input:中国电子科技集团,第十四研究所,江苏,南京,210013
        output:电子科技集团14所
        '''
        # Keep only the part before '所', stripping commas and spaces.
        institution = institution.strip().split(separator)[0].replace(
            ',', '').replace(',', '').replace(' ', '').split(';')[0]
        no_title_name_flag = True
    else:
        '''
        e.x.
        input:中国电子科技集团第十四研究
        output:电子科技集团14所
        '''
        institution = institution.strip().split(" ")[0].split(",")[0].split(
            ',')[0]
    print(institution)
    # Chinese numeral -> digit map; u'十' maps to ' ' as a placeholder
    # resolved below (十四 => 14, 十 => 10, 二十 => 20).
    n = {
        u'一': '1',
        u'二': '2',
        u'三': '3',
        u'四': '4',
        u'五': '5',
        u'六': '6',
        u'七': '7',
        u'八': '8',
        u'九': '9',
        u'十': ' '
    }
    if title in institution:
        new_name += title
        flag = True
    if flag or no_title_name_flag:
        # Re-append the separator so the numeral run is terminated by a
        # non-numeral character and gets flushed inside the loop.
        institution = institution + separator
    try:
        # Decode with the sniffed encoding, then scan character by
        # character, buffering numeral runs in `temp`.
        for c in institution.decode(chardet.detect(institution)['encoding']):
            if n.has_key(c):
                temp += str(n[c])
                index += 1
                flag2 = True
            else:
                # 十四 => 14, 十 => 10, 二十 => 20
                if index != 0:
                    # A leading placeholder means '十X' (1X); a trailing
                    # one means 'X十' (X0).
                    if temp[0] == ' ':
                        temp = '1' + temp
                    if temp[-1] == ' ':
                        temp = temp + '0'
                    temp = temp.replace(' ', '')
                    new_name += temp
                    if_no_title_name += temp
                    # reset value
                    index = 0
                    temp = ''
                if_no_title_name += c
                # ASCII digits pass straight through to the titled name.
                if c <= '9' and c >= '0':
                    new_name += str(c)
                    flag2 = True
        if flag and flag2:
            return new_name + separator
        return if_no_title_name
    except Exception as e:
        # Best-effort: on any decode/processing error, return the input
        # as pre-processed so far.
        return institution
def __parseCsvFile(self, handle):
    """
    Parse a CSV file. Does not reset the file handle to start.

    @arg handle: CSV file. Must be a seekable binary file object.
    @type handle: file object

    @return: list of lists (rows of unicode cell values), or None on
        a decoding failure (an EBPARSE message is emitted).
    @rtype: list
    """
    # Sniff the encoding from the first BUFFER_SIZE bytes, then rewind.
    buf = handle.read(BUFFER_SIZE)
    result = chardet.detect(buf)
    handle.seek(0)
    # Trust chardet only when it is reasonably confident; otherwise
    # assume UTF-8.
    if result['confidence'] > 0.5:
        encoding = unicode(result['encoding'])
    else:
        encoding = 'utf-8'
    # Python 2.7 makes it extraordinarily hard to do this correctly. We
    # have a binary file object containing lines of text in a certain
    # encoding with unknown style of line-endings.
    #
    # We want to correctly decode the file contents, accept any style of
    # line-endings, parse the lines with the `csv` module, and return
    # unicode strings.
    #
    # 1. `codecs.getreader` does not have a universal newlines mode.
    # 2. `io.TextIOWrapper` cannot be wrapped around our file object,
    #    since it is required to be an `io.BufferedIOBase`, which it
    #    usually will not be.
    # 3. The `csv` module cannot read unicode.
    #
    # So, we use a stream wrapper that consumes byte strings, decodes to
    # unicode, normalises newlines, and produces the result UTF-8
    # encoded. That's what we feed the `csv` module. We decode what it
    # gives back to unicode strings.
    handle = _UniversalNewlinesByteStreamIter(handle,
                                              encoding=encoding,
                                              buffer_size=BUFFER_SIZE)
    # Read a sample through the wrapper; this is where a wrong sniffed
    # encoding typically surfaces as UnicodeDecodeError.
    try:
        buf = handle.read(BUFFER_SIZE)
    except UnicodeDecodeError:
        self.__output.addMessage(
            __file__, 3, 'EBPARSE',
            'Could not decode file (using %s encoding).' % encoding)
        return None
    # Default dialect
    dialect = 'excel'
    # The idea is that for new-style batch input files we have only
    # one column and the sniffer cannot find a delimiter; falling back
    # to the default dialect is then correct.
    try:
        # Todo: delimiters in config file
        dialect = csv.Sniffer().sniff(buf, delimiters="\t ;|,")
        dialect.skipinitialspace = True
    except csv.Error:
        pass
    handle.seek(0)
    reader = csv.reader(handle, dialect)
    ret = []
    # The wrapper hands the csv module UTF-8 bytes; decode each cell
    # back to unicode for the caller.
    try:
        for i in reader:
            ret.append([c.decode('utf-8') for c in i])
    except UnicodeDecodeError:
        self.__output.addMessage(
            __file__, 3, 'EBPARSE',
            'Could not decode file (using %s encoding).' % encoding)
        return None
    return ret
from hanziconv import HanziConv reload(sys) sys.setdefaultencoding("utf-8") countsig = 0 count1 = 0 count = 0 listnew = '/home/hongliang/Downloads/workspace/mirrorfunctions' for root, dirs, files in os.walk(listnew): for fn in files: a = os.path.join(root, fn) raw = open(a) content = raw.read() raw.close if chardet.detect(content)['encoding'] == 'utf-8': count1 += 1 print a, 'success' elif chardet.detect(content)['encoding'] == 'UTF-8-SIG': print a, 'sig success' countsig += 1 else: print a content = content.decode('gbk').encode('utf-8') content = HanziConv.toSimplified(content) raw = open(a, 'w') raw.write(content) raw.close
# use natural language toolkit import re import numpy as np from random import randint import pandas as pd import string import chardet with open('data_for_spam.csv', 'rb') as f: result = chardet.detect(f.read()) # or readline if the file is large dataset = pd.read_csv('data_for_spam.csv', encoding=result['encoding']) x = dataset.iloc[:, 0] y = dataset.iloc[:, 1] x = x.to_dict() X = [] for d in range(len(x)): b = x[d].lower() sentence = re.sub(r'\d+', '', b) sentence = re.sub('[' + string.punctuation + ']', '', sentence) X.append(sentence) from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer count_vect = CountVectorizer() a = count_vect.fit_transform(X) a.toarray() from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
def form_valid(self, form):
    """Import assets from an uploaded CSV file.

    Reads the uploaded file, sniffs its encoding with chardet, maps the
    CSV header (field verbose names) back to Asset model field names,
    then creates or updates one Asset per data row. Responds with a
    JSON summary of created/updated/failed hostnames.
    """
    node_id = self.request.GET.get("node_id")
    # Attach created assets to the requested node, or the root node.
    node = get_object_or_none(Node, id=node_id) if node_id else Node.root()
    f = form.cleaned_data['file']
    # Sniff the encoding from the full upload, then rewind to re-read.
    # NOTE(review): det_result['encoding'] can be None for
    # undetectable input, which would make decode() raise — confirm
    # uploads are always detectable or add a fallback.
    det_result = chardet.detect(f.read())
    f.seek(0)  # reset file seek index
    # Strip a leading BOM so the first header cell maps correctly.
    file_data = f.read().decode(det_result['encoding']).strip(
        codecs.BOM_UTF8.decode())
    csv_file = StringIO(file_data)
    reader = csv.reader(csv_file)
    csv_data = [row for row in reader]
    # All importable Asset fields except the auto-set creation date.
    fields = [
        field for field in Asset._meta.fields
        if field.name not in ['date_created']
    ]
    header_ = csv_data[0]
    # Header cells carry verbose names; map them back to field names.
    mapping_reverse = {field.verbose_name: field.name for field in fields}
    attr = [mapping_reverse.get(n, None) for n in header_]
    # Any unrecognised header cell invalidates the whole upload.
    if None in attr:
        data = {
            'valid': False,
            'msg': 'Must be same format as '
            'template or export file'
        }
        return self.render_json_response(data)
    created, updated, failed = [], [], []
    assets = []
    for row in csv_data[1:]:
        # Skip entirely blank rows.
        if set(row) == {''}:
            continue
        asset_dict_raw = dict(zip(attr, row))
        asset_dict = dict()
        # Coerce cell strings to the types each field expects; empty
        # values are dropped so they don't overwrite existing data.
        for k, v in asset_dict_raw.items():
            v = v.strip()
            if k == 'is_active':
                v = False if v in ['False', 0, 'false'] else True
            elif k == 'admin_user':
                v = get_object_or_none(AdminUser, name=v)
            elif k in ['port', 'cpu_count', 'cpu_cores']:
                try:
                    v = int(v)
                except ValueError:
                    v = ''
            elif k == 'domain':
                v = get_object_or_none(Domain, name=v)
            if v != '':
                asset_dict[k] = v
        # An explicit id selects an existing asset to update.
        asset = None
        asset_id = asset_dict.pop('id', None)
        if asset_id:
            asset = get_object_or_none(Asset, id=asset_id)
        if not asset:
            # Create path: reject duplicate hostnames, create the asset
            # and its node link atomically.
            try:
                if len(Asset.objects.filter(
                        hostname=asset_dict.get('hostname'))):
                    raise Exception(_('already exists'))
                with transaction.atomic():
                    asset = Asset.objects.create(**asset_dict)
                    if node:
                        asset.nodes.set([node])
                    created.append(asset_dict['hostname'])
                    assets.append(asset)
            except Exception as e:
                failed.append('%s: %s' % (asset_dict['hostname'], str(e)))
        else:
            # Update path: apply only non-empty values to the asset.
            for k, v in asset_dict.items():
                if v != '':
                    setattr(asset, k, v)
            try:
                asset.save()
                updated.append(asset_dict['hostname'])
            except Exception as e:
                failed.append('%s: %s' % (asset_dict['hostname'], str(e)))
    data = {
        'created': created,
        'created_info': 'Created {}'.format(len(created)),
        'updated': updated,
        'updated_info': 'Updated {}'.format(len(updated)),
        'failed': failed,
        'failed_info': 'Failed {}'.format(len(failed)),
        'valid': True,
        'msg': 'Created: {}. Updated: {}, Error: {}'.format(
            len(created), len(updated), len(failed))
    }
    return self.render_json_response(data)