from chardet import universaldetector as det_


def detect_file_enc(file_name):
    # Check for a UTF-8 BOM up front so the result can be reported as 'utf-8-sig'.
    utf_8_bom_size = 3
    utf_8_bom_flag = b'\xef\xbb\xbf'
    enc_name = None
    with open(file_name, 'rb') as f:
        data = f.read(utf_8_bom_size)
    if data == utf_8_bom_flag:
        enc_name = 'utf-8-sig'

    detector = det_.UniversalDetector()
    with open(file_name, 'rb') as usock:
        for line in usock:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if enc_name:
        detector.result['encoding'] = enc_name
    return detector.result
def detect(path):
    """
    Detect the encoding of file at `path` and return code or ``None``.
    Raise :exc:`IOError` if reading fails.
    """
    bom_encoding = detect_bom(path)
    if bom_encoding is not None:
        return bom_encoding

    from chardet import universaldetector

    detector = universaldetector.UniversalDetector()
    with open(path, "rb") as f:
        detector.reset()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    code = detector.result["encoding"]
    if code is None:
        return None
    try:
        # chardet returns what seem to be IANA names. They need to be
        # translated to their Python equivalents. Some of the encodings
        # returned by chardet are not supported by Python.
        return translate_code(code)
    except ValueError:
        return None
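# ``detect`` above assumes ``detect_bom`` and ``translate_code`` helpers that
# are not shown here. The sketch below is an assumption about what they might
# look like, not the original implementation: ``detect_bom`` checks the common
# BOM signatures and ``translate_code`` maps a chardet/IANA name to Python's
# codec name via the standard library.
import codecs

_BOMS = [
    (codecs.BOM_UTF8, "utf-8-sig"),
    (codecs.BOM_UTF32_LE, "utf-32-le"),   # check UTF-32 before UTF-16:
    (codecs.BOM_UTF32_BE, "utf-32-be"),   # its BOMs start with the UTF-16 ones
    (codecs.BOM_UTF16_LE, "utf-16-le"),
    (codecs.BOM_UTF16_BE, "utf-16-be"),
]


def detect_bom(path):
    # Return the encoding implied by a leading byte-order mark, or None.
    with open(path, "rb") as f:
        head = f.read(4)
    for bom, name in _BOMS:
        if head.startswith(bom):
            return name
    return None


def translate_code(code):
    # codecs.lookup() raises LookupError for names Python does not support;
    # re-raise as ValueError, which is what ``detect`` above catches.
    try:
        return codecs.lookup(code).name
    except LookupError:
        raise ValueError("unsupported encoding: %s" % code)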
def detect(aBuf):
    from chardet import universaldetector

    u = universaldetector.UniversalDetector()
    u.reset()
    u.feed(aBuf)
    u.close()
    return u.result
from chardet import universaldetector


def detect_encodings(data):
    """
    Analyze the provided data for possible character encodings.

    This simply wraps chardet and extracts all the potential encodings it
    considered before deciding on a particular result.

    :param data: An array of bytes to treat as text data
    :type data: bytes
    :return: A dictionary mapping possible encodings to confidence levels
    :rtype: dict
    """
    if not data:
        # It's an empty string so we can safely say it's ascii
        return {'ascii': 1.0}

    # We can't use ``chardet.detect`` because we want to dig in the internals
    # of the detector to bias the utf-8 result.
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(data)
    result = detector.close()

    if not result:
        return {'utf-8': 1.0}

    encodings = {result['encoding']: result['confidence']}
    # ``_mCharSetProbers`` is a private chardet 2.x attribute; newer chardet
    # releases rename it, so pin the chardet version this code targets.
    for prober in detector._mCharSetProbers:
        if prober:
            encodings[prober.get_charset_name()] = prober.get_confidence()
    return encodings
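# A small usage sketch for ``detect_encodings`` above. The sample text and the
# resulting confidence values are only illustrative; chardet's scores vary
# between versions.
if __name__ == "__main__":
    candidates = detect_encodings(u"déjà vu, こんにちは".encode("utf-8"))
    # Pick the candidate chardet is most confident about.
    best = max(candidates, key=candidates.get)
    print("best guess: %s (%.2f)" % (best, candidates[best]))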
from chardet import universaldetector


def detect_encoding(string):
    # ``string`` is expected to be a bytes object; chardet inspects raw bytes.
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(string)
    detector.close()
    if detector.result and detector.result['confidence'] >= 0.6:
        return detector.result['encoding']
    # Fall back to gb18030 when the guess is missing or low-confidence.
    return 'gb18030'
from chardet import universaldetector


def bigfilechardet(filename, block=256):
    # Feed the file to chardet in small chunks and stop as soon as the
    # detector is confident, so large files are not read entirely.
    detector = universaldetector.UniversalDetector()
    with open(filename, 'rb') as fd:
        chunk = fd.read(block)
        while chunk and not detector.done:
            detector.feed(chunk)
            chunk = fd.read(block)
    detector.close()
    return detector.result.get("encoding")
import subprocess

from chardet import universaldetector as detector_


def run_cmd_with_cmd_str_and_decode_gracefully(cmd_str, print_flag=False, capture_stdout=True):
    if capture_stdout:
        p = subprocess.Popen(cmd_str, shell=True, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    else:
        p = subprocess.Popen(cmd_str, shell=True, stderr=subprocess.STDOUT)

    if capture_stdout:
        encoding = None
        detector = detector_.UniversalDetector()
        result = bytearray()
        for line in p.stdout.readlines():
            result.extend(line)
            if not encoding:
                # Keep feeding output lines until chardet settles on an encoding.
                detector.feed(line)
                if detector.done:
                    encoding = detector.result['encoding']
        rtn_val = p.wait()
        detector.close()
        if not encoding:
            encoding = detector.result['encoding']
        if not encoding:
            # Last resort: the platform default encoding (project helper).
            encoding = system_util.get_system_encoding()
        rtn_str = ''
        if result:
            rtn_str = result.decode(encoding)
        if print_flag:
            print(rtn_str)
    else:
        rtn_val = p.wait()
        rtn_str = None
    return rtn_val, rtn_str
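# ``run_cmd_with_cmd_str_and_decode_gracefully`` depends on a project helper,
# ``system_util.get_system_encoding()``, that is not shown. A plausible
# stand-in based only on the standard library (an assumption, not the original
# code):
import locale
import sys


def get_system_encoding():
    # Prefer the interpreter's stdout encoding, then the locale's preferred
    # encoding, and finally fall back to UTF-8.
    return (getattr(sys.stdout, "encoding", None)
            or locale.getpreferredencoding(False)
            or "utf-8")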
from chardet import universaldetector


def convertAsciiToUtf8(txt):
    # Python 2 code: ``unicode`` and ``str.decode`` are only available there.
    if not isinstance(txt, unicode):
        try:
            txt = txt.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8; let chardet guess the encoding.
            detector = universaldetector.UniversalDetector()
            detector.feed(txt)
            detector.close()
            result = detector.result
            encoding = result['encoding']
            if encoding:
                try:
                    txt = txt.decode(encoding)
                except UnicodeDecodeError:
                    txt = splitTextToConvertToUtf8(txt, encoding)
            else:
                txt = splitTextToConvertToUtf8(txt, 'utf-8')
    return txt