def get_file_encoding(file_name):
    """Detect the text encoding of *file_name* using chardet.

    Feeds up to ~500 lines into a UniversalDetector.  When chardet reports
    GB2312, the file is re-read to confirm the narrow codec actually decodes
    it, escalating to the GBK / GB18030 supersets otherwise — chardet often
    labels GBK/GB18030 content as GB2312.

    :param str file_name: path of the file to probe
    :returns: encoding name, or "" when the path is not a regular file or
        chardet cannot name an encoding
    """
    if not os.path.isfile(file_name):
        return ""
    u = UniversalDetector()
    with open(file_name, "rb") as f:
        for index, line in enumerate(f):
            u.feed(line)
            if index > 500:
                break
    u.close()
    detected = u.result["encoding"]
    # chardet can return None (empty/binary file); the original crashed
    # on .lower() here.
    if not detected:
        return ""
    if detected.lower() == "gb2312":
        # Try the narrow codec first, then its superset.  The original
        # always overwrote a successful gb2312 result with the gbk retry
        # and leaked every codecs.open() handle.
        for candidate in ("gb2312", "gbk"):
            try:
                with codecs.open(file_name, encoding=candidate) as _file:
                    _file.readlines()
                return candidate
            except Exception as e:
                print(e)
        return "gb18030"
    return detected
def repo_cat_file(self, repo_path, commit_hash, path):
    """Return the (UTF-8 encoded) contents of *path* at *commit_hash*.

    Known binary extensions short-circuit with a placeholder string, and a
    per-commit stage-file cache is consulted before shelling out to git.
    Output is capped at 512 KiB by the piped `head -c`.
    """
    (commit_hash, path) = self._all_to_utf8(commit_hash, path)
    if not self._path_check_chdir(repo_path, commit_hash, path):
        return ''
    path = self._get_quote_path(path)
    if path.startswith('./'):
        path = path[2:]
    file_type = path.split('.')[-1]
    if file_type in BINARY_FILE_TYPE:
        return u'二进制文件'
    stage_file = self._get_stage_file(repo_path, commit_hash, path)
    result = self._read_load_stage_file(stage_file)
    if result is not None:
        return result['blob']
    command = '/usr/bin/git show %s:%s | /usr/bin/head -c 524288' % (commit_hash, path)
    try:
        # Restore default SIGPIPE handling so `head` closing the pipe kills
        # `git show` quietly instead of surfacing a Python-level error.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
        result = check_output(command, shell=True)
        ud = UniversalDetector()
        ud.feed(result)
        ud.close()
        if ud.result['encoding']:
            encoding = ud.result['encoding']
            # BUGFIX: the original tested `!= 'utf-8' or != 'utf8'`, which is
            # always true, so genuine UTF-8 content was pointlessly re-coded.
            if encoding not in ('utf-8', 'utf8'):
                result = result.decode(encoding).encode('utf-8')
        self._dumps_write_stage_file({'blob': result}, stage_file)
        return result
    except Exception as e:
        logger.exception(e)
def decode_data(data, encoding_guess, can_be_binary=True):
    """Given string data, return an (is_text, data) tuple, where data is
    returned as unicode if we think it's text and were able to determine an
    encoding for it.

    If can_be_binary is False, then skip the initial is_binary check.
    """
    if can_be_binary and is_binary_string(data[:1024]):
        return False, data
    try:
        # First shot: the caller-supplied default encoding.
        return True, data.decode(encoding_guess)
    except UnicodeDecodeError:
        pass
    # Fall back to chardet - chardet is really slow, which is why we
    # don't just do chardet from the start.
    detector = UniversalDetector()
    for chunk in ichunks(80, data):
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    guessed = detector.result['encoding']
    if guessed:
        try:
            return True, data.decode(guessed)
        except (UnicodeDecodeError, LookupError):
            # Either we couldn't decode or chardet gave us an encoding
            # that python doesn't recognize (yes, it can do that).
            pass
    # Leave data as str.
    return False, data
def safe_open(path, mode='r'):
    '''
    Retrieves a file's encoding and returns the opened file. If the opened
    file begins with a BOM, it is read before the file object is returned.
    This allows callers to not have to handle BOMs of files.

    :param str path: file path to open
    :param str mode: the mode to open the file (see :func:`open`)
    :returns file: the opened file object
    '''
    detector = UniversalDetector()
    with open(path, 'rb') as raw:
        head = raw.read(0x1000)
        chunk = head
        while chunk and not detector.done:
            detector.feed(chunk)
            if detector.done:
                break
            chunk = raw.read(0x1000)
    detector.close()
    # Empty file: nothing to detect, hand back a plain file object.
    if not head:
        return open(path, mode)
    handle = codecs.open(path, mode, encoding=detector.result['encoding'])
    known_boms = (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8,
                  codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)
    # Skip over a leading BOM so callers never see it.
    for bom in known_boms:
        if head.startswith(bom):
            handle.seek(len(bom))
            break
    return handle
def get_csv_reader(filename, charset=None):
    """Open *filename* and return a UnicodeReader over its rows.

    When *charset* is not given the encoding is detected with chardet; the
    csv dialect is sniffed from the first KiB in all cases.

    :param str filename: path of the CSV file
    :param charset: optional known encoding; skips detection when set
    :returns: a UnicodeReader yielding decoded rows
    """
    logger.info("Reading CSV file %s", filename)
    myfile = open(filename, "rb")
    try:
        if not charset:
            # Detect encoding line by line; stop as soon as chardet is sure.
            # (Plain iteration replaces the py2-only xreadlines().)
            detector = UniversalDetector()
            for line in myfile:
                detector.feed(line)
                if detector.result["confidence"] > 0.01:
                    logger.debug("Result so far: %s", detector.result)
                if detector.done:
                    break
            detector.close()
            charset = detector.result["encoding"]
            logger.info("Found encoding %s", charset)
            # Reset the file index
            myfile.seek(0)
        # Attempt to detect the dialect
        encodedfile = codecs.EncodedFile(myfile, charset)
        dialect = csv.Sniffer().sniff(encodedfile.read(1024))
        logger.info("Found dialect %s", dialect)
        # Reset the file index
        myfile.seek(0)
    except Exception:
        # Don't leak the handle when detection or sniffing blows up
        # (the original never closed it on failure).
        myfile.close()
        raise
    return UnicodeReader(myfile, dialect=dialect, encoding=charset)
def detect_encoding(bytes, encoding=None):
    """Detect encoding of a byte stream."""
    # To reduce tabulator import time
    from chardet.universaldetector import UniversalDetector
    if encoding is not None:
        if encoding.lower() == 'utf-8':
            # Promote to utf-8-sig when a BOM is present.
            prefix = bytes.read(len(codecs.BOM_UTF8))
            if prefix == codecs.BOM_UTF8:
                encoding = 'utf-8-sig'
            bytes.seek(0)
        return encoding
    detector = UniversalDetector()
    remaining = config.ENCODING_DETECTION_MAX_LINES
    while remaining > 0:
        detector.feed(bytes.readline())
        if detector.done:
            break
        remaining -= 1
    detector.close()
    bytes.seek(0)
    confidence = detector.result['confidence']
    guess = detector.result['encoding']
    # Do not use if not confident
    if confidence < config.ENCODING_DETECTION_MIN_CONFIDENCE:
        guess = config.DEFAULT_ENCODING
    # Default to utf-8 for safety
    if guess == 'ascii':
        guess = config.DEFAULT_ENCODING
    return guess
def decode(string):
    """Detect *string*'s encoding with chardet and return it decoded.

    Falls back to UTF-8 when chardet cannot name an encoding — the original
    passed None straight to str.decode, raising a confusing TypeError.
    """
    u = UniversalDetector()
    u.feed(string)
    u.close()
    # chardet returns None for undetectable input (e.g. empty bytes).
    encoding = u.result['encoding'] or 'utf-8'
    return string.decode(encoding)
def detect(view, file_name, encoding):
    """Kick off background charset detection for *file_name*.

    A cached or BOM-derived encoding short-circuits detection; otherwise
    the file is fed to chardet line by line (CRs stripped) up to the
    configured line budget, then reported via check_encoding().
    """
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(' with BOM'):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    budget = SETTINGS['max_detect_lines']
    with open(file_name, 'rb') as stream:
        for raw_line in stream:
            # cut MS-Windows CR code
            detector.feed(raw_line.replace(b'\r', b''))
            budget -= 1
            if detector.done or budget == 0:
                break
    detector.close()
    encoding = detector.result['encoding']
    if encoding:
        encoding = encoding.upper()
    confidence = detector.result['confidence']
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def listTextBasedFiles(file):
    """Print name, MIME type and line count for *file* when it is text-based.

    MIME detection uses python-magic; the character encoding is guessed with
    chardet so line counting runs on correctly decoded text.
    """
    try:
        # Detect MIME type for file
        # https://github.com/kaito834/myNotes/blob/master/snippets/python/magic_from_file.py
        # https://github.com/ahupp/python-magic#usage
        f_mimetype = magic.from_file(file, mime=True)
    except Exception as e:
        print("[!] Exception: {0} ({1})".format(e, type(e)))
        # BUGFIX: without this return, the f_mimetype use below raised
        # NameError whenever magic failed.
        return
    # Open and count lines if MIME type of the file is text/*
    if f_mimetype.split('/')[0] == 'text':
        # Detect encoding by chardet.universaldetector.UniversalDetector()
        # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage
        detector = UniversalDetector()
        with open(file, 'rb') as f:
            for line in f:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        with open(file, "r", encoding=detector.result['encoding']) as f:
            line_count = sum(1 for _ in f)
        print("{0}: {1}, {2}, {3} lines".format(file, f_mimetype, detector.result['encoding'], line_count))
    else:
        # BUGFIX: "txet" typo in the user-facing message.
        print("{0}: NOT text based file (reason: MIME type isn't text/*: {1})".format(file, f_mimetype))
def get_unicode_content(file_path, encoding=None):
    """
    Return a unicode string of the files contents using the given encoding.
    If no encoding is given then chardet will be used to determine the
    encoding.
    Note that this uses the chardet library and may cause problems, if an
    error is thrown then a utf-8 encoding is assumed and unrecognize
    caracters are discarded.

    :param file_path: path of the file to read
    :param encoding: known encoding name, or None to auto-detect
    :returns: decoded file contents
    """
    # Imported here rather than at module level, presumably to keep the
    # expensive chardet import off the common path.
    from chardet.universaldetector import UniversalDetector
    try:
        if not encoding:
            detector = UniversalDetector()
            contents = ''
            with open(file_path, 'rb') as f:
                # Whole file is fed in one shot; no incremental early-out.
                contents = f.read()
                detector.feed(contents)
                detector.close()
                determined_encoding = detector.result['encoding']
            # NOTE(review): decode(encoding=...) as a keyword works on
            # Python 3 `bytes` but not on Python 2 `str`; this file uses
            # `unicode` below, so the target interpreter is unclear — verify.
            # Also, determined_encoding may be None for undetectable input.
            return contents.decode(encoding=determined_encoding)
        else:
            with open(file_path, 'r') as f:
                # Python 2 `unicode`; undecodable bytes are dropped.
                return unicode(f.read(), encoding=encoding, errors='ignore')
    except UnicodeError:
        # Last resort: assume UTF-8 and discard anything unrecognized.
        with open(file_path, 'r') as f:
            return unicode(f.read(), encoding='utf-8', errors='ignore')
def detect(view, file_name, encoding):
    """Detect *file_name*'s charset in the background and report it.

    Cached encodings are reused; detected BIG5/GB2312 results are widened
    to their supersets (BIG5-HKSCS / GBK) before being handed to
    check_encoding().
    """
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(" with BOM"):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status("origin_encoding", "Detecting encoding, please wait..."), 0)
    detector = UniversalDetector()
    budget = SETTINGS["max_detect_lines"]
    with open(file_name, "rb") as stream:
        for raw_line in stream:
            # cut MS-Windows CR code
            detector.feed(raw_line.replace(b"\r", b""))
            budget -= 1
            if detector.done or budget == 0:
                break
    detector.close()
    encoding = detector.result["encoding"]
    if encoding:
        encoding = encoding.upper()
        # Widen to supersets so more files decode successfully.
        if encoding == "BIG5":
            encoding = "BIG5-HKSCS"
        elif encoding == "GB2312":
            encoding = "GBK"
    confidence = detector.result["confidence"]
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def detect_encoding(file_path):
    """Feed the whole of *file_path* to chardet and return its guess."""
    detector = UniversalDetector()
    with open(file_path, 'rb') as stream:
        # Note: deliberately no early-out on detector.done, matching the
        # original — the entire file is always scanned.
        for chunk in stream:
            detector.feed(chunk)
    detector.close()
    return detector.result['encoding']
def detectEncoding(self, parseMeta=True, chardet=True):
    """Determine this stream's character encoding.

    Resolution order: byte-order mark, then <meta> declarations (when
    *parseMeta*), then a chardet guess (when *chardet* and the library is
    importable), finally the configured default.

    :returns: (encoding, confidence) where confidence is "certain" only
        for a BOM match and "tentative" otherwise
    """
    # First look for a BOM
    # This will also read past the BOM if present
    encoding = self.detectBOM()
    confidence = "certain"
    # If there is no BOM need to look for meta elements with encoding
    # information
    if encoding is None and parseMeta:
        encoding = self.detectEncodingMeta()
        confidence = "tentative"
    # Guess with chardet, if available
    if encoding is None and chardet:
        confidence = "tentative"
        try:
            from chardet.universaldetector import UniversalDetector
            buffers = []
            detector = UniversalDetector()
            # Read fixed-size chunks until chardet is satisfied or EOF.
            while not detector.done:
                buffer = self.rawStream.read(self.numBytesChardet)
                assert isinstance(buffer, bytes)
                if not buffer:
                    break
                buffers.append(buffer)
                detector.feed(buffer)
            detector.close()
            encoding = lookupEncoding(detector.result['encoding'])
            # Rewind so parsing starts from the top of the stream again.
            self.rawStream.seek(0)
        except ImportError:
            # chardet not installed; fall through to the default.
            pass
    # If all else fails use the default encoding
    if encoding is None:
        confidence = "tentative"
        encoding = lookupEncoding(self.defaultEncoding)
    return encoding, confidence
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    detector = UniversalDetector()
    for raw in lines:
        detector.feed(bytearray(raw))
        # shortcut out of the loop to save reading further - particularly
        # useful if we read a BOM.
        if detector.done:
            break
    detector.close()
    outcome = detector.result
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if not outcome['encoding']:
        return '{0}: no result'.format(name)
    return '{0}: {1} with confidence {2}'.format(name, outcome['encoding'], outcome['confidence'])
def get_coding(text, force_chardet=False):
    """
    Function to get the coding of a text.

    @param text text to inspect (string)
    @return coding string
    """
    if not force_chardet:
        # Look for a PEP 263-style coding declaration in the first two lines.
        for line in text.splitlines()[:2]:
            try:
                match = CODING_RE.search(to_text_string(line))
            except UnicodeDecodeError:
                # This could fail because to_text_string assume the text
                # is utf8-like and we don't know the encoding to give
                # it to to_text_string
                pass
            else:
                if match:
                    codec = match.group(1)
                    # sometimes we find a false encoding that can
                    # result in errors
                    if codec in CODECS:
                        return codec
    # Fallback using chardet
    if is_binary_string(text):
        detector = UniversalDetector()
        for line in text.splitlines()[:2]:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']
    return None
def validate_csv(document):
    """Return dialect information about given csv file."""
    with open(document.document.uri, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        try:
            # Re-parse every row strictly; any csv.Error marks the file bad.
            for record in csv.reader(csvfile, dialect):
                detector.feed(
                    dialect.delimiter.join(record).encode('utf-8'))
                is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
        finally:
            detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': detector.result['encoding'],
        'is_valid': is_valid
    }
def detect(view, file_name):
    """Detect *file_name*'s charset and initialise encoding vars for *view*.

    Guesses below 0.7 confidence are rejected (the user is asked to pick an
    encoding manually); BIG5/GB2312 guesses are widened to their supersets.
    """
    if not os.path.exists(file_name):
        return
    encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    cnt = SETTINGS['max_detect_lines']
    # open() replaces the py2-only file() builtin, removed in Python 3.
    fp = open(file_name, 'rb')
    for line in fp:
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result['encoding']
    confidence = detector.result['confidence']
    if not encoding or confidence < 0.7:
        sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Encoding can not be detected, please choose one manually. (%s/%.2f)' % (encoding, confidence)), 0)
        return
    encoding = encoding.upper()
    # Widen to supersets so more files decode successfully.
    if encoding == 'BIG5':
        encoding = 'BIG5-HKSCS'
    elif encoding == 'GB2312':
        encoding = 'GBK'
    sublime.set_timeout(lambda: init_encoding_vars(view, encoding), 0)
def validate_csv(f):
    """Return dialect information about given csv file."""
    with open(f.fullpath, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        try:
            # Strictly re-parse every row; any csv.Error marks the file bad.
            for record in csv.reader(csvfile, dialect):
                detector.feed(dialect.delimiter.join(record))
                is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
        finally:
            detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': detector.result['encoding'],
        'is_valid': is_valid
    }
def process_buffer(buf, d):
    """Parse one buffered record into d[url] = (header, html).

    buf[0] is the header line (URL in its second field); the body starts
    after the second blank line.  Decoding tries UTF-8 first, then a
    chardet guess, then UTF-8 with errors ignored.
    """
    if not buf:
        return
    header = buf[0]
    url = header.split()[1]
    # Advance past two blank lines to locate the body.
    blank_seen = 0
    cursor = 0
    while blank_seen < 2:
        cursor += 1
        if not buf[cursor].strip():
            blank_seen += 1
    body_lines = buf[cursor + 1:]
    rawhtml = "".join(body_lines)
    try:
        html = rawhtml.decode("utf-8")
    except:
        try:
            detector = UniversalDetector()
            for line in body_lines:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            html = rawhtml.decode(detector.result["encoding"])
        except:
            html = rawhtml.decode("utf-8", errors='ignore')
    assert html is not None, "Error processing %s\n" % rawhtml
    html = html.replace(r"\r", "")
    d[url] = (header, html)
def decode(filename, data):
    """Decode playlist *data* according to the file type in *filename*.

    Returns {'data': ..., 'encoding': ...}.  .m3u8 is always UTF-8;
    .m3u/.pls try ISO-8859-2 then a chardet guess; .xml/.xspf are labelled
    UTF-8 but left undecoded (matching the original behaviour).  Note that
    an unmatched filename still raises UnboundLocalError on 'encoding',
    as before.
    """
    if '.m3u8' in filename:
        encoding = 'utf-8'
        data = data.decode(encoding)
    elif '.m3u' in filename or '.pls' in filename:
        try:
            encoding = 'ISO-8859-2'
            data = data.decode(encoding)
        except:
            if chardet:
                u = UniversalDetector()
                u.feed(data)
                u.close()
                if u.result['confidence'] > 0.5:
                    try:
                        # BUGFIX: the original read `result['encoding']`,
                        # a NameError — the detector's dict is u.result.
                        encoding = u.result['encoding']
                        data = data.decode(encoding)
                    except:
                        encoding = 'ascii'
                else:
                    encoding = 'ascii'
            else:
                encoding = 'ascii'
    elif '.xml' in filename or '.xspf' in filename:
        encoding = 'utf-8'
    return {'data': data, 'encoding': encoding}
def transferToEncoding(filename, toCode):
    """Convert *filename* in place to the *toCode* character encoding.

    The current encoding is guessed with chardet (normalised through gb());
    directories and empty files are rejected with False.  Python 2 only
    (print statements, `unicode`, old except syntax).
    """
    if os.path.isdir(filename):
        print "error:not file"
        return False
    try:
        detector = UniversalDetector()
        f = open(filename, 'r')
        ls = f.readlines()
        f.close()
        # Empty files cannot be detected, so bail out with a hint.
        if len(ls) == 0:
            print printRed(filename), printRed(' is blank file, can not detect encoding')
            return False;
        # Detect the encoding.
        for l in ls:
            detector.feed(l)
            if detector.done:
                break
        detector.close()
        encode = gb(detector.result['encoding'])
        # Rewrite only when the file is not already in the target encoding.
        if encode.lower() != toCode.lower():
            f = open(filename, 'w')
            print printGreen(filename) + ' ====> ' + toCode + ' SUCCESS'
            for l in ls:
                f.write(unicode(l, encode).encode(toCode))
            f.close()
        else:
            pass
    except Exception, e:
        # Best-effort: report and swallow any conversion failure.
        traceback.print_exc()
        print 'exception'
def deserialize(file_bytes):
    """Decode *file_bytes* and return a csv.reader over its lines.

    UTF-8 is tried first; otherwise chardet must guess with confidence
    >= 0.5.  The delimiter must be either tab or semicolon; anything else
    raises ValueError.
    """
    try:
        file_string = file_bytes.decode('utf-8')
    except UnicodeDecodeError as ude:
        detector = UniversalDetector()
        for chunk in BytesIO(file_bytes):
            detector.feed(chunk)
            if detector.done:
                break
        detector.close()
        if detector.result['confidence'] < 0.5:
            raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files.")
        try:
            file_string = file_bytes.decode(detector.result['encoding'])
        except UnicodeDecodeError:
            raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files. "
                             "(The invalid character is '{char:#x}' at {pos})".format(pos=ude.start, char=file_bytes[ude.start]))
    csv_lines = file_string.splitlines()
    head = csv_lines[:1]
    # Probe the first line with both candidate delimiters.
    tab_cells = next(csv.reader(head, delimiter="\t"))
    semi_cells = next(csv.reader(head, delimiter=";"))
    if len(tab_cells) > 1:
        return csv.reader(csv_lines, delimiter="\t")
    if len(semi_cells) > 1:
        return csv.reader(csv_lines, delimiter=";")
    raise ValueError("Csv file is not delimited by ';' or 'tab'")
def detect_encoding(file):
    """Return chardet's guess for the encoding of the file at path *file*.

    Uses a context manager (the original leaked the handle returned by the
    bare open()) and streams lines instead of materialising them all with
    readlines().
    """
    detector = UniversalDetector()
    with open(file, 'rb') as stream:
        for line in stream:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def _guessEncoding(self, path):
    """Opens a file from the given `path` and checks the file encoding.

    The file must exists on the file system and end with the extension
    `.csv`. The file is read line by line until the encoding could be
    guessed.
    On a successfull identification, the widgets of this dialog will be
    updated.

    Args:
        path (string): Path to a csv file on the file system.
    """
    if os.path.exists(path) and path.lower().endswith('csv'):
        encodingDetector = UniversalDetector()
        # NOTE(review): opened in text mode, so detector.feed() receives
        # str objects — chardet on Python 3 expects bytes; this looks like
        # Python 2 era code. Verify target interpreter.
        with open(path, 'r') as fp:
            for line in fp:
                encodingDetector.feed(line)
                if encodingDetector.done:
                    break
        encodingDetector.close()
        result = encodingDetector.result['encoding']
        # Normalise e.g. "utf-8" -> "utf_8" for the lookup below.
        result = result.replace('-','_')
        self._encodingKey = _calculateEncodingKey(result)
        if self._encodingKey:
            # Reflect the detected encoding in the combo box selection.
            index = self._encodingComboBox.findText(result.upper())
            self._encodingComboBox.setCurrentIndex(index)
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files in
    the specified directory using gutenberg.strip_headers module and ensure
    proper file encodings.

    :param directory: <String> A string containing the full path to directory
        containing files to strip
    :return:
    """
    for entry in os.listdir(directory):
        target = os.path.join(directory, entry)
        if not os.path.isfile(target):
            continue
        # Detect file encoding, takes time to run
        detector = UniversalDetector()
        with open(target, 'rb') as handle:
            for raw_line in handle:
                detector.feed(raw_line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        # Open file, strip headers, and save result
        with open(target, 'r', encoding=encoding) as handle:
            stripped = strip_headers(handle.read()).strip()
        os.remove(target)
        with open(target, 'w+', encoding=encoding) as out:
            out.write(stripped)
def detect_local_charset(filepath): global VERBOSE # Open to read in binary. fp = open(filepath, "rb") detector = UniversalDetector() if VERBOSE: print "Reading file to detect encoding..." for line in fp: line = line.replace(b'\r',b'') detector.feed(line) if detector.done: break fp.close() detector.close() if VERBOSE: print "Encoding: %s" % detector.result["encoding"] print "Confidence: {0:.0f}% ".format(detector.result["confidence"]*100) if detector.result["confidence"] > 0.75: encoding = detector.result["encoding"] return encoding.replace('-','_').lower() # Format for codecs else: return None
def detect_encoding(f, verbose=False):
    """Detects a file's encoding.

    Args:
        f (obj): The file like object to detect.
        verbose (Optional[bool]): Print the raw result (default: False).

    Returns:
        dict: The encoding result

    Examples:
        >>> filepath = p.join(DATA_DIR, 'test.csv')
        >>> with open(filepath, 'rb') as f:
        ...     result = detect_encoding(f)
        ...     result == {'confidence': 0.99, 'encoding': 'utf-8'}
        True
    """
    # Remember where the caller was so the stream can be restored.
    start = f.tell()
    detector = UniversalDetector()
    for chunk in f:
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    f.seek(start)
    if verbose:
        print('result', detector.result)
    return detector.result
def detect(view, file_name, cnt):
    """Detect the charset of *file_name*, reading at most *cnt* lines,
    then hand the result to check_encoding()."""
    #traceback.print_stack()
    print("detect...")
    if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        return
    encoding = encoding_cache.pop(file_name)
    if encoding:
        print("it is already at cache encoding_cache.json:",encoding)
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    with open(file_name, 'rb') as fp:
        for raw_line in fp:
            # cut MS-Windows CR code
            detector.feed(raw_line.replace(b'\r', b''))
            cnt -= 1
            if detector.done or cnt == 0:
                break
    detector.close()
    encoding = detector.result['encoding']
    print(encoding)
    if encoding:
        encoding = encoding.upper()
    confidence = detector.result['confidence']
    print(confidence)
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def guessWithChardet(content):
    """Feed *content* chunk by chunk into chardet and return the raw
    result dict (keys include 'encoding' and 'confidence')."""
    detector = UniversalDetector()
    for chunk in content:
        detector.feed(chunk)
    detector.close()
    return detector.result
def getEncoding(filename):
    """Return chardet's encoding guess for the whole of *filename*.

    Opens in binary mode — the original used text mode, which hands chardet
    str objects on Python 3 — and uses a context manager so the handle is
    closed even if feeding raises.
    """
    detector = UniversalDetector()
    with open(filename, 'rb') as fp:
        detector.feed(fp.read())
    detector.close()
    return detector.result["encoding"]
def detect_encoding(filename): """ Detect encoding of `filename`, which can be a ``str`` filename, a ``file``-like object, or ``bytes``. """ # Try with Unix file utility first because it's faster (~10ms vs 100ms) if isinstance(filename, str) and not filename.endswith(Compression.all): try: with subprocess.Popen(('file', '--brief', '--mime-encoding', filename), stdout=subprocess.PIPE) as process: process.wait() if process.returncode == 0: encoding = process.stdout.read().strip() # file only supports these encodings; for others it says # unknown-8bit or binary. So we give chardet a chance to do # better if encoding in (b'utf-8', b'us-ascii', b'iso-8859-1', b'utf-7', b'utf-16le', b'utf-16be', b'ebcdic'): return encoding.decode('us-ascii') except OSError: pass # windoze # file not available or unable to guess the encoding, have chardet do it detector = UniversalDetector() # We examine only first N 4kB blocks of file because chardet is really slow MAX_BYTES = 4*1024*12 def _from_file(f): detector.feed(f.read(MAX_BYTES)) detector.close() return (detector.result.get('encoding') if detector.result.get('confidence', 0) >= .85 else 'utf-8') if isinstance(filename, str): with open_compressed(filename, 'rb') as f: return _from_file(f) elif isinstance(filename, bytes): detector.feed(filename[:MAX_BYTES]) detector.close() return detector.result.get('encoding') elif hasattr(filename, 'encoding'): return filename.encoding else: # assume file-like object that you can iter through return _from_file(filename)
def detect_data(self, req, csvfile, csvsep=',', csvdel='"', csvcode='utf-8', jsonp='callback'):
    """Parse an uploaded CSV file, auto-detecting its encoding on request.

    With csvcode='auto' the encoding is guessed with chardet, GB-family
    names are normalised, and any UTF BOM is stripped by rewriting the
    upload buffer in place.  On a csv parse error a JSONP error payload is
    returned.  Python 2 only (old except syntax).
    """
    #detect encoding
    if csvcode == 'auto':
        u = UniversalDetector()
        for line in csvfile:
            u.feed(line)
        u.close()
        csvcode = u.result['encoding'].lower()
        csvfile.seek(0)
        # gb2312 gbk hz-gb-2312 hz-gb
        if csvcode == 'gb2312':
            csvcode = 'gbk'
        if 'hz' in csvcode:
            csvcode = 'hz'
    #remove bom
    if 'utf' in csvcode:
        if 'utf-8' in csvcode:
            # utf-8-sig decoding drops a leading BOM if present.
            contents = csvfile.read().decode('utf-8-sig').encode('utf-8')
            csvcode = 'utf-8'
        #FIXME not support utf-16
        if 'utf-16' in csvcode:
            contents = csvfile.read().decode(csvcode).encode('utf-16')
            csvcode = 'utf-16'
        #FIXME not support utf-32
        if 'utf-32' in csvcode:
            contents = csvfile.read().decode('utf-32be').encode('utf-32')
            csvcode = 'utf-32'
        # Rewrite the upload buffer with the BOM-normalised bytes.
        csvfile.truncate(0)
        csvfile.write(contents)
        csvfile.seek(0)
    try:
        data = list(csv.reader(
            csvfile, quotechar=str(csvdel), delimiter=str(csvsep)))
    except csv.Error, e:
        csvfile.seek(0)
        return '<script>window.top.%s(%s);</script>' % (
            jsonp, simplejson.dumps({'error': {
                'message': 'Error parsing CSV file: %s' % e,
                # decodes each byte to a unicode character, which may or
                # may not be printable, but decoding will succeed.
                # Otherwise simplejson will try to decode the `str` using
                # utf-8, which is very likely to blow up on characters out
                # of the ascii range (in range [128, 256))
                'preview': csvfile.read(200).decode('iso-8859-1')}}))
def my_open_source_file(path):
    # copied from C:\ProgramData\CAST\CAST\Extensions\com.castsoftware.sqlscript.1.2.0-alpha1\analyser.py
    """
    Uses chardet to autodetect encoding and open the file in the correct encoding.
    """
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    with open(path, 'rb') as probe:
        for raw_line in probe:
            detector.feed(raw_line)
            if detector.done:
                break
    detector.close()
    # Reopen in text mode using the detected encoding.
    return open(path, 'r', encoding=detector.result['encoding'])
def find_response_encoding(response):
    """Return the charset declared inside the response body when present,
    otherwise fall back to a chardet estimate of the raw bytes."""
    raw = response.body
    declared = html_body_declared_encoding(raw)
    if declared:
        return declared
    detector = UniversalDetector()
    for chunk in cStringIO.StringIO(raw):
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']
def guess_encoding(file_bytes: 'IO[bytes]') -> str:
    """Guesses the encoding as a string using the Universal Encoding Detector
    library incrementally calling its feed method repeatedly with each block of text

    Args:
        file_bytes: a binary file-like object.  Despite the name, this must
            support ``.readlines()`` — the original ``bytes`` annotation was
            wrong, since bytes values have no such method.

    Returns:
        str: Type of the encoding
    """
    detector = UniversalDetector()
    for line in file_bytes.readlines():
        detector.feed(line)
        # Stop early once chardet is confident.
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']
def open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True):
    """Drop-in io.open replacement that auto-detects the file's encoding.

    NOTE: shadows the builtin open(); the *encoding* argument is accepted
    for signature compatibility but is always overridden by the detected
    encoding (matching the original behaviour).
    """
    detector = UniversalDetector()
    # Context manager guarantees the probe handle is closed even if feeding
    # raises (the original leaked it on error).
    with io.open(file, mode='rb') as rawdata:
        for line in rawdata:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return io.open(file, mode=mode, buffering=buffering,
                   encoding=detector.result["encoding"],
                   errors=errors, newline=newline, closefd=closefd)
class Encoding(object):
    """Reusable wrapper around a single chardet UniversalDetector."""

    def __init__(self):
        self.detector = UniversalDetector()

    def detect(self, data, safe=False):
        """Detect *data*'s encoding; with safe=True return None instead of
        propagating any detection failure."""
        try:
            return self._detect(data)
        except:
            if safe:
                return None
            raise

    def _detect(self, data):
        # Reset so consecutive calls don't mix state from earlier inputs.
        self.detector.reset()
        self.detector.feed(data)
        self.detector.close()
        return self.detector.result
def id_encodefile(file__):
    """Find encoding of file using chardet.

    :param file__: path of the file to probe
    :returns: chardet's encoding name (may be None)
    """
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    # 'rb' fixes the original's text-mode open (chardet needs bytes on
    # Python 3); the context manager fixes the leaked handle.
    with open(file__, 'rb') as fp:
        for l in fp:
            detector.feed(l)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def read_file(archive, name):
    """Yield rows (as dicts) from CSV member *name* inside *archive*.

    The member is scanned once to guess its encoding, rewound, then parsed
    with csv.DictReader.  A missing member yields nothing.
    """
    try:
        with archive.open(name) as member:
            detector = UniversalDetector()
            for raw_line in member:
                detector.feed(raw_line)
                if detector.done:
                    break
            detector.close()
            member.seek(0)
            wrapper = io.TextIOWrapper(
                member, encoding=detector.result['encoding'])
            with wrapper as text_stream:
                for row in csv.DictReader(text_stream):
                    yield row
    except KeyError:
        # file doesn't exist
        return
def detect(self, begin_line, end_line):
    """Detect the encoding of the view region spanned by the given line
    numbers and, when confident, convert that region to UTF-8."""
    begin_line = int(begin_line)
    end_line = int(end_line)
    begin_point = self.view.text_point(begin_line + 1, 0)
    end_point = self.view.text_point(end_line, 0) - 1
    region = sublime.Region(begin_point, end_point)
    content = self.get_text(region)
    if not content:
        return
    detector = UniversalDetector()
    detector.feed(content)
    detector.close()
    encoding = detector.result['encoding']
    confidence = detector.result['confidence']
    # BUGFIX: chardet can report None; the original called .upper() on it
    # unconditionally and crashed with AttributeError.
    if not encoding:
        return
    encoding = encoding.upper()
    if confidence < SETTINGS['confidence'] or encoding in SKIP_ENCODINGS:
        return
    self.view.run_command('convert_text_to_utf8', {'begin_line': begin_line, 'end_line': end_line, 'encoding': encoding})
def get_file_encode(file_name: str) -> str:
    """Return the detected encoding of *file_name*.

    Exits the process with a message when the file is missing or csv
    raises an error (behaviour inherited from the original).
    """
    try:
        detector = UniversalDetector()
        with open(file_name, mode='rb') as stream:
            for raw in stream:
                detector.feed(raw)
                if detector.done:
                    break
        detector.close()
        return detector.result['encoding']
    except FileNotFoundError as e:
        print("%s file is not found!" % (file_name))
        sys.exit()
    except csv.Error as e:
        print(e)
        sys.exit()
def translate():
    """Interactively pick a file from ./Source, detect its encoding,
    translate it to Russian via the translation API (URL/API_KEY) and
    store the result under ./Result.

    The source language code is taken from the first two characters of the
    chosen file's name (e.g. ``en_text.txt`` -> ``en``).
    """
    detector = UniversalDetector()
    source = 'Source'
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, source)
    file_list = os.listdir(path)
    result_folder = os.path.join(base_dir, 'Result')
    if not os.path.exists(result_folder):
        os.makedirs(result_folder)
    print("Для перевода доступны следующиее файлы:")
    for i in file_list:
        print(i)
    file_name = input('Укажите, какой файл из указанных выше Вы хотитк перевести:')
    file_path = os.path.join(path, file_name)
    with open(file_path, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    code_type = detector.result['encoding']
    print('Файл {} выполнен в кодировке {}' .format(file_name, code_type))
    # First two characters of the file name encode the source language.
    to_lang = os.path.basename(file_path).lower()[:2]
    with open(file_path, encoding=code_type) as translate_file:
        params = {
            'key': API_KEY,
            # BUGFIX: the original passed the file object itself as the
            # request parameter; the API needs the file's text.
            'text': translate_file.read(),
            'lang': '{}-ru'.format(to_lang)
        }
    response = requests.get(URL, params=params)
    json_ = response.json()
    result_path = os.path.join(result_folder, 'Translate_{}'.format(file_name))
    with open(result_path, 'w', encoding='utf-8') as new_file:
        new_file.write(''.join(json_['text']))
    print('Результат перевода:')
    with open(result_path, 'r', encoding='utf-8') as read_result:
        for line in read_result:
            pprint(line)
def insert_resets(csv_path):
    """Load named-account password resets from the CSV at *csv_path* into
    the database, skipping aux-support/aux-testUser rows and rows whose
    date fails to parse.
    """
    # Attempt to detect file encoding
    detector = UniversalDetector()
    file = codecs.open(csv_path, 'rb')
    for line in file:
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    file.close()
    # Fall back to utf-8 when chardet produced no result at all.
    encoding = detector.result['encoding'] if detector.result else 'utf-8'
    with codecs.open(csv_path, 'r', encoding) as named_resets_file:
        # Check if CSV file has a header row.
        sniffer = csv.Sniffer()
        has_header = sniffer.has_header(named_resets_file.read(2048))
        named_resets_file.seek(0)
        named_resets = csv.reader(named_resets_file)
        iter_resets = iter(named_resets)
        if has_header:
            next(iter_resets)
        # Expected columns (positional): agent, acct, acct_location,
        # reset_date, reset_day, reset_type, type — TODO confirm against
        # the producing system.
        for r in iter_resets:
            if r[6] not in ['aux-support', 'aux-testUser']:
                try:
                    named_password_resets = NamedAccountPasswordReset(
                        agent=r[0],
                        acct=r[1],
                        acct_location=r[2] if r[2] else '00099',
                        reset_date=parse(r[3], dayfirst=False, yearfirst=True, fuzzy=True, ignoretz=True),
                        reset_day=r[4],
                        reset_type=r[5],
                        type=r[6] if r[6] else 'Unknown')
                    db.session.add(named_password_resets)
                    # One commit per row: slower, but a bad row only loses
                    # itself.
                    db.session.commit()
                except ValueError:
                    # Unparseable date — skip this row silently.
                    pass
def extract_raw_text(self, filePath, encoding):
    """This function receive a filePath and an encoding value and return
    a string with the text of the given file.

    With encoding == "(auto-detect)" the encoding is first guessed with
    chardet; line endings are normalised to \\n while reading in chunks.
    """
    if encoding == "(auto-detect)":
        detector = UniversalDetector()
        fh = open(filePath, 'rb')
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        fh.close()
        encoding = detector.result['encoding']
    fh = open(
        filePath,
        mode='rU',
        encoding=encoding,
    )
    try:
        i = 0
        fileContent = ""
        chunks = list()
        # Read fixed-size chunks, normalising newlines; periodically fold
        # accumulated chunks into fileContent to bound the list's size.
        for chunk in iter(lambda: fh.read(CHUNK_LENGTH), ""):
            chunks.append('\n'.join(chunk.splitlines()))
            i += CHUNK_LENGTH
            if i % (CHUNK_NUM * CHUNK_LENGTH) == 0:
                fileContent += "".join(chunks)
                chunks = list()
        if len(chunks):
            fileContent += "".join(chunks)
        del chunks
        return fileContent
    except UnicodeError:
        # NOTE(review): `progressBar` and `myFiles` are not defined in this
        # method's scope — presumably globals of the enclosing module;
        # verify they exist or this handler raises NameError itself.
        progressBar.finish()
        if len(myFiles) > 1:
            message = u"Please select another encoding " \
                + u"for file %s." % filePath
        else:
            message = u"Please select another encoding."
        self.infoBox.setText(message, 'error')
        self.send('Text data', None, self)
        self.controlArea.setDisabled(False)
        return
    finally:
        fh.close()
class LDictText:
    """Text to be analysed for word usage"""

    def __init__(self, fileName):
        # Path of the text file to analyse.
        self.fileName = fileName
        # One reusable detector per instance; reset before each use.
        self.detector = UniversalDetector()

    def getWords(self):
        """Returns iterator on words in text"""
        self.detector.reset()
        # Probe the raw bytes for the file's encoding.  The context
        # managers fix the original's two leaked file handles.
        with open(self.fileName, 'rb') as raw:
            for line in raw:
                self.detector.feed(line)
                if self.detector.done:
                    break
        self.detector.close()
        with open(self.fileName, 'r',
                  encoding=self.detector.result['encoding'],
                  errors='replace') as inFile:
            t = inFile.read()
        # patt = "[\p{L}][\p{L}\p{Nd}-]*"
        # Raw string avoids the invalid-escape warnings of the original
        # literal; the pattern itself is unchanged.
        patt = r"[\w][\w\d-]*"
        return re.finditer(patt, t, flags=re.I)
def check_encoding(file_path):
    """Detect the character encoding of *file_path* with chardet.

    Returns the detected encoding name, or ``None`` when detection found
    nothing or an error occurred.  Errors are reported on stdout rather
    than raised.
    """
    # Fix: construct the detector before the try block, so the final
    # `detector.result` lookup cannot raise NameError if open() (or the
    # constructor) fails — `result` is initialised to a None encoding.
    detector = UniversalDetector()
    try:
        with open(file_path, mode='rb') as f:
            for binary in f:
                detector.feed(binary)
                # Stop reading once the detector is confident enough.
                if detector.done:
                    break
        detector.close()
    # The file did not exist.
    except FileNotFoundError as e:
        print('ファイルが見つかりません', e)
    # Any other unexpected error.
    except Exception as e:
        print('予期せぬエラーです', e)
    return detector.result['encoding']
def get_encoding(full_path: str) -> str:
    """Detect the character encoding of the file at *full_path*.

    :param full_path: full path of the file to probe
    :return: the encoding name reported by chardet (may be ``None``)
    :raises ProcessingError: wrapping any exception raised while probing
    """
    try:
        probe = UniversalDetector()
        with open(full_path, 'rb') as handle:
            for chunk in handle:
                probe.feed(chunk)
                if probe.done:
                    break
        probe.close()
        # The detector's verdict; returned directly to the caller.
        return probe.result['encoding']
    except BaseException as miss:
        raise ProcessingError('Encoding detection error') from miss
def get_file_encoding(filename):
    """
    Utility function to incrementally detect the file encoding.

    :param filename: Filename for the file to determine the encoding for. Str
    :return: A dict with the keys 'encoding' and 'confidence'
    """
    sniffer = UniversalDetector()
    try:
        with open(filename, 'rb') as source:
            # Feed 1 KiB chunks until EOF or the detector is confident.
            chunk = source.read(1024)
            while chunk and not sniffer.done:
                sniffer.feed(chunk)
                chunk = source.read(1024)
        sniffer.close()
        return sniffer.result
    except OSError:
        # Implicitly returns None after logging, as callers expect.
        log.exception('Error detecting file encoding')
def get_detail_line(root, filepath):
    """Return file metadata as a list of strings.

    The list is ``[md5, size, mimetype, encoding, path-relative-to-root]``,
    with every element stringified.

    :param root: directory prefix stripped from *filepath*
    :param filepath: absolute path of the file to describe
    """
    md5check = md5(filepath)
    filetype = mimetypes.guess_type(filepath)[0]
    purepath = '/' + filepath[len(root):]
    filesize = os.path.getsize(filepath)
    # Encoding detection.  Fix: the file is now opened with a context
    # manager — the original opened it in the for-header and leaked it.
    detector = UniversalDetector()
    with open(filepath, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    return [
        str(md5check),
        str(filesize),
        str(filetype),
        str(encoding),
        str(purepath)
    ]
def detect_encoding(filename):
    """
    Takes a filename and attempts to detect the character encoding of the
    file using `chardet`.

    :param filename: Name of the file to process
    :type filename: string

    :returns: encoding : string
    """
    sniffer = UniversalDetector()
    # Feed the raw lines until chardet reaches a confident verdict.
    with open(filename, 'rb') as unknown_file:
        for raw_line in unknown_file:
            sniffer.feed(raw_line)
            if sniffer.done:
                break
    sniffer.close()
    return sniffer.result['encoding']
def getFileCod(file):
    """Determine the encoding of *file*.

    Returns the encoding name reported by chardet (or ``None`` when no
    verdict was reached).  Intended for subsequent line-by-line work with
    the file, e.g.::

        file_cod = getFileCod(file_name)
        fh = open(file_name)
        for line in fh:
            line = fh.readline().decode(file_cod)
            print(line)
    """
    sniffer = UniversalDetector()
    with open(file, 'rb') as stream:
        for raw in stream:
            sniffer.feed(raw)
            # Stop as soon as the detector is confident.
            # https://python-scripts.com/question/7401
            if sniffer.done:
                break
    sniffer.close()
    return sniffer.result['encoding']
def get_encoding(data):
    """Try to get encoding incrementally. See
    http://chardet.readthedocs.org/en/latest/usage.html#example-detecting-encoding-incrementally # noqa
    """
    started_at = time.time()
    logger.info('detecting file encoding...')
    detector = UniversalDetector()
    # Treat the in-memory bytes as a file and feed it line by line.
    for line in io.BytesIO(data):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    elapsed_msg = 'encoding found in %s sec' % str(time.time() - started_at)
    logger.info(elapsed_msg + str(detector.result))
    return detector.result
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    detector = UniversalDetector()
    for raw in lines:
        detector.feed(raw)
    detector.close()
    verdict = detector.result
    # Guard clause: no encoding could be determined.
    if not verdict['encoding']:
        return '{0}: no result'.format(name)
    return '{0}: {1} with confidence {2}'.format(name,
                                                 verdict['encoding'],
                                                 verdict['confidence'])
def open(self):
    """Open ``self.path`` as text, detecting and caching its codec.

    On first use the raw bytes are probed with chardet; the resulting
    codec (preferring its "sloppy-" variant when registered) is cached on
    ``self._codec`` for subsequent calls.
    """
    stream = self.path.open(mode="rb")
    if not self._codec:
        # Probe the raw bytes to guess the encoding.
        sniffer = UniversalDetector()
        for raw in stream.readlines():
            sniffer.feed(raw)
            if sniffer.done:
                break
        sniffer.close()
        # Rewind so the text wrapper sees the whole file.
        stream.seek(0)
        guessed = sniffer.result["encoding"]
        # Prefer the "sloppy-" variant of the codec, fall back to the
        # plain one when no such codec is registered.
        try:
            self._codec = codecs.lookup("sloppy-" + guessed)
        except LookupError:
            self._codec = codecs.lookup(guessed)
    return TextIOWrapper(stream, encoding=self._codec.name)
def CheckPattern(ownname, FileName):
    """Validate *FileName* against the JSON pattern file next to *ownname*.

    Returns the string 'none' when the file's extension does not match the
    pattern, otherwise a JSON-like status string with the detected
    encoding, an error indicator and the CSV header columns.

    :param ownname: script name; its stem + '.json' is the pattern file
    :param FileName: CSV file to check
    """
    jsonFName = ownname.split('.')[0] + '.json'
    # Fix: close the pattern file (the original opened it "r+" and leaked
    # the handle).
    with open(jsonFName, "r+") as json_data:
        JD = json.loads(json_data.read())
    extension = '*.' + FileName.split('.')[-1]
    # NOTE(review): this returns on the FIRST non-matching entry, so in
    # effect only one pattern entry is honoured — preserved as-is.
    for key, val in JD.items():
        patFN = val["FileName"]
        if extension != patFN:
            return 'none'
    detector = UniversalDetector()
    with open(FileName, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    code = detector.result
    enc = '"EncodingInfo" :("' + code['encoding'] + '")'
    err = '"ErrorInfo" :("Ошибок нет")'
    col_lst = '"Colls" : ['
    # Fix: the windows-1251 and utf-8 branches were byte-identical except
    # for the codec name — merged via a lookup table, and the CSV handle
    # is now closed via a context manager.
    codec_by_encoding = {'windows-1251': 'cp1251', 'utf-8': 'utf-8'}
    codec = codec_by_encoding.get(code['encoding'])
    if codec:
        with open(FileName, 'r', encoding=codec) as csv_file:
            line1 = next(csv.reader(csv_file))
        for coll in line1:
            dlm = coll.find(';')
            if dlm > 0:
                # A ';' inside a parsed column means the delimiter was wrong.
                err = '"ErrorInfo" :("Неверный формат")'
            else:
                col_lst += '"%s",' % (coll)
    else:
        err = '"ErrorInfo" :("Кодировка не поддерживается")'
    col_lst = col_lst.rstrip(",") + "]"
    return '{"ScriptName" :("csv_l"), "TargetInfo" : ("Сценарий загрузки Акта сверки (csv)"), ' + enc + ',' + err + ',' + col_lst + '}'
def _encoding_heuristic(stream, confidence_threshold,
                        encoding_preferences=('1252', '8859-1'),
                        encoding_excludes=('koi', 'mik', 'iscii', 'tscii',
                                           'viscii', 'jis', 'gb', 'big5',
                                           'hkscs', 'ks', 'euc', '2022'),
                        encoding_default='ascii',
                        error_no_encoding=True):
    """Return a pair of (encoding, confidence)
    Encoding_preferences overrule other code-pages to prevent mistakes.
    Should be changed in other countries than NL!!!"""
    # Remember the stream position so the caller can keep reading from
    # where it left off.
    pos = stream.tell()
    detector = UniversalDetector()
    i = 0
    size = 2**10
    # Feed at most 8 chunks, doubling the chunk size up to 64 KiB, so the
    # probe reads a bounded prefix of the stream.
    while not (detector.done or i > 7):
        detector.feed(stream.read(size))
        i += 1
        if size < 2**16:
            size <<= 1
    stream.seek(pos)
    # Only trust the verdict when the detector finished on its own.
    certain = detector.done
    detector.close()
    confidence, encoding = (detector.result['confidence'],
                            detector.result['encoding']) if certain else (0.0, None)
    if not encoding:
        encoding = encoding_default
    # Below-midpoint confidence in a windows/iso codepage that is not in
    # the preferred list is overridden by the default.
    elif confidence < confidence_threshold + .5 * (1 - confidence_threshold) and \
            (any([base in encoding.lower() for base in ('windows', 'iso')]) and
             not any([cp in encoding for cp in encoding_preferences])):
        # Heuristic: ANSI (mbcs) is much more probable than exotic windows/iso encoding:
        encoding = encoding_default
    # Exotic/CJK codepages are never accepted; fall back with zero confidence.
    if any([enc in encoding.lower() for enc in encoding_excludes]):
        encoding = encoding_default
        confidence = 0.0
    if encoding == 'ascii':
        # Plain ASCII is very improbable. Windows is superset of ascii, so does not hurt.
        # NOTE(review): 'mbcs' is a Windows-only codec — confirm this code
        # never runs on other platforms.
        encoding = 'mbcs'
    if confidence < confidence_threshold and error_no_encoding:
        raise ValueError(
            'Heuristic determination of encoding failed: {:.1%} confidence in "{}", {:.1%} required.'
            .format(confidence, encoding, confidence_threshold))
    return encoding, confidence
def detect_encoding(filename):
    """
    Detects the encoding of a file.

    Parameters
    ----------
    filename: str
        Name of the file to detect the encoding.

    Returns
    -------
    encoding: str
        The encoding of the file.
    """
    detector = UniversalDetector()
    # Fix: the original `open(...).readlines()` leaked the file handle and
    # read the whole file up front.  Iterate lazily inside a context
    # manager and stop once the detector is confident (feeding after
    # `done` is a no-op, so the result is unchanged).
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def detect_file_encode(file):
    """Detect the character encoding of *file*.

    Feeds the file to chardet line by line and returns the full result
    dict; the detector is always closed, even on error.
    """
    detector = UniversalDetector()
    try:
        with open(file, mode='rb') as f:
            # readline() returns b'' at EOF, which ends the iteration.
            for raw in iter(f.readline, b''):
                detector.feed(raw)
                # Stop once the encoding was inferred confidently enough.
                if detector.done:
                    break
    finally:
        detector.close()
    return detector.result
def detect_encoding(filepath):
    """
    Given a path to a CSV of unknown encoding
    read lines to detects its encoding type

    :param filepath: Filepath to check
    :type filepath: str

    :return: Example `{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}`
    :rtype: dict
    """
    sniffer = UniversalDetector()
    with io.open(filepath, 'rb') as source:
        for raw_line in source:
            sniffer.feed(raw_line)
            if sniffer.done:
                break
    sniffer.close()
    return sniffer.result
def guess_file_encoding(path):
    """Return chardet's encoding guess for *path*, or 'utf-8'.

    The guess is only trusted when it is non-None, not plain 'ascii',
    and reported with full (>= 1) confidence; otherwise 'utf-8' is used.
    """
    detector = UniversalDetector()
    with open(path, 'rb') as source:
        for raw in source:
            detector.feed(raw)
            if detector.done:
                break
    detector.close()
    verdict = detector.result
    # Guard clauses replace the original combined boolean flag.
    if verdict['encoding'] is None:
        return 'utf-8'
    if verdict['encoding'] == 'ascii' or verdict['confidence'] < 1:
        return 'utf-8'
    return verdict['encoding']