def csv_properties(self, csv_file):
    detector = chardet.UniversalDetector()
    with open(csv_file, 'rb') as eaop:
        for line in eaop.readlines(100):
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    try:
        with open(csv_file, 'r', newline='', encoding=detector.result['encoding']) as f:
            file_subset = f.read(10240)
    except (LookupError, UnicodeDecodeError):
        # Fall back to a permissive decode: csv.Sniffer needs str, not bytes.
        with open(csv_file, 'r', newline='', encoding='utf-8', errors='replace') as f:
            file_subset = f.read(10240)
    # Sniff into 10KB of the file to check its dialect;
    # this will sort out the delimiter and quote character.
    csvDialect = csv.Sniffer().sniff(file_subset)
    # Check for a header based on the same 10KB of the file.
    hasFieldHeader = csv.Sniffer().has_header(file_subset)
    self.chkHasHeader.setChecked(hasFieldHeader)
    self.source_file = {'file': csv_file,
                        'dialect': csvDialect,
                        'encoding': detector.result['encoding'],
                        'has_header': hasFieldHeader,
                        'field_types': {},
                        }
def gather_candidates(self, context):
    candidates = []
    # Find TeX files that are contained in the same directory as
    # the current file.
    file_dir = self.vim.call('expand', '%:p:h')
    tex_files = glob.glob(path.join(file_dir, '*.tex'))
    # Search labels within TeX files
    enc_detector = chardet.UniversalDetector()
    for tex in tex_files:
        # Detect encoding, assuming the encoding never changes
        if tex not in self.tex_encs:
            enc_detector.reset()
            with open(tex, 'rb') as f:
                for l in f:
                    enc_detector.feed(l)
                    if enc_detector.done:
                        break
            enc_detector.close()
            self.tex_encs[tex] = enc_detector.result['encoding']
        with open(tex, 'r', encoding=self.tex_encs[tex]) as f:
            tex_str = f.read()
        for match in self.ref_re.findall(tex_str):
            candidates.append(match[7:-1])
    return candidates
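# Hedged sketch: self.ref_re is not shown in the snippet above. A regex
# consistent with the match[7:-1] slicing (len(r'\label{') == 7 plus a
# trailing '}') might look like the following; `ref_re` here is a
# hypothetical stand-in, not the source's actual attribute.
import re

ref_re = re.compile(r'\\label\{[^}]*\}')
# re.findall returns full matches such as '\\label{sec:intro}';
# slicing [7:-1] then yields the bare label name 'sec:intro'.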
def Open(self, fileDir):
    self.fileDir = fileDir
    parts = fileDir.split("\\")  # avoid shadowing the built-in `list`
    self.fileName = parts[-1]
    del parts[-1]
    self.filePath = '\\'.join(parts)
    try:
        detector = chardet.UniversalDetector()
        detector.reset()
        with open(self.fileDir, mode='rb') as f:
            for b in f:
                detector.feed(b)
                if detector.done:
                    break
        detector.close()
        self.encoding = detector.result['encoding']
        if fileDir.endswith(".crypt"):
            self.mode = status.DECIPHER
        else:
            self.mode = status.CIPHER
        try:
            with open(self.fileDir, "r", encoding=self.encoding) as read_test:
                read_test.read(1000)
        except UnicodeDecodeError:
            print("This type of file cannot be encrypted due to a decoding "
                  "error; try a different type of file.")
            return False
        return True
    except FileNotFoundError:
        print("File not found; please make sure the spelling is correct.")
        return False
def get_encoding(filename):
    """Get the encoding of the file.

    This function uses the chardet package for detecting the encoding
    of a file.

    Parameters
    ----------
    filename : str
        Path to a file.

    Returns
    -------
    encoding : str
        Encoding of the file.
    """
    detector = chardet.UniversalDetector()
    final_chunk = False
    blk_size = 65536
    with open(filename, "rb") as fid:
        while (not final_chunk) and (not detector.done):
            chunk = fid.read(blk_size)
            if len(chunk) < blk_size:
                final_chunk = True
            detector.feed(chunk)
    detector.close()
    encoding = detector.result.get("encoding", None)
    return encoding
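# A minimal usage sketch for get_encoding above. The temp file and its
# contents are made up for illustration; chardet will typically report
# 'utf-8' for this payload (or 'ascii' for pure-ASCII content).
import os
import tempfile

def _demo_get_encoding():
    with tempfile.NamedTemporaryFile(mode="wb", suffix=".txt", delete=False) as tmp:
        tmp.write("héllo wörld\n".encode("utf-8"))
    try:
        print(get_encoding(tmp.name))  # e.g. 'utf-8'
    finally:
        os.remove(tmp.name)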
def describe_file(self):
    """Describe a CSV file and set class properties."""
    with open(self.source, 'r') as f:
        # Sniff into 10KB of the file to check its dialect;
        # this will sort out the delimiter and quote character.
        self.dialect = csv.Sniffer().sniff(f.read(10 * 1024))
        f.seek(0)  # reset read to start of file
        # Check for a header based on the first 10KB of the file.
        header = csv.Sniffer().has_header(f.read(10 * 1024))
        f.seek(0)  # reset read to start of file
        if not header:
            warnings.warn(
                "The CSV file doesn't appear to contain column headers")
            self.has_column_header = False
    detector = chardet.UniversalDetector()
    with open(self.source, 'rb') as eaop:
        for line in eaop.readlines(100):
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    self.file_encoding = detector.result['encoding']
    pandas_df = self.open_pandas_dataframe()
    self.row_count = len(pandas_df)
    # Store a dictionary of original and alias names along with column
    # types. In most cases object columns will be strings; this will
    # enable lookups if necessary.
    self.column_properties = get_column_properties(pandas_df)
    return
def detect_encoding(filepath):
    detector = chardet.UniversalDetector()
    with open(filepath, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
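# For contrast with the incremental pattern above: chardet's one-shot API
# reads the whole payload at once. The result dict has the same shape, but
# the incremental detector can stop early on large files.
def detect_encoding_oneshot(filepath):
    with open(filepath, 'rb') as f:
        return chardet.detect(f.read())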
def detect_charset(file, fallback="utf-8"):
    detector = chardet.UniversalDetector()
    with open(file, "rb") as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    # close() finalizes the result even when the detector never reached done.
    detector.close()
    return detector.result['encoding'] or fallback
def _get_encoding():
    # `data` is taken from the enclosing scope.
    detector = chardet.UniversalDetector()
    for chunk in data.splitlines(keepends=True):
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    return detector.result
def detect_file_encoding(filename):
    detector = chardet.UniversalDetector()
    with open(filename, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
def from_file(cls, path: Union[str, Path], encoding=None) -> "Cuesheet":
    content = Path(path).read_bytes()
    if encoding is None:
        detector = chardet.UniversalDetector(
            lang_filter=LanguageFilter.CHINESE | LanguageFilter.JAPANESE)
        detector.feed(content)
        encoding = detector.close()['encoding']
    return cls.parse(content.decode(encoding))
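# Side note on the pattern above: UniversalDetector.close() returns the final
# result dict, so feeding the whole buffer once and indexing close() is a
# valid one-shot use of the incremental API. A minimal standalone sketch
# (the 'utf-8' fallback is an assumption, not from the source):
def detect_bytes(content: bytes) -> str:
    detector = chardet.UniversalDetector()
    detector.feed(content)
    return detector.close()['encoding'] or 'utf-8'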
def _guess_encoding(self):
    detector = chardet.UniversalDetector()
    with open(self.path, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def sniff_encoding(self):
    # TODO: where to call this?
    detector = chardet.UniversalDetector()
    with open(self.fname, "rb") as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    self.encoding = detector.result["encoding"]
def CheckCode(file_path):
    detector = chardet.UniversalDetector()
    with open(file_path, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    output = detector.result.get('encoding')
    return output
def whats_coding(file):
    # Feedback welcome: am I using the detector correctly?
    detector = chardet.UniversalDetector()
    with open(file, 'rb') as fl:
        for ln in fl:
            detector.feed(ln)
            if detector.done:
                break
    detector.close()
    return detector.result
def detect_encoding(file_name: pathlib.Path) -> dict:
    detector = chardet.UniversalDetector()
    with file_name.open('rb') as _file:
        for line in _file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
def detect_list_encoding(self, items, default=DEFAULT_ENCODING):
    detector = chardet.UniversalDetector()
    for text in items:
        if not isinstance(text, bytes):
            continue
        detector.feed(text)
        if detector.done:
            break
    detector.close()
    return normalize_result(detector.result, default)
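# normalize_result and DEFAULT_ENCODING above come from the surrounding
# module and are not shown here. A hypothetical stand-in consistent with how
# they are used in these snippets (the 0.2 confidence cutoff is an assumption):
DEFAULT_ENCODING = "utf-8"

def normalize_result(result, default=DEFAULT_ENCODING):
    result = result or {}
    encoding = result.get("encoding")
    if encoding is None or (result.get("confidence") or 0) < 0.2:
        return default
    return encoding.lower()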
def get_encoding(filename):
    detector = chardet.UniversalDetector()
    final_chunk = False
    blk_size = 65536
    with open(filename, "rb") as fid:
        while (not final_chunk) and (not detector.done):
            chunk = fid.read(blk_size)
            if len(chunk) < blk_size:
                final_chunk = True
            detector.feed(chunk)
    detector.close()
    encoding = detector.result.get("encoding", None)
    return encoding
def whats_coding(file):
    # Feedback welcome: am I using the detector correctly?
    detector = chardet.UniversalDetector()
    with open(file, 'rb') as fl:
        for ln in fl:
            detector.feed(ln)
            if detector.done:
                break
    detector.close()
    # One-shot alternative kept for reference:
    # data = fl.read()
    # result = chardet.detect(data)
    # s = data.decode(result['encoding'])
    # print(s)
    return detector.result
def get_inc_enc(
        inpt: Union[bytes, bytearray, str, os.PathLike, object]) -> str:
    """Return the character encoding using chardet.UniversalDetector."""
    inpt = get_bytes(inpt)
    detector = chardet.UniversalDetector()
    for line in inpt.splitlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    result = detector.result
    if result['encoding'] == 'ascii':
        # Plain ASCII is a strict subset of UTF-8, so report the superset.
        result['encoding'] = 'UTF-8'
    return result['encoding']
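# get_bytes above is assumed from the snippet's module. A hypothetical
# sketch that matches the annotated input types (the path handling and the
# utf-8 fallback for plain strings are guesses):
import os

def get_bytes(inpt) -> bytes:
    if isinstance(inpt, (bytes, bytearray)):
        return bytes(inpt)
    if isinstance(inpt, os.PathLike):
        with open(inpt, 'rb') as f:
            return f.read()
    if isinstance(inpt, str):
        if os.path.exists(inpt):
            with open(inpt, 'rb') as f:
                return f.read()
        return inpt.encode('utf-8')
    return bytes(inpt)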
def check_encoding(filepath):
    '''Analyze a file's encoding.

    args: filepath
    '''
    detector = chardet.UniversalDetector()
    with open(filepath, mode="rb") as file:
        for binary in file:
            detector.feed(binary)
            # once the detector has seen enough data, the loop can end early
            if detector.done:
                break
    detector.close()
    # the detection result is a dictionary
    print(detector.result, end='')
    print(detector.result['encoding'])
def read():
    # Print lines of the second log that do not appear in the first.
    # (The chardet detector created here originally was never used.)
    dataset = set()
    with open(r'C:\Users\Administrator\Desktop\1695.log', 'r',
              encoding='UTF-16') as f:
        for line in f:
            dataset.add(line)
    with open(r'C:\Users\Administrator\Desktop\1895.log', 'r',
              encoding='UTF-16') as f:
        for line in f:
            if line not in dataset:
                print(line)
def guess_file_encoding(fh, default=DEFAULT_ENCODING):
    """Guess encoding from a file handle."""
    start = fh.tell()
    detector = chardet.UniversalDetector()
    while True:
        data = fh.read(1024 * 10)
        if not data:
            detector.close()
            break
        detector.feed(data)
        if detector.done:
            break
    fh.seek(start)
    return normalize_result(detector.result, default=default)
def gather_candidates(self, context):
    candidates = []
    # Find bib files that are contained in the same directory as
    # the TeX file.
    file_dir = self.vim.call('expand', '%:p:h')
    bib_files = glob.glob(path.join(file_dir, '*.bib'))
    # Search cite keys within bib files
    # TODO: Make candidates `dict`, and add `abbr` key to show
    # bib info.
    enc_detector = chardet.UniversalDetector()
    for bib in bib_files:
        # Detect encoding, assuming the encoding never changes
        if bib not in self.bib_encs:
            enc_detector.reset()
            with open(bib, 'rb') as f:
                for l in f:
                    enc_detector.feed(l)
                    if enc_detector.done:
                        break
            enc_detector.close()
            self.bib_encs[bib] = enc_detector.result['encoding']
        with open(bib, 'r', encoding=self.bib_encs[bib]) as f:
            bib_lines = f.readlines()
        for l in bib_lines:
            if l.startswith('@'):
                candidates.append(self.cite_key_re.search(l)[0][1:-1])
    # Search cite keys defined by \bibitem within TeX files
    tex_files = glob.glob(path.join(file_dir, '*.tex'))
    for tex in tex_files:
        # Detect encoding, assuming the encoding never changes
        if tex not in self.bib_encs:
            enc_detector.reset()
            with open(tex, 'rb') as f:
                for l in f:
                    enc_detector.feed(l)
                    if enc_detector.done:
                        break
            enc_detector.close()
            self.bib_encs[tex] = enc_detector.result['encoding']
        with open(tex, 'r', encoding=self.bib_encs[tex]) as f:
            tex_lines = f.readlines()
        for l in tex_lines:
            match = self.bibitem_re.search(l)
            if match:
                candidates.append(match[1])
    return candidates
def bigfile_to_utf8(path):
    detector = chardet.UniversalDetector()
    with open(path, 'rb') as fd:  # binary mode: the detector needs bytes
        for line in fd:
            detector.feed(line)
            if detector.done:
                break
    detector.close()  # finalize before reading the result
    fencoding = detector.result['encoding']
    # Re-encode into a temporary file, then replace the original.
    new_file = '{0}.{1}'.format(path, 'saving')
    with open(new_file, 'a+b') as fd_saving:
        with open(path, 'rb') as fd:
            for line in fd:
                encode_line = line.decode(fencoding).encode('utf-8')
                fd_saving.write(encode_line)
    shutil.move(new_file, path)
def post(self, request: HttpRequest, **kwargs) -> HttpResponse:
    self.__Form = BOMUploadSelectForm(request.POST, request.FILES)
    if self.__Form.is_valid():
        F = self.__Form.cleaned_data['BOMFile']
        detector = chardet.UniversalDetector()
        for line in F.readlines():
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        CSVFile = TextIOWrapper(F,
                                encoding=detector.result['encoding'],
                                errors='strict')
        CSVFile.seek(0)
        Reader = csv.reader(CSVFile)
        self.__Headers = next(Reader)
        self.__Data = list(Reader)
    context = self.get_context_data()
    return self.render_to_response(context)
def detect_encoding(self, buffer, *, encoding=None):
    """Detect encoding from a buffer

    Parameters:
        buffer (bytes): byte buffer

    Returns:
        str: encoding
    """
    # Use the user-defined function if one is set
    if self.__encoding_function:
        return self.__encoding_function(buffer)
    # Detect encoding
    if not encoding:
        detector = chardet.UniversalDetector()
        for line in buffer.splitlines():
            detector.feed(line)
        detector.close()
        encoding = detector.result["encoding"] or settings.DEFAULT_ENCODING
        confidence = detector.result["confidence"] or 0
        if confidence < self.__encoding_confidence:
            encoding = settings.DEFAULT_ENCODING
        if encoding == "ascii":
            encoding = settings.DEFAULT_ENCODING
        if encoding is None:
            encoding = self.resource.detector.detect_encoding(buffer)
    # Normalize encoding
    encoding = codecs.lookup(encoding).name
    # Work around incorrect inference of utf-8-sig
    if encoding == "utf-8":
        if buffer.startswith(codecs.BOM_UTF8):
            encoding = "utf-8-sig"
    # Use the BOM-stripping name (without byte order) for UTF-16 encodings
    elif encoding == "utf-16-be":
        if buffer.startswith(codecs.BOM_UTF16_BE):
            encoding = "utf-16"
    elif encoding == "utf-16-le":
        if buffer.startswith(codecs.BOM_UTF16_LE):
            encoding = "utf-16"
    return encoding
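# Why the BOM normalization above matters, in a self-contained sketch:
# decoding a BOM-prefixed buffer as plain 'utf-8' leaves U+FEFF in the text,
# while 'utf-8-sig' strips it.
import codecs

def _demo_bom():
    buf = codecs.BOM_UTF8 + "name,age\n".encode("utf-8")
    print(buf.decode("utf-8")[0] == "\ufeff")          # True: stray BOM char
    print(buf.decode("utf-8-sig").startswith("name"))  # True: BOM stripped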
def open_file(filename, encoding="*", skip_unreadable=False, split=True):
    if 'chardet' not in globals():
        import chardet
    p = None
    if not path.exists(filename):
        die("Can't find file " + filename)
    if encoding == "*":
        # Detect the encoding by feeding small chunks until chardet is done.
        with open(filename, "rb") as raw:
            d = chardet.UniversalDetector()
            length = raw.seek(0, 2)
            raw.seek(0, 0)
            while not d.done and raw.tell() < length:
                d.feed(raw.read(50))
            charencoding = d.close()["encoding"]
        p = open(filename, mode="r", encoding=charencoding)
    else:
        p = open(filename, mode="r", encoding=encoding)
    try:
        # Honor an explicit coding declaration in the first lines, if any.
        lines = p.readlines(2)
        for i in lines:
            if i.startswith("#") and 'coding' in i:
                i = i.strip()
                if '-*- coding:' in i and i.endswith("-*-"):
                    i = i.split("-*- coding:", 1)[1].strip(" -*-")
                    print("Found encoding " + i, filename)
                    p.close()
                    p = open(filename, encoding=i, mode="r")
                    break
                if i.startswith("#coding="):
                    i = i.split("#coding=", 1)[1].strip()
                    p.close()
                    p = open(filename, mode="r", encoding=i)
                    break
        p.seek(0)
        data = p.read()
        p.close()
    except UnicodeDecodeError:
        if p:
            p.close()
        if not skip_unreadable:
            die("Can't decode " + filename + " with encoding " + p.encoding)
        else:
            raise
    if split:
        data = data.splitlines(True)
    return data
def read_subtitle_file(filename):
    milliseconds = 0
    word_count = 0
    detector = chardet.UniversalDetector()
    # Feed at most the first 50 lines to the detector.
    threshold = 50
    with open(filename, "rb") as file:
        filelines = len(file.readlines())
        file.seek(0)
        n = filelines if filelines < threshold else threshold
        for line in file.readlines()[0:n]:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    with open(filename, 'r', encoding=detector.result['encoding'],
              errors='ignore') as file:
        text = file.readlines()
    filelines = len(text)
    for i in range(filelines):
        if '-->' in text[i]:
            text[i] = text[i].replace('-->', ' --> ')
            elements = text[i].split()
            try:
                milliseconds += format_time(elements[2]) - format_time(elements[0])
            except Exception:
                break
            for j in range(1, 2):
                if i + j >= filelines:
                    break
                if text[i + j].strip() == '':
                    break
                if judge_pure_english(text[i + j]):
                    try:
                        word_count += len(segment_word(text[i + j]))
                    except Exception:
                        break
    # words per minute: milliseconds / 60000 converts to minutes
    frequency = word_count / (milliseconds / 60000)
    return word_count, milliseconds, frequency
def set_file_utf8(path):
    detector = chardet.UniversalDetector()
    with open(path, 'r+b') as fd:
        for line in fd:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    # Re-encode into a hidden sibling file, then replace the original.
    dirname, _, basename = path.rpartition(os.sep)
    new_file = os.path.join(dirname, '.{0}'.format(basename))
    with open(new_file, 'a+b') as _fd:
        with open(path, 'r+b') as fd:
            for line in fd:
                encoded_data = line.decode(encoding).encode('utf-8')
                _fd.write(encoded_data)
    shutil.move(new_file, path)
def guess_file_encoding(fh: BinaryIO, default: Encoding = DEFAULT_ENCODING) -> Encoding:
    """Guess encoding from a file handle."""
    warnings.warn(
        "guess_encoding is now deprecated. Use predict_encoding instead",
        DeprecationWarning,
    )
    start = fh.tell()
    detector = chardet.UniversalDetector()
    while True:
        data = fh.read(1024 * 10)
        if not data:
            detector.close()
            break
        detector.feed(data)
        if detector.done:
            break
    fh.seek(start)
    return normalize_result(detector.result, default=default)