def csv_properties(self, csv_file):
    """Inspect a CSV file and record its encoding, dialect and header flag.

    The encoding is guessed with chardet from the head of the file, then up
    to 10,240 characters are handed to ``csv.Sniffer`` to work out the
    dialect and whether a header row is present. The outcome is stored in
    ``self.source_file`` and passed to ``self.chkHasHeader.setChecked``.

    Args:
        csv_file: Path of the CSV file to inspect.

    Raises:
        csv.Error: If the sniffer cannot determine the dialect.
    """
    detector = chardet.UniversalDetector()
    with open(csv_file, 'rb') as raw:
        # readlines(100) is a size hint: it stops after roughly 100 bytes'
        # worth of lines, so only the head of the file is sampled.
        for line in raw.readlines(100):
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']

    try:
        with open(csv_file, 'r', newline='', encoding=encoding) as f:
            file_subset = f.read(10240)
    except (UnicodeDecodeError, LookupError):
        # The guessed encoding was wrong or unknown to Python. csv.Sniffer
        # only accepts text, so decode the raw bytes as latin-1: it maps
        # every byte to a character and leaves ASCII delimiters and quote
        # characters intact.
        with open(csv_file, 'rb') as f:
            file_subset = f.read(10240).decode('latin-1')

    # Sniff the first 10 KB of the file to sort out the delimiter and quote
    # character, and to decide whether the first row is a header.
    sniffer = csv.Sniffer()
    csvDialect = sniffer.sniff(file_subset)
    hasFieldHeader = sniffer.has_header(file_subset)

    self.chkHasHeader.setChecked(hasFieldHeader)
    self.source_file = {'file': csv_file,
                        'dialect': csvDialect,
                        'encoding': encoding,
                        'has_header': hasFieldHeader,
                        'field_types': {},
                        }
Пример #2
0
    def gather_candidates(self, context):
        """Collect completion candidates from the TeX files next to the buffer.

        Every ``*.tex`` file in the directory of the current file is read and
        scanned with ``self.ref_re``. Each match is added with its first
        seven characters and its last character removed (presumably the
        label command prefix and closing brace -- confirm against ref_re).

        Detected encodings are cached in ``self.tex_encs`` so each file is
        only analysed once.

        Args:
            context: Not used by this method.

        Returns:
            list: The candidate strings.
        """
        candidates = []

        # Find TeX files that are contained in the same directory as
        # current file.
        file_dir = self.vim.call('expand', '%:p:h')
        tex_files = glob.glob(path.join(file_dir, '*.tex'))

        # Search labels within TeX files
        enc_detector = chardet.UniversalDetector()
        for tex in tex_files:
            # Detect encoding, assuming the encoding never changes
            if tex not in self.tex_encs:
                enc_detector.reset()
                # Use a context manager so the binary handle is closed
                # instead of being left to the garbage collector.
                with open(tex, 'rb') as raw:
                    for chunk in raw:
                        enc_detector.feed(chunk)
                enc_detector.close()
                self.tex_encs[tex] = enc_detector.result['encoding']

            with open(tex, 'r', encoding=self.tex_encs[tex]) as f:
                tex_str = f.read()
            candidates.extend(
                match[7:-1] for match in self.ref_re.findall(tex_str))

        return candidates
Пример #3
0
    def Open(self, fileDir):
        """Register a file on the instance and check that it can be read as text.

        Stores the full path, the backslash-separated directory and file
        name, the chardet-detected encoding, and the mode (``status.DECIPHER``
        for names ending in ``.crypt``, ``status.CIPHER`` otherwise).

        Args:
            fileDir: Path of the file, using backslashes as separators.

        Returns:
            bool: ``True`` if the first 1000 characters decode with the
            detected encoding; ``False`` if decoding fails or the file does
            not exist (a message is printed in both cases).
        """
        self.fileDir = fileDir
        # Everything after the last backslash is the file name, everything
        # before it the directory.
        directory, _, name = fileDir.rpartition("\\")
        self.fileName = name
        self.filePath = directory

        try:
            detector = chardet.UniversalDetector()
            detector.reset()
            with open(self.fileDir, mode='rb') as raw:
                for chunk in raw:
                    detector.feed(chunk)
                    if detector.done:
                        break
            detector.close()
            self.encoding = detector.result['encoding']

            self.mode = (status.DECIPHER if fileDir.endswith(".crypt")
                         else status.CIPHER)

            # Trial read: make sure the detected encoding really decodes the
            # beginning of the file.
            try:
                with open(self.fileDir, "r",
                          encoding=self.encoding) as probe:
                    probe.read(1000)
            except UnicodeDecodeError:
                print(
                    "this type of file cannot be encrypted due to decoding error. try a different type of file"
                )
                return False
            return True

        except FileNotFoundError:
            print("file not found, please make sure spelling is correct")
            return False
Пример #4
0
def get_encoding(filename):
    """Detect the character encoding of a file with chardet.

    The file is read in 64 KiB blocks until chardet reports that it is done
    or the end of the file is reached.

    Parameters
    ----------
    filename: str
        Path to a file

    Returns
    -------
    encoding: str
        Encoding reported by chardet, or ``None`` when the result carries
        no ``"encoding"`` entry.
    """
    block_size = 65536
    sniffer = chardet.UniversalDetector()
    with open(filename, "rb") as stream:
        while not sniffer.done:
            block = stream.read(block_size)
            sniffer.feed(block)
            # A short read means the end of the file has been reached.
            if len(block) < block_size:
                break
    sniffer.close()
    return sniffer.result.get("encoding", None)
Пример #5
0
    def describe_file(self):
        """Inspect the CSV file at ``self.source`` and set instance attributes.

        Sets ``dialect``, ``file_encoding``, ``row_count`` and
        ``column_properties``. ``has_column_header`` is set to ``False`` (and
        a warning is issued) only when no header row is detected.
        """
        sample_size = 10 * 1024
        with open(self.source, 'r') as f:
            # The first 10 KB is what csv.Sniffer gets to work out the
            # delimiter, the quote character and the header row.
            sample = f.read(sample_size)

        self.dialect = csv.Sniffer().sniff(sample)
        if not csv.Sniffer().has_header(sample):
            warnings.warn(
                "The CSV file doesn't appear to contain column headers")
            self.has_column_header = False

        sniffer = chardet.UniversalDetector()
        with open(self.source, 'rb') as raw:
            # readlines(100) is a size hint, so only the head of the file is
            # used for encoding detection.
            for raw_line in raw.readlines(100):
                if sniffer.done:
                    break
                sniffer.feed(raw_line)
        sniffer.close()
        self.file_encoding = sniffer.result['encoding']

        frame = self.open_pandas_dataframe()
        self.row_count = len(frame)
        # Store a dictionary of original and alias names along with column
        # types. In most cases object types will be strings; this enables
        # lookups if necessary.
        self.column_properties = get_column_properties(frame)
        return
Пример #6
0
def detect_encoding(filepath):
    """Detect the character encoding of a file with chardet.

    Args:
        filepath: Path of the file to analyse.

    Returns:
        dict: chardet's result dictionary (it includes the ``'encoding'``
        entry).
    """
    detector = chardet.UniversalDetector()
    # The context manager closes the file even when detection stops early.
    with open(filepath, 'rb') as raw:
        for line in raw:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
Пример #7
0
def detect_charset(file, fallback="utf-8"):
    """Return the character encoding chardet detects for a file.

    Args:
        file: Path of the file to analyse.
        fallback: Encoding name returned when chardet cannot decide.

    Returns:
        The detected encoding name, or ``fallback`` if none was found.
    """
    detector = chardet.UniversalDetector()
    with open(file, "rb") as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    # ``done`` is only set when chardet becomes certain mid-stream; for all
    # other inputs the result is produced by close(). Without this call the
    # function returned ``fallback`` even for detectable files.
    detector.close()
    return detector.result['encoding'] or fallback
Пример #8
0
 def _get_encoding():
     """Run chardet over ``data`` line by line and return its result.

     ``data`` is not a parameter; it is taken from the surrounding scope.
     """
     sniffer = chardet.UniversalDetector()
     for piece in data.splitlines(keepends=True):
         # Stop feeding as soon as chardet has made up its mind.
         if sniffer.done:
             break
         sniffer.feed(piece)
     sniffer.close()
     return sniffer.result
Пример #9
0
def detect_file_encoding(filename):
    """Feed a file to chardet line by line and return the detection result.

    Args:
        filename: Path of the file to analyse.

    Returns:
        The ``result`` of the ``chardet.UniversalDetector``.
    """
    sniffer = chardet.UniversalDetector()
    with open(filename, 'rb') as stream:
        for raw_line in stream:
            if sniffer.done:
                break
            sniffer.feed(raw_line)
    sniffer.close()
    return sniffer.result
Пример #10
0
 def from_file(cls, path: Union[str, Path], encoding=None) -> "Cuesheet":
     """Read a file, decode it and hand the text to ``cls.parse``.

     Args:
         path: Location of the file.
         encoding: Text encoding of the file. When ``None`` it is guessed
             with chardet, restricted to the Chinese and Japanese language
             filters.

     Returns:
         Whatever ``cls.parse`` returns for the decoded text.
     """
     content = Path(path).read_bytes()
     if encoding is None:
         detector = chardet.UniversalDetector(
             lang_filter=LanguageFilter.CHINESE | LanguageFilter.JAPANESE)
         detector.feed(content)
         # chardet reports ``None`` when it cannot decide (for example on an
         # empty file); bytes.decode(None) raises TypeError, so fall back to
         # UTF-8 in that case.
         encoding = detector.close()['encoding'] or 'utf-8'
     return cls.parse(content.decode(encoding))
Пример #11
0
 def _guess_encoding(self):
     """Return the encoding chardet detects for the file at ``self.path``.

     Returns:
         The detected encoding name, or ``None`` if chardet cannot decide.
     """
     detector = chardet.UniversalDetector()
     # The context manager closes the file; the original left it open.
     with open(self.path, 'rb') as file:
         for line in file:
             detector.feed(line)
             if detector.done:
                 break
     # close() finalises the result when ``done`` was never reached; without
     # it the reported encoding can still be ``None``.
     detector.close()
     return detector.result['encoding']
Пример #12
0
 def sniff_encoding(self):  # TODO: where to call this?
     """Guess the encoding of ``self.fname`` and store it in ``self.encoding``."""
     sniffer = chardet.UniversalDetector()
     with open(self.fname, "rb") as stream:
         for raw_line in stream:
             if sniffer.done:
                 break
             sniffer.feed(raw_line)
     sniffer.close()
     self.encoding = sniffer.result["encoding"]
Пример #13
0
def CheckCode(file_path):
    """Return the encoding name chardet detects for ``file_path``.

    The file is fed to the detector line by line until it reports that it
    is done or the file ends. ``None`` is returned when the result has no
    ``'encoding'`` entry.
    """
    sniffer = chardet.UniversalDetector()
    with open(file_path, 'rb') as stream:
        for raw_line in stream:
            if sniffer.done:
                break
            sniffer.feed(raw_line)
    sniffer.close()
    return sniffer.result.get('encoding')
Пример #14
0
def whats_coding(file):
    """Return chardet's detection result for the file at ``file``."""
    # Original author's note: feedback on whether the detector is used
    # correctly here would be welcome.
    sniffer = chardet.UniversalDetector()
    with open(file, 'rb') as stream:
        for raw_line in stream:
            if sniffer.done:
                break
            sniffer.feed(raw_line)
    sniffer.close()
    return sniffer.result
Пример #15
0
def detect_encoding(file_name: pathlib.Path) -> dict:
    """Detect the character encoding of a file with chardet.

    Args:
        file_name: Path object pointing at the file to analyse.

    Returns:
        chardet's result dictionary.
    """
    detector = chardet.UniversalDetector()
    with file_name.open('rb') as _file:
        # Iterate lazily: readlines() loaded the whole file even though
        # detection usually stops early. The ``with`` block closes the
        # file, so no explicit close() call is needed.
        for line in _file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
Пример #16
0
    def detect_list_encoding(self, items, default=DEFAULT_ENCODING):
        """Guess one encoding for the byte strings contained in ``items``.

        Entries that are not ``bytes`` are skipped. The raw chardet result is
        passed through ``normalize_result`` together with ``default``.
        """
        sniffer = chardet.UniversalDetector()
        for item in items:
            if isinstance(item, bytes):
                sniffer.feed(item)
                if sniffer.done:
                    break

        sniffer.close()
        return normalize_result(sniffer.result, default)
Пример #17
0
def get_encoding(filename):
    """Return the encoding chardet detects for ``filename``.

    The file is read in 64 KiB blocks until chardet is done or the file
    ends. ``None`` is returned when the result has no ``"encoding"`` entry.
    """
    block_size = 65536
    sniffer = chardet.UniversalDetector()
    with open(filename, "rb") as stream:
        while not sniffer.done:
            block = stream.read(block_size)
            sniffer.feed(block)
            # A short read means the end of the file has been reached.
            if len(block) < block_size:
                break
    sniffer.close()
    return sniffer.result.get("encoding", None)
Пример #18
0
def whats_coding(file):
    """Return chardet's detection result for the file at ``file``."""
    # Original author's note: feedback on whether the detector is used
    # correctly here would be welcome.
    sniffer = chardet.UniversalDetector()
    with open(file, 'rb') as stream:
        for raw_line in stream:
            if sniffer.done:
                break
            sniffer.feed(raw_line)
    sniffer.close()
    return sniffer.result
Пример #19
0
def get_inc_enc(
        inpt: Union[bytes, bytearray, str, os.PathLike, object]) -> str:
    """Return the character encoding chardet detects for ``inpt``.

    ``inpt`` is first converted with ``get_bytes``; the resulting bytes are
    fed to a ``chardet.UniversalDetector`` line by line. A detection of
    ``'ascii'`` is reported as ``'UTF-8'``.

    Returns:
        The encoding name, or ``None`` if chardet cannot decide.
    """
    inpt = get_bytes(inpt)
    detector = chardet.UniversalDetector()
    for line in inpt.splitlines():
        detector.feed(line)
        if detector.done:
            break
    # Finalise only after the loop. The original closed the detector and
    # returned inside the loop body, so only the first line was ever
    # examined and empty input fell through without reaching the return.
    detector.close()
    encoding = detector.result['encoding']
    if encoding == 'ascii':
        return 'UTF-8'
    return encoding
Пример #20
0
def check_encoding(filepath):
    """Detect the encoding of ``filepath`` with chardet and print the outcome.

    Prints the full result dictionary (without a trailing newline),
    immediately followed by the detected encoding name. Nothing is returned.
    """
    sniffer = chardet.UniversalDetector()
    with open(filepath, mode="rb") as stream:
        for raw_line in stream:
            sniffer.feed(raw_line)
            # Stop reading once the detector reports that it is done.
            if sniffer.done:
                break
    sniffer.close()

    # The detection outcome is exposed as a dictionary on ``result``.
    outcome = sniffer.result
    print(outcome, end='')
    print(outcome['encoding'])
Пример #21
0
def read():
    """Print every line of the second log that is absent from the first.

    Both files are hard-coded UTF-16 logs on the Administrator desktop:
    the lines of ``1695.log`` form the reference set, and each line of
    ``1895.log`` that is not in that set is printed.
    """
    # The unused chardet detector the original created has been removed;
    # both files are opened with an explicit encoding.
    with open(r'C:\Users\Administrator\Desktop\1695.log', 'r', encoding='UTF-16') as f:
        dataset = set(f)

    with open(r'C:\Users\Administrator\Desktop\1895.log', 'r', encoding='UTF-16') as f:
        for line in f:
            if line not in dataset:
                print(line)
def guess_file_encoding(fh, default=DEFAULT_ENCODING):
    """Guess the encoding of an open file handle.

    The handle is read in 10 KiB chunks from its current position and is
    moved back to that position before returning. The chardet result is
    passed through ``normalize_result`` together with ``default``.
    """
    chunk_size = 1024 * 10
    origin = fh.tell()
    sniffer = chardet.UniversalDetector()

    chunk = fh.read(chunk_size)
    while chunk:
        sniffer.feed(chunk)
        if sniffer.done:
            break
        chunk = fh.read(chunk_size)
    else:
        # Input exhausted without an early verdict: finalise the detector.
        sniffer.close()

    fh.seek(origin)
    return normalize_result(sniffer.result, default=default)
Пример #23
0
    def gather_candidates(self, context):
        """Collect citation-key candidates from nearby bib and TeX files.

        Two sources in the directory of the current file are scanned:

        * ``*.bib`` files: for every line starting with ``@``, the text
          matched by ``self.cite_key_re`` minus its first and last character.
        * ``*.tex`` files: group 1 of every ``self.bibitem_re`` match.

        Detected encodings are cached in ``self.bib_encs`` so each file is
        only analysed once.

        Args:
            context: Not used by this method.

        Returns:
            list: The candidate strings.
        """
        enc_detector = chardet.UniversalDetector()

        def cached_encoding(filename):
            # Detect the encoding once per file, assuming it never changes.
            if filename not in self.bib_encs:
                enc_detector.reset()
                # Close the binary handle deterministically; the original
                # iterated over a bare open() and leaked it.
                with open(filename, 'rb') as raw:
                    for chunk in raw:
                        enc_detector.feed(chunk)
                enc_detector.close()
                self.bib_encs[filename] = enc_detector.result['encoding']
            return self.bib_encs[filename]

        candidates = []

        # Find bib files that are contained in the same directory as
        # the TeX file.
        file_dir = self.vim.call('expand', '%:p:h')

        # Search cite key within bib files
        # TODO: Make candidates `dict`, and add `abbr` key to show
        #       bib info.
        for bib in glob.glob(path.join(file_dir, '*.bib')):
            with open(bib, 'r', encoding=cached_encoding(bib)) as f:
                for line in f:
                    if not line.startswith('@'):
                        continue
                    found = self.cite_key_re.search(line)
                    # An '@' line without a recognisable key used to raise
                    # TypeError (subscripting None); skip it instead.
                    if found:
                        candidates.append(found[0][1:-1])

        # Search cite key defined by \bibitem within TeX files
        for tex in glob.glob(path.join(file_dir, '*.tex')):
            with open(tex, 'r', encoding=cached_encoding(tex)) as f:
                for line in f:
                    match = self.bibitem_re.search(line)
                    if match:
                        candidates.append(match[1])

        return candidates
Пример #24
0
 def bigfile_to_utf8(path):
     """Rewrite the file at ``path`` in place as UTF-8.

     The source encoding is detected with chardet. The content is streamed
     line by line into ``<path>.saving``, which then replaces the original
     file. Line endings are normalised to LF, matching the universal-newline
     mode the original code opened the file with.

     Args:
         path: Path of the file to convert.
     """
     import io  # local import: io.open behaves the same on Python 2 and 3

     detector = chardet.UniversalDetector()
     # chardet must be fed bytes, so detection reads the file in binary mode.
     with open(path, 'rb') as fd:
         for line in fd:
             detector.feed(line)
             if detector.done:
                 break
     # Read the result only after close(); before that it may not be final.
     detector.close()
     fencoding = detector.result['encoding']

     # Save the converted text to a separate file first. It is opened with
     # 'w' so leftovers from an interrupted earlier run are discarded rather
     # than appended to. newline='' stops the writer from translating LF.
     new_file = '{0}.{1}'.format(path, 'saving')
     with io.open(path, 'r', encoding=fencoding) as fd, \
             io.open(new_file, 'w', encoding='utf-8', newline='') as fd_saving:
         for line in fd:
             fd_saving.write(line)
     shutil.move(new_file, path)
Пример #25
0
    def post(self, request: HttpRequest, **kwargs)->HttpResponse:
        """Handle a POST: parse the uploaded ``BOMFile`` as CSV when valid.

        When the form validates, the upload's encoding is detected with
        chardet, the first CSV row is stored as headers and the remaining
        rows as data. The response is rendered from ``get_context_data()``
        whether or not the form was valid.
        """
        self.__Form = BOMUploadSelectForm(request.POST, request.FILES)
        if self.__Form.is_valid():
            upload = self.__Form.cleaned_data['BOMFile']

            sniffer = chardet.UniversalDetector()
            for raw_line in upload.readlines():
                sniffer.feed(raw_line)
                if sniffer.done:
                    break
            sniffer.close()

            detected = sniffer.result['encoding']
            text_stream = TextIOWrapper(upload, encoding=detected, errors='strict' )
            # Detection consumed the upload; rewind before parsing.
            text_stream.seek(0)
            rows = csv.reader(text_stream)
            self.__Headers = next(rows)
            self.__Data = list(rows)

        return self.render_to_response(self.get_context_data())
Пример #26
0
    def detect_encoding(self, buffer, *, encoding=None):
        """Work out the encoding name to use for a byte buffer.

        Parameters:
            buffer (bytes): sample of the data
            encoding (str): encoding to normalize; when empty or omitted the
                encoding is detected from ``buffer`` with chardet

        Returns:
            str: normalized codec name
        """

        # A user-supplied detection function takes precedence over everything
        if self.__encoding_function:
            return self.__encoding_function(buffer)

        # Detect the encoding when none was given
        if not encoding:
            sniffer = chardet.UniversalDetector()
            for line in buffer.splitlines():
                sniffer.feed(line)
            sniffer.close()
            outcome = sniffer.result
            encoding = outcome["encoding"] or settings.DEFAULT_ENCODING
            confidence = outcome["confidence"] or 0
            # Low-confidence guesses and plain ASCII both give way to the
            # configured default encoding.
            if confidence < self.__encoding_confidence or encoding == "ascii":
                encoding = settings.DEFAULT_ENCODING
            if encoding is None:
                encoding = self.resource.detector.detect_encoding(buffer)

        # Normalize the name through the codec registry
        encoding = codecs.lookup(encoding).name

        # When the buffer starts with the matching byte-order mark, switch to
        # the codec variant that strips it: "utf-8-sig" for UTF-8 and the
        # byte-order-less "utf-16" for the UTF-16 encodings.
        bom_variants = {
            "utf-8": (codecs.BOM_UTF8, "utf-8-sig"),
            "utf-16-be": (codecs.BOM_UTF16_BE, "utf-16"),
            "utf-16-le": (codecs.BOM_UTF16_LE, "utf-16"),
        }
        if encoding in bom_variants:
            bom, variant = bom_variants[encoding]
            if buffer.startswith(bom):
                encoding = variant

        return encoding
Пример #27
0
def open_file(filename, encoding="*", skip_unreadable=False, split=True):
    """Read a text file, optionally auto-detecting its encoding.

    Args:
        filename: Path of the file to read.
        encoding: Encoding to open the file with. The default ``"*"`` asks
            chardet to detect it from the file content.
        skip_unreadable: On a decoding failure, ``False`` reports the
            problem through ``die`` while any other value re-raises the
            ``UnicodeDecodeError``.
        split: When ``True`` the text is returned as a list of lines with
            their line endings kept; otherwise as one string.

    Returns:
        The file content, as a list of lines or a single string.
    """
    if not path.exists(filename):
        die("Can't find file " + filename)

    if encoding == "*":
        # chardet is only needed for auto-detection, so it is imported here
        # rather than unconditionally.
        import chardet
        d = chardet.UniversalDetector()
        # The binary handle used for detection is closed by the ``with``
        # block; the original replaced it without closing it.
        with open(filename, "rb") as raw:
            length = raw.seek(0, 2)
            raw.seek(0, 0)
            while d.done == False and raw.tell() < length:
                d.feed(raw.read(50))
        charencoding = d.close()["encoding"]
        p = open(filename, mode="r", encoding=charencoding)
    else:
        p = open(filename, mode="r", encoding=encoding)
    try:
        # readlines(2) is a size hint, so in practice only the first line
        # or two are inspected for an encoding declaration.
        lines = p.readlines(2)
        for i in lines:
            if i.startswith("#") and 'coding' in i:
                i = i.strip()
                # NOTE(review): a line that starts with "#" cannot also
                # start with "-*-", so this branch looks unreachable; kept
                # unchanged pending confirmation of the intended format.
                if i.startswith("-*-") and i.endswith("-*-"):
                    i = i.split("-*- coding: ", 1)[0].strip(" -*-")
                    print("Found encoding " + i, filename)
                    reopened = open(filename, encoding=i, mode="r")
                    p.close()
                    p = reopened
                    break
            if i.startswith("#coding="):
                i = i.split("#coding=", 1)[1].strip()
                # Open with the declared encoding first, then release the
                # previous handle so it does not leak.
                reopened = open(filename, mode="r", encoding=i)
                p.close()
                p = reopened
                break
        p.seek(0)
        data = p.read()
    except UnicodeDecodeError:
        # NOTE(review): the name ``skip_unreadable`` suggests the opposite
        # branch order; behaviour is kept exactly as it was.
        if skip_unreadable == False:
            die("Can't decode " + filename + " with encoding " + p.encoding)
        else:
            raise
    finally:
        # Close whichever handle is current, on success and on any error.
        p.close()
    if split == True: data = data.splitlines(True)
    return data
Пример #28
0
def read_subtitle_file(filename):
    """Measure the speech rate of a subtitle file.

    Every line containing ``-->`` is treated as a timing line: the
    difference between its third and first whitespace-separated fields (as
    converted by ``format_time``) is added to the total duration. When the
    following line is non-blank and ``judge_pure_english`` accepts it, the
    number of items returned by ``segment_word`` is added to the word count.

    Args:
        filename: Path of the subtitle file.

    Returns:
        tuple: ``(word_count, milliseconds, frequency)`` where ``frequency``
        is words per minute, or ``0.0`` when no duration was measured.
    """
    milliseconds = 0
    word_count = 0
    threshold = 50  # at most this many lines are used to detect the encoding

    detector = chardet.UniversalDetector()
    # Read only the lines needed for detection, and let the context manager
    # close the handle; the original read the whole file twice.
    with open(filename, "rb") as raw:
        for line_number, line in enumerate(raw):
            if line_number >= threshold:
                break
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    with open(
            filename, 'r', encoding=detector.result['encoding'],
            errors='ignore') as file:
        text = file.readlines()

    filelines = len(text)
    for i, line in enumerate(text):
        if '-->' not in line:
            continue
        elements = line.replace('-->', ' --> ').split()
        try:
            milliseconds += format_time(elements[2]) - format_time(
                elements[0])
        except Exception:
            # A malformed timing line ends the scan, as before; unlike a
            # bare ``except`` this no longer traps KeyboardInterrupt.
            break

        # Only the line directly after the timing line is examined.
        if i + 1 >= filelines:
            continue
        following = text[i + 1]
        if following.strip() == '':
            continue
        if judge_pure_english(following):
            try:
                word_count += len(segment_word(following))
            except Exception:
                # Best effort: a line that cannot be segmented is skipped.
                pass

    # Without any timing line the duration is zero; report a rate of zero
    # instead of raising ZeroDivisionError.
    frequency = word_count / (milliseconds / 60000) if milliseconds else 0.0
    return word_count, milliseconds, frequency
Пример #29
0
    def set_file_utf8(path):
        """Re-encode the file at ``path`` as UTF-8, in place.

        The source encoding is detected with chardet. The content is
        converted line by line into a hidden temporary file (the original
        name prefixed with a dot, in the same directory), which then
        replaces the original file.

        Args:
            path: Path of the file to convert.
        """
        detector = chardet.UniversalDetector()
        # Opened read/write, as before, so a file that cannot be written
        # fails here, before any temporary copy is created.
        with open(path, 'r+b') as fd:
            for line in fd:
                detector.feed(line)
                if detector.done:
                    break

        detector.close()
        encoding = detector.result['encoding']

        # os.path.split keeps the root directory for paths such as "/file";
        # rpartition(os.sep) returned an empty directory there and placed
        # the temporary file in the current working directory.
        dirname, basename = os.path.split(path)
        new_file = os.path.join(dirname, '.{0}'.format(basename))
        # 'wb' truncates leftovers from an interrupted earlier run; append
        # mode would have glued the new content onto them.
        with open(path, 'rb') as fd, open(new_file, 'wb') as _fd:
            for line in fd:
                _fd.write(line.decode(encoding).encode('utf-8'))

        shutil.move(new_file, path)
Пример #30
0
def guess_file_encoding(fh: BinaryIO,
                        default: Encoding = DEFAULT_ENCODING) -> Encoding:
    """Guess the encoding of an open file handle (deprecated).

    Emits a ``DeprecationWarning``, then reads the handle in 10 KiB chunks
    from its current position. The handle is moved back to that position
    before returning, and the chardet result is passed through
    ``normalize_result`` together with ``default``.
    """
    warnings.warn(
        "guess_encoding is now deprecated. Use predict_encoding instead",
        DeprecationWarning,
    )
    chunk_size = 1024 * 10
    origin = fh.tell()
    sniffer = chardet.UniversalDetector()

    chunk = fh.read(chunk_size)
    while chunk:
        sniffer.feed(chunk)
        if sniffer.done:
            break
        chunk = fh.read(chunk_size)
    else:
        # Input exhausted without an early verdict: finalise the detector.
        sniffer.close()

    fh.seek(origin)
    return normalize_result(sniffer.result, default=default)