Exemplo n.º 1
0
def file_encoding(path):
    """Guess the encoding of the file at *path*.

    The file is read line by line in binary mode and fed to a
    ``cchardet.UniversalDetector`` until the detector reports it is done
    (or the file ends).  While reading, occurrences of two sets of marker
    bytes are tallied; judging by their names they are meant to tell
    ISO-8859-2 and Windows-1250 apart (NOTE(review): confirm the byte
    tables against the code pages).

    Returns:
        A ``(encoding, backup_encoding)`` tuple.  ``encoding`` is whatever
        the detector reported (may be ``None``).  ``backup_encoding`` is
        ``'utf-8'`` unless the detector's confidence is below 0.95 and at
        least one marker byte was seen, in which case it is
        ``'Windows-1250'`` when the cp markers outnumber the iso markers
        and ``'iso-8859-2'`` otherwise.
    """
    iso_markers = (b'\xb1', b'\xac', b'\xbc', b'\xa1', b'\xb6', b'\xa6')
    cp_markers = (b'\xb9', b'\xa5', b'\x9f', b'\x8f', b'\x8c', b'\x9c')

    iso_hits = cp_hits = 0
    detector = cchardet.UniversalDetector()

    with open(path, 'rb') as stream:
        for raw_line in stream:
            iso_hits += sum(raw_line.count(marker) for marker in iso_markers)
            cp_hits += sum(raw_line.count(marker) for marker in cp_markers)

            detector.feed(raw_line)
            if detector.done:
                # Lines after this point are neither fed nor counted.
                break
    detector.close()

    outcome = detector.result
    detected = outcome.get('encoding')
    confidence = outcome.get('confidence') or 0.0

    fallback = 'utf-8'
    if (cp_hits or iso_hits) and confidence < 0.95:
        if cp_hits > iso_hits:
            fallback = 'Windows-1250'
        else:
            fallback = 'iso-8859-2'
    return detected, fallback
Exemplo n.º 2
0
def detect_encoding(bytesio: io.BytesIO) -> str:
    """
    Detect charset, as Python-friendly encoding string.

    Peculiarities:

    * Reads the stream in chunks of ``settings.CHARDET_CHUNK_SIZE``
    * Stops reading when the detector's ``done`` flag is True
    * Seeks back to beginning of file for downstream usage
    * Returns "UTF-8" when the detector has no guess (e.g. empty input) or
      reports ASCII -- since the parse framework is designed to be
      UTF-native.

    Args:
        bytesio: Seekable binary stream to inspect; it is rewound to
            offset 0 before returning.

    Returns:
        The detected encoding name, or "UTF-8" as described above.
    """
    detector = chardet.UniversalDetector()
    while not detector.done:
        chunk = bytesio.read(settings.CHARDET_CHUNK_SIZE)
        if not chunk:
            break  # EOF
        detector.feed(chunk)

    detector.close()
    bytesio.seek(0)
    encoding = detector.result["encoding"]
    # Compare case-insensitively: detectors differ in how they spell ASCII
    # ("ascii" vs "ASCII"), and an exact match would let one of them leak
    # through instead of being promoted to UTF-8.
    if encoding is None or encoding.upper() == "ASCII":
        return "UTF-8"
    return encoding
Exemplo n.º 3
0
    def test_github_issue_20(self):
        """
        Regression check for https://github.com/PyYoshi/cChardet/issues/20

        Runs a single 0x8f byte through both the one-shot ``detect`` call
        and the incremental detector; the test passes as long as neither
        raises.
        """
        lone_byte = b'\x8f'

        # One-shot API.
        cchardet.detect(lone_byte)

        # Incremental API.
        incremental = cchardet.UniversalDetector()
        incremental.feed(lone_byte)
        incremental.close()
Exemplo n.º 4
0
def guess_encoding_from_stream(stream, chunk_size=4096, chardet_threshold=0.5):
    """Guess the encoding of *stream* by feeding it to chardet in chunks.

    Args:
        stream: Object with a ``read(size)`` method.
        chunk_size: Size passed to each ``stream.read`` call.
        chardet_threshold: Minimum confidence the detector must report.

    Returns:
        The encoding name reported by the detector.

    Raises:
        ValueError: If the detector reports no confidence, or a confidence
            below ``chardet_threshold``.
    """
    detector = chardet.UniversalDetector()
    while True:
        block = stream.read(chunk_size)
        # A chunk is read before the ``done`` flag is consulted, so the
        # chunk read right after the detector finishes is consumed but
        # never fed.
        if not block or detector.done:
            break
        detector.feed(block)
    detector.close()

    outcome = detector.result
    score = outcome.get("confidence")
    if score and score >= chardet_threshold:
        return outcome["encoding"]
    raise ValueError("Failed to detect encoding")
Exemplo n.º 5
0
def guess_file_encoding(fh, default=DEFAULT_ENCODING):
    """Guess encoding from a file handle.

    Reads at most 1024 chunks of 1024 bytes from the handle's current
    position, feeding each to a chardet detector and stopping early on an
    empty read or once the detector is done.  The handle is then moved
    back to where it started, and the detector result is passed to
    ``normalize_result`` together with *default*.
    """
    origin = fh.tell()
    sniffer = chardet.UniversalDetector()
    for _ in six.moves.range(1024):
        block = fh.read(1024)
        if not block:
            break
        sniffer.feed(block)
        if sniffer.done:
            break

    sniffer.close()
    fh.seek(origin)
    return normalize_result(sniffer.result, default=default)
Exemplo n.º 6
0
 def test_detector(self):
     """Feed a sample file line by line and expect shift_jis."""
     sample_path = "tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
     sniffer = cchardet.UniversalDetector()
     with open(sample_path, 'rb') as sample:
         for raw_line in sample:
             sniffer.feed(raw_line)
             if sniffer.done:
                 break
     sniffer.close()

     wanted = "shift_jis"
     found = sniffer.result['encoding'].lower()
     eq_(wanted, found, 'Expected %s, but got %s' % (wanted, found))
Exemplo n.º 7
0
def detect_encoding(bytesio: io.BytesIO):
    """
    Return the charset chardet detects for the given stream.

    Behaviour:

    * Data is read in chunks of ``settings.CHARDET_CHUNK_SIZE``
    * Reading stops at EOF or as soon as the detector's ``done`` flag is set
    * The stream is rewound to offset 0 before returning
    * The detector's ``"encoding"`` result is returned unmodified
    """
    sniffer = chardet.UniversalDetector()
    chunk_size = settings.CHARDET_CHUNK_SIZE

    block = bytesio.read(chunk_size)
    while block:
        sniffer.feed(block)
        if sniffer.done:
            break
        block = bytesio.read(chunk_size)

    sniffer.close()
    bytesio.seek(0)
    outcome = sniffer.result
    return outcome["encoding"]