Пример #1
0
 def attempt_parse(quoting: HintQuoting) -> PartialRecordsHints:
     """Try opening the file as a CSV stream using the given quoting hint.

     A copy of the enclosing scope's ``streaming_hints`` is made with its
     ``'quoting'`` entry set to *quoting*, and ``stream_csv`` is entered
     with it on the handle provided by ``rewound_fileobj``.

     Returns a partial-hints dict holding only ``'quoting'``. Nothing is
     caught here, so any exception raised along the way propagates.
     """
     with rewound_fileobj(fileobj) as rewound:
         # Work on a copy so the shared hints are not mutated by this trial.
         trial_hints = streaming_hints.copy()
         trial_hints['quoting'] = quoting
         logger.info(f"Attempting to parse with quoting: {quoting}")
         with stream_csv(rewound, trial_hints):
             # Entering the stream without an exception is the success
             # signal; the stream's contents are not consumed here.
             outcome = {'quoting': quoting}
         return outcome
Пример #2
0
def sniff_compression_hint(fileobj: IO[bytes]) -> HintCompression:
    """Guess the compression of *fileobj* from its leading magic bytes.

    Reads only as many bytes as the longest known signature, using the
    handle provided by ``rewound_fileobj``.

    Returns ``"GZIP"`` or ``"BZIP"`` when the bytes read start with the
    corresponding signature, otherwise ``None``.
    """
    # Function-scope import keeps this block self-contained; progress is
    # reported through logging instead of print() so stdout stays clean.
    import logging
    logging.getLogger(__name__).info('Sniffing compression')

    with rewound_fileobj(fileobj) as fileobj_rewound:
        # https://stackoverflow.com/a/13044946/9795956
        magic_dict: Dict[bytes, HintCompression] = {
            b"\x1f\x8b\x08": "GZIP",
            b"\x42\x5a\x68": "BZIP",
            # "\x50\x4b\x03\x04": "zip"
        }

        # Read just enough to cover the longest signature.
        max_len = max(len(magic) for magic in magic_dict)
        file_start = fileobj_rewound.read(max_len)

        # A short read (file smaller than a signature) simply fails to
        # match via startswith().
        for magic, filetype in magic_dict.items():
            if file_start.startswith(magic):
                return filetype
        return None
Пример #3
0
def rewound_decompressed_fileobj(
        fileobj: IO[bytes],
        compression: HintCompression) -> Iterator[IO[bytes]]:
    """Yield a binary file object that reads *fileobj* decompressed.

    This is a generator that yields exactly once (presumably wrapped as a
    context manager by a decorator outside this view -- confirm). The
    yielded object is based on the handle provided by ``rewound_fileobj``.

    Args:
        fileobj: The (possibly compressed) binary stream.
        compression: ``None`` for no compression, or ``'GZIP'``,
            ``'BZIP'`` or ``'LZO'``.

    Yields:
        The rewound handle itself when *compression* is ``None``;
        otherwise a ``gzip.GzipFile`` / ``bz2.BZ2File`` wrapping it. The
        wrapper is closed when the generator resumes or is torn down.

    Raises:
        NotImplementedError: If *compression* is ``'LZO'``.
    """
    with rewound_fileobj(fileobj) as fileobj_after_rewind:
        if compression is None:
            # Yield the rewound handle, consistent with the other branches.
            yield fileobj_after_rewind
        elif compression == 'GZIP':
            # Closing the GzipFile releases the decompressor; it does not
            # close the file object that was passed in.
            with gzip.GzipFile(mode='rb',
                               fileobj=fileobj_after_rewind) as gzip_fileobj:  # type: ignore
                yield gzip_fileobj  # type: ignore
        elif compression == 'LZO':
            # This might be useful to implement this:
            #  https://github.com/ir193/python-lzo/blob/master/lzo.py#L44
            raise NotImplementedError(
                'Records mover does not currently know how '
                'to decompress LZO files for inspection')
        elif compression == 'BZIP':
            # BZ2File likewise leaves a caller-supplied file object open
            # when it is closed.
            with bz2.BZ2File(mode='rb',
                             filename=fileobj_after_rewind) as bz2_fileobj:
                yield bz2_fileobj
        else:
            _assert_never(compression)
Пример #4
0
def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]:
    """Guess the text encoding of *fileobj* using chardet.

    The stream (as provided by ``rewound_fileobj``) is fed to a
    ``chardet.UniversalDetector`` in chunks of
    ``HINT_INFERENCE_SAMPLING_SIZE_BYTES`` until the detector reports it is
    done or a read comes back shorter than a full chunk.

    Returns:
        The hint that ``hint_encoding_from_chardet`` maps chardet's answer
        to, or ``None`` (after logging a warning) when chardet produced no
        encoding or one that has no mapping.
    """
    with rewound_fileobj(fileobj) as fileobj:
        detector = chardet.UniversalDetector()
        while True:
            chunk = fileobj.read(HINT_INFERENCE_SAMPLING_SIZE_BYTES)
            detector.feed(chunk)
            # A short chunk is treated as end of input.
            if (detector.done
                    or len(chunk) < HINT_INFERENCE_SAMPLING_SIZE_BYTES):
                break
        detector.close()
        assert detector.result is not None

        # chardet keeps the 'encoding' key present and sets it to None when
        # it cannot decide, so test the value rather than key presence.
        chardet_encoding = detector.result.get('encoding')
        if chardet_encoding in hint_encoding_from_chardet:
            return hint_encoding_from_chardet[chardet_encoding]

        if chardet_encoding is None:
            logger.warning(
                f"Unable to sniff file encoding using chardet: {detector.result}"
            )
        else:
            logger.warning("Got unrecognized encoding from chardet "
                           f"sniffing: {detector.result}")
        return None