def attempt_parse(quoting: HintQuoting) -> PartialRecordsHints:
    """Try streaming the file as CSV with the given quoting hint.

    ``fileobj`` and ``streaming_hints`` are not parameters; they are read
    from the enclosing scope.  A copy of ``streaming_hints`` with its
    ``'quoting'`` entry replaced is handed to ``stream_csv`` together with
    a rewound view of ``fileobj``.

    Returns:
        ``{'quoting': quoting}`` once the ``stream_csv`` context has been
        entered.  Nothing is caught here, so any exception raised while
        rewinding or streaming propagates to the caller.
    """
    with rewound_fileobj(fileobj) as fresh_fileobj:
        # Work on a copy so the shared hints are left untouched between
        # attempts.
        trial_hints = streaming_hints.copy()
        trial_hints['quoting'] = quoting
        logger.info(f"Attempting to parse with quoting: {quoting}")
        with stream_csv(fresh_fileobj, trial_hints):
            successful_hint = {'quoting': quoting}
            return successful_hint
def sniff_compression_hint(fileobj: IO[bytes]) -> HintCompression:
    """Guess the compression format of a stream from its leading bytes.

    The first few bytes of the rewound stream are compared against known
    magic numbers.

    Args:
        fileobj: Binary stream to inspect; it is read inside a
            ``rewound_fileobj`` context.

    Returns:
        ``"GZIP"`` or ``"BZIP"`` when the stream starts with the matching
        magic number, otherwise ``None``.
    """
    # Reported through the module logger instead of print() so that library
    # code does not write to stdout.
    logger.info('Sniffing compression')
    with rewound_fileobj(fileobj) as fileobj_rewound:
        # https://stackoverflow.com/a/13044946/9795956
        magic_dict: Dict[bytes, HintCompression] = {
            b"\x1f\x8b\x08": "GZIP",
            b"\x42\x5a\x68": "BZIP",
            # "\x50\x4b\x03\x04": "zip"
        }
        # Only read as many bytes as the longest signature requires.
        max_len = max(len(x) for x in magic_dict)
        file_start = fileobj_rewound.read(max_len)
        for magic, filetype in magic_dict.items():
            if file_start.startswith(magic):
                return filetype
    return None
def rewound_decompressed_fileobj(
        fileobj: IO[bytes],
        compression: HintCompression) -> Iterator[IO[bytes]]:
    """Yield a rewound, decompressed view of ``fileobj``.

    This is a single-yield generator.
    NOTE(review): presumably wrapped with ``contextlib.contextmanager`` where
    it is defined/used -- the decorator is not visible here; confirm.

    Args:
        fileobj: Binary stream holding the (possibly compressed) data.
        compression: ``None`` for uncompressed data, or one of ``'GZIP'``,
            ``'BZIP'`` or ``'LZO'``.

    Yields:
        A binary file object producing the uncompressed bytes.  For
        ``'GZIP'`` and ``'BZIP'`` this is a decompressing wrapper that is
        closed once the consumer is done with it.

    Raises:
        NotImplementedError: if ``compression`` is ``'LZO'``.
    """
    with rewound_fileobj(fileobj) as fileobj_after_rewind:
        if compression is None:
            # Hand out the rewound handle, like every other branch does,
            # rather than the caller's original object.
            yield fileobj_after_rewind
        elif compression == 'GZIP':
            # Closing a GzipFile built from an existing file object releases
            # the decompressor without closing the underlying stream.
            with gzip.GzipFile(mode='rb',
                               fileobj=fileobj_after_rewind) as gzip_fileobj:
                yield gzip_fileobj  # type: ignore
        elif compression == 'LZO':
            # This might be useful to implement this:
            #  https://github.com/ir193/python-lzo/blob/master/lzo.py#L44
            raise NotImplementedError(
                'Records mover does not currently know how '
                'to decompress LZO files for inspection')
        elif compression == 'BZIP':
            # Likewise, BZ2File does not close a file object it was given.
            with bz2.BZ2File(mode='rb',
                             filename=fileobj_after_rewind) as bzip_fileobj:
                yield bzip_fileobj
        else:
            _assert_never(compression)
def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]:
    """Guess the text encoding of a stream using chardet.

    The rewound stream is fed to a ``chardet.UniversalDetector`` in chunks
    of ``HINT_INFERENCE_SAMPLING_SIZE_BYTES`` until the detector reports it
    is done or a short read occurs.

    Args:
        fileobj: Binary stream to sample; it is read inside a
            ``rewound_fileobj`` context.

    Returns:
        The entry of ``hint_encoding_from_chardet`` for the encoding chardet
        reported, or ``None`` (after logging a warning) when the result has
        no ``'encoding'`` key or the reported encoding is not in that
        mapping.
    """
    with rewound_fileobj(fileobj) as rewound:
        detector = chardet.UniversalDetector()
        finished_sampling = False
        while not finished_sampling:
            block = rewound.read(HINT_INFERENCE_SAMPLING_SIZE_BYTES)
            detector.feed(block)
            # Stop once chardet is confident, or when a short read occurs.
            finished_sampling = bool(
                detector.done
                or len(block) < HINT_INFERENCE_SAMPLING_SIZE_BYTES)
        detector.close()
        assert detector.result is not None

        if 'encoding' not in detector.result:
            logger.warning(
                f"Unable to sniff file encoding using chardet: {detector.result}"
            )
            return None

        detected_name = detector.result['encoding']
        if detected_name not in hint_encoding_from_chardet:
            logger.warning("Got unrecognized encoding from chardet "
                           f"sniffing: {detector.result}")
            return None
        return hint_encoding_from_chardet[detected_name]