from typing import Union

import magic
from fsspec.spec import AbstractBufferedFile

# NOTE: Schema, InvalidSchema, ParquetSchema, JsonSchema, TextSchema, EmptySchema,
# supported_text_types and get_path are assumed to be defined/imported elsewhere in this module.


def from_file(file: AbstractBufferedFile, options: dict = {}) -> Union[Schema, InvalidSchema]:
    sample_size = 10000
    sample = file.read(sample_size)
    file_type = magic.from_buffer(sample)  # file magic is not great; find a way to replace it

    # TODO: Remove this hack for Debian https://github.com/ahupp/python-magic/issues/208
    path_str = get_path(file).path_str
    if path_str.endswith(".csv") and file_type == "ASCII text":
        file_type = "CSV text"
    # TODO: Remove this hack for Ubuntu
    elif (path_str.endswith(".json") or path_str.endswith(".jsonl")) and file_type == "ASCII text":
        file_type = "JSON data"

    file.seek(0)
    path = get_path(file)
    if file_type == "Apache Parquet":
        return ParquetSchema.from_file(file, path)
    elif file_type == "JSON data":
        return JsonSchema.from_file(path)
    elif file_type in supported_text_types:
        return TextSchema.from_file(file_type, sample, path, options.get("read_headers"))
    elif file_type == "empty":
        return EmptySchema()
    else:
        return InvalidSchema(
            f"File type not supported for file {path.path_str}. Type: {file_type}"
        )
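# Usage sketch (illustration only, not part of the module API): infer a schema from a
# local CSV opened through fsspec. The path "data/example.csv" is hypothetical, and this
# assumes get_path() can resolve a path from the file object returned by fsspec's
# local filesystem.
def _example_from_file():
    import fsspec

    fs = fsspec.filesystem("file")
    with fs.open("data/example.csv", "rb") as f:  # hypothetical path
        schema = from_file(f, options={"read_headers": True})
    if isinstance(schema, InvalidSchema):
        raise ValueError("could not infer a schema for data/example.csv")
    return schema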
def header_length(file: AbstractBufferedFile) -> int:
    """Return the largest comma-separated field count among the first four lines."""
    head = [file.readline().decode("utf-8") for _ in range(4)]
    file.seek(0)
    return max(len(line.split(",")) for line in head)
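# Usage sketch (illustration only): header_length() only needs readline() and seek(),
# so any binary file-like object works. fsspec's in-memory filesystem is used here
# purely to keep the example self-contained; the path is made up.
def _example_header_length():
    import fsspec

    fs = fsspec.filesystem("memory")
    with fs.open("/example.csv", "wb") as f:
        f.write(b"a,b,c\n1,2,3\n4,5\n6\n")
    with fs.open("/example.csv", "rb") as f:
        # The widest of the first four lines has three comma-separated fields.
        assert header_length(f) == 3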
def test_trim_kwarg_warns():
    fs = DummyTestFS()
    with pytest.warns(FutureWarning, match="cache_options"):
        AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False)
def test_cache_options():
    fs = DummyTestFS()
    f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes")
    assert f.cache.trim

    # TODO: dummy buffered file
    f = AbstractBufferedFile(
        fs, "misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False)
    )
    assert f.cache.trim is False

    f = fs.open("misc/foo.txt", cache_type="bytes", cache_options=dict(trim=False))
    assert f.cache.trim is False