def test_batch_lifetime(self):
    gc.collect()
    old_allocated = pa.total_allocated_bytes()

    # Memory occupation should not grow with CSV file size
    def check_one_batch(reader, expected):
        batch = reader.read_next_batch()
        assert batch.to_pydict() == expected

    rows = b"10,11\n12,13\n14,15\n16,17\n"
    read_options = ReadOptions()
    read_options.column_names = ['a', 'b']
    read_options.block_size = 6
    reader = self.open_bytes(rows, read_options=read_options)
    check_one_batch(reader, {'a': [10], 'b': [11]})
    allocated_after_first_batch = pa.total_allocated_bytes()
    check_one_batch(reader, {'a': [12], 'b': [13]})
    assert pa.total_allocated_bytes() == allocated_after_first_batch
    check_one_batch(reader, {'a': [14], 'b': [15]})
    assert pa.total_allocated_bytes() == allocated_after_first_batch
    check_one_batch(reader, {'a': [16], 'b': [17]})
    assert pa.total_allocated_bytes() == allocated_after_first_batch
    with pytest.raises(StopIteration):
        reader.read_next_batch()
    assert pa.total_allocated_bytes() == old_allocated
    reader = None
    assert pa.total_allocated_bytes() == old_allocated
def test_encoding(self):
    # latin-1 (invalid utf-8)
    rows = b"a,b\nun,\xe9l\xe9phant"
    read_options = ReadOptions()
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('a', pa.string()),
                                 ('b', pa.binary())])
    self.check_reader(reader, expected_schema,
                      [{'a': ["un"],
                        'b': [b"\xe9l\xe9phant"]}])

    read_options.encoding = 'latin1'
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('a', pa.string()),
                                 ('b', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'a': ["un"],
                        'b': ["éléphant"]}])

    # utf-16
    rows = (b'\xff\xfea\x00,\x00b\x00\n\x00u\x00n\x00,'
            b'\x00\xe9\x00l\x00\xe9\x00p\x00h\x00a\x00n\x00t\x00')
    read_options.encoding = 'utf16'
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('a', pa.string()),
                                 ('b', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'a': ["un"],
                        'b': ["éléphant"]}])
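# A minimal standalone sketch of the latin-1 case above, without the test
# fixtures; assumes a pyarrow version where ReadOptions accepts the
# `encoding` argument.
import io

from pyarrow.csv import read_csv, ReadOptions

buf = io.BytesIO(b"a,b\nun,\xe9l\xe9phant")
table = read_csv(buf, read_options=ReadOptions(encoding='latin1'))
assert table.to_pydict() == {'a': ['un'], 'b': ['éléphant']}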
def test_inference_failure(self):
    # Inference on first block, then conversion failure on second block
    rows = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
    read_options = ReadOptions()
    read_options.block_size = len(rows) - 7
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('a', pa.int64()),
                                 ('b', pa.int64())])
    assert reader.schema == expected_schema
    assert reader.read_next_batch().to_pydict() == {'a': [123], 'b': [456]}
    # Second block
    with pytest.raises(ValueError, match="CSV conversion error to int64"):
        reader.read_next_batch()
    # EOF
    with pytest.raises(StopIteration):
        reader.read_next_batch()

    # Inference on first block, then conversion failure on second block,
    # then success on third block
    rows = b"a,b\n1,2\nabc,def\n45,67\n"
    read_options.block_size = 8
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('a', pa.int64()),
                                 ('b', pa.int64())])
    assert reader.schema == expected_schema
    assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
    # Second block
    with pytest.raises(ValueError, match="CSV conversion error to int64"):
        reader.read_next_batch()
    # Third block
    assert reader.read_next_batch().to_pydict() == {'a': [45], 'b': [67]}
    # EOF
    with pytest.raises(StopIteration):
        reader.read_next_batch()
def test_header_column_names(self):
    rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

    opts = ReadOptions()
    opts.column_names = ["x", "y"]
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["x", "y"])
    assert table.to_pydict() == {
        "x": ["ab", "ef", "ij", "mn"],
        "y": ["cd", "gh", "kl", "op"],
    }

    opts.skip_rows = 3
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["x", "y"])
    assert table.to_pydict() == {
        "x": ["mn"],
        "y": ["op"],
    }

    opts.skip_rows = 4
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["x", "y"])
    assert table.to_pydict() == {
        "x": [],
        "y": [],
    }

    opts.skip_rows = 5
    with pytest.raises(pa.ArrowInvalid):
        # Not enough rows
        table = self.read_bytes(rows, read_options=opts)

    # Unexpected number of columns
    opts.skip_rows = 0
    opts.column_names = ["x", "y", "z"]
    with pytest.raises(pa.ArrowInvalid,
                       match="Expected 3 columns, got 2"):
        table = self.read_bytes(rows, read_options=opts)

    # Can skip rows with a different number of columns
    rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
    opts.skip_rows = 2
    opts.column_names = ["x", "y"]
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["x", "y"])
    assert table.to_pydict() == {
        "x": ["ij", "mn"],
        "y": ["kl", "op"],
    }
def parse_green_taxi_csv(fobj):
    """
    Parse a binary file object of cleaned "green taxi" CSV data as
    returned by the "read_green_taxi_csv" function, and return a
    PyArrow table.
    """
    convert_options = ConvertOptions(
        column_types=SCHEMA,
        false_values=['N'],
        null_values=[''],
        timestamp_parsers=['%Y-%m-%d %H:%M:%S'],
        true_values=['Y'],
    )
    parse_options = ParseOptions(quote_char=False)
    read_options = ReadOptions(
        column_names=SCHEMA.names,
        encoding=ENCODING,
    )
    return read_csv(
        fobj,
        convert_options=convert_options,
        parse_options=parse_options,
        read_options=read_options,
    )
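# A hedged usage sketch for parse_green_taxi_csv: the file name is
# hypothetical, and read_green_taxi_csv is assumed (per the docstring
# above) to return a binary file object of cleaned CSV data.
def example_parse_green_taxi():
    fobj = read_green_taxi_csv('green_tripdata_2019-01.csv')  # hypothetical path
    table = parse_green_taxi_csv(fobj)
    print(table.num_rows, table.schema)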
def test_invalid_csv(self):
    # CSV errors on first block
    rows = b"a,b\n1,2,3\n4,5\n6,7\n"
    read_options = ReadOptions()
    read_options.block_size = 10
    with pytest.raises(pa.ArrowInvalid,
                       match="Expected 2 columns, got 3"):
        reader = self.open_bytes(rows, read_options=read_options)

    # CSV errors on second block
    rows = b"a,b\n1,2\n3,4,5\n6,7\n"
    read_options.block_size = 8
    reader = self.open_bytes(rows, read_options=read_options)
    assert reader.read_next_batch().to_pydict() == {'a': [1], 'b': [2]}
    with pytest.raises(pa.ArrowInvalid,
                       match="Expected 2 columns, got 3"):
        reader.read_next_batch()
    # Cannot continue after a parse error
    with pytest.raises(StopIteration):
        reader.read_next_batch()
def test_include_missing_columns(self):
    rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"
    read_options = ReadOptions()
    convert_options = ConvertOptions()
    convert_options.include_columns = ['xx', 'ab', 'yy']
    convert_options.include_missing_columns = True
    table = self.read_bytes(rows, read_options=read_options,
                            convert_options=convert_options)
    schema = pa.schema([('xx', pa.null()),
                        ('ab', pa.string()),
                        ('yy', pa.null())])
    assert table.schema == schema
    assert table.to_pydict() == {
        "xx": [None, None, None],
        "ab": ["ef", "ij", "mn"],
        "yy": [None, None, None],
    }

    # Combining with `column_names`
    read_options.column_names = ["xx", "yy"]
    convert_options.include_columns = ["yy", "cd"]
    table = self.read_bytes(rows, read_options=read_options,
                            convert_options=convert_options)
    schema = pa.schema([('yy', pa.string()),
                        ('cd', pa.null())])
    assert table.schema == schema
    assert table.to_pydict() == {
        "yy": ["cd", "gh", "kl", "op"],
        "cd": [None, None, None, None],
    }

    # And with `column_types` as well
    convert_options.column_types = {"yy": pa.binary(),
                                    "cd": pa.int32()}
    table = self.read_bytes(rows, read_options=read_options,
                            convert_options=convert_options)
    schema = pa.schema([('yy', pa.binary()),
                        ('cd', pa.int32())])
    assert table.schema == schema
    assert table.to_pydict() == {
        "yy": [b"cd", b"gh", b"kl", b"op"],
        "cd": [None, None, None, None],
    }
def test_inference(self):
    # Inference is done on first block
    rows = b"a,b\n123,456\nabc,de\xff\ngh,ij\n"
    expected_schema = pa.schema([('a', pa.string()),
                                 ('b', pa.binary())])

    read_options = ReadOptions()
    read_options.block_size = len(rows)
    reader = self.open_bytes(rows, read_options=read_options)
    self.check_reader(reader, expected_schema,
                      [{'a': ['123', 'abc', 'gh'],
                        'b': [b'456', b'de\xff', b'ij']}])

    read_options.block_size = len(rows) - 1
    reader = self.open_bytes(rows, read_options=read_options)
    self.check_reader(reader, expected_schema,
                      [{'a': ['123', 'abc'],
                        'b': [b'456', b'de\xff']},
                       {'a': ['gh'],
                        'b': [b'ij']}])
def test_header_skip_rows(self):
    rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

    opts = ReadOptions()
    opts.skip_rows = 1
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["ef", "gh"])
    assert table.to_pydict() == {
        "ef": ["ij", "mn"],
        "gh": ["kl", "op"],
    }

    opts.skip_rows = 3
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["mn", "op"])
    assert table.to_pydict() == {
        "mn": [],
        "op": [],
    }

    opts.skip_rows = 4
    with pytest.raises(pa.ArrowInvalid):
        # Not enough rows
        table = self.read_bytes(rows, read_options=opts)

    # Can skip rows with a different number of columns
    rows = b"abcd\n,,,,,\nij,kl\nmn,op\n"
    opts.skip_rows = 2
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["ij", "kl"])
    assert table.to_pydict() == {
        "ij": ["mn"],
        "kl": ["op"],
    }
def test_stress_block_sizes(self):
    # Test a number of small block sizes to stress block stitching
    csv_base, expected = make_random_csv(num_cols=2, num_rows=500)
    block_sizes = [11, 12, 13, 17, 37, 111]
    csvs = [csv_base, csv_base.rstrip(b'\r\n')]
    for csv in csvs:
        for block_size in block_sizes:
            read_options = ReadOptions(block_size=block_size)
            table = self.read_bytes(csv, read_options=read_options)
            assert table.schema == expected.schema
            if not table.equals(expected):
                # Better error output
                assert table.to_pydict() == expected.to_pydict()
def test_column_types_with_column_names(self):
    # When both `column_names` and `column_types` are given, names
    # in `column_types` should refer to names in `column_names`
    rows = b"a,b\nc,d\ne,f\n"
    read_options = ReadOptions(column_names=['x', 'y'])
    convert_options = ConvertOptions(column_types={'x': pa.binary()})
    table = self.read_bytes(rows, read_options=read_options,
                            convert_options=convert_options)
    schema = pa.schema([('x', pa.binary()),
                        ('y', pa.string())])
    assert table.schema == schema
    assert table.to_pydict() == {
        'x': [b'a', b'c', b'e'],
        'y': ['b', 'd', 'f'],
    }
def test_header_autogenerate_column_names(self):
    rows = b"ab,cd\nef,gh\nij,kl\nmn,op\n"

    opts = ReadOptions()
    opts.autogenerate_column_names = True
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["f0", "f1"])
    assert table.to_pydict() == {
        "f0": ["ab", "ef", "ij", "mn"],
        "f1": ["cd", "gh", "kl", "op"],
    }

    opts.skip_rows = 3
    table = self.read_bytes(rows, read_options=opts)
    self.check_names(table, ["f0", "f1"])
    assert table.to_pydict() == {
        "f0": ["mn"],
        "f1": ["op"],
    }

    # Not enough rows, impossible to infer number of columns
    opts.skip_rows = 4
    with pytest.raises(pa.ArrowInvalid):
        table = self.read_bytes(rows, read_options=opts)
def test_stress_block_sizes(self):
    # Test a number of small block sizes to stress block stitching
    csv_base, expected = make_random_csv(num_cols=2, num_rows=500)
    block_sizes = [19, 21, 23, 26, 37, 111]
    csvs = [csv_base, csv_base.rstrip(b'\r\n')]
    for csv in csvs:
        for block_size in block_sizes:
            # Need at least two lines for type inference
            assert csv[:block_size].count(b'\n') >= 2
            read_options = ReadOptions(block_size=block_size)
            reader = self.open_bytes(csv, read_options=read_options)
            table = reader.read_all()
            assert table.schema == expected.schema
            if not table.equals(expected):
                # Better error output
                assert table.to_pydict() == expected.to_pydict()
def test_write_read_round_trip():
    t = pa.Table.from_arrays([[1, 2, 3], ["a", "b", "c"]], ["c1", "c2"])
    record_batch = t.to_batches(max_chunksize=4)[0]
    for data in [t, record_batch]:
        # Test with header
        buf = io.BytesIO()
        write_csv(data, buf, WriteOptions(include_header=True))
        buf.seek(0)
        assert t == read_csv(buf)

        # Test without header
        buf = io.BytesIO()
        write_csv(data, buf, WriteOptions(include_header=False))
        buf.seek(0)
        read_options = ReadOptions(column_names=t.column_names)
        assert t == read_csv(buf, read_options=read_options)
def test_empty_lines(self):
    rows = b"a,b\n\r1,2\r\n\r\n3,4\r\n"
    table = self.read_bytes(rows)
    assert table.to_pydict() == {
        'a': [1, 3],
        'b': [2, 4],
    }

    parse_options = ParseOptions(ignore_empty_lines=False)
    table = self.read_bytes(rows, parse_options=parse_options)
    assert table.to_pydict() == {
        'a': [None, 1, None, 3],
        'b': [None, 2, None, 4],
    }

    read_options = ReadOptions(skip_rows=2)
    table = self.read_bytes(rows, parse_options=parse_options,
                            read_options=read_options)
    assert table.to_pydict() == {
        '1': [None, 3],
        '2': [None, 4],
    }
def read(self, env: CylonEnv, table, relevant_cols=None, **kwargs) -> DataFrame:
    filepath = self.table_path_mapping[table].replace('$TABLE', table)
    names, _ = get_schema(table)

    # csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    #     .with_delimiter('|')
    read_opts = ReadOptions(column_names=names, block_size=(1 << 30))
    parse_opts = ParseOptions(delimiter='|')
    convert_opts = ConvertOptions(include_columns=relevant_cols)

    # If the table is in the refresh_tables list, read that table too and concat.
    # NOTE: refresh tables have the same parallelism as their data tables
    if table in REFRESH_TABLES:
        data_table = pa_read_csv(filepath, read_options=read_opts,
                                 parse_options=parse_opts,
                                 convert_options=convert_opts)
        refresh_path = filepath.replace('/data/', '/data_refresh/')
        refresh_table = pa_read_csv(refresh_path, read_options=read_opts,
                                    parse_options=parse_opts,
                                    convert_options=convert_opts)
        pa_table = pa_concat_tables([data_table, refresh_table])
    else:
        pa_table = pa_read_csv(filepath, read_options=read_opts,
                               parse_options=parse_opts,
                               convert_options=convert_opts)

    return DataFrame(Table.from_arrow(env.context, pa_table))
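# For context, a minimal self-contained sketch of the pyarrow calls the
# reader above relies on; the column names and file contents are made up
# for illustration, and only documented pyarrow.csv APIs are used.
import io

import pyarrow.csv as pacsv
from pyarrow import concat_tables

data = io.BytesIO(b"1|foo\n2|bar\n")   # stands in for a headerless data table
refresh = io.BytesIO(b"3|baz\n")       # stands in for its refresh counterpart

read_opts = pacsv.ReadOptions(column_names=['id', 'name'],
                              block_size=(1 << 20))
parse_opts = pacsv.ParseOptions(delimiter='|')
convert_opts = pacsv.ConvertOptions(include_columns=['id', 'name'])

tables = [pacsv.read_csv(buf, read_options=read_opts,
                         parse_options=parse_opts,
                         convert_options=convert_opts)
          for buf in (data, refresh)]
combined = concat_tables(tables)  # same schema, so concatenation is safe
assert combined.num_rows == 3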
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=None,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=b".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    dialect=None,
    error_bad_lines=True,
    warn_bad_lines=True,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):
    items = locals().copy()
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        if isinstance(dtype, dict):
            column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
        else:
            column_types = cls._dtype_to_arrow(dtype)

        if (type(parse_dates) is list) and type(column_types) is dict:
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        if names:
            if header == 0:
                skiprows = skiprows + 1 if skiprows is not None else 1
            elif header is None or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine and provided 'names' "
                    "parameter supports only 0, None and 'infer' header values"
                )
        else:
            if header == 0 or header == "infer":
                pass
            else:
                raise NotImplementedError(
                    "read_csv with 'arrow' engine without 'names' parameter "
                    "provided supports only 0 and 'infer' header values"
                )

        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != ",":
            raise ValueError(
                "Specified a delimiter and delim_whitespace=True; "
                "you can only specify one."
            )

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        po = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        co = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # timestamp fields should be handled as strings if parse_dates
            # wasn't passed explicitly as an array or a dict
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        ro = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names,
            autogenerate_column_names=None,
        )

        at = read_csv(
            filepath_or_buffer,
            read_options=ro,
            parse_options=po,
            convert_options=co,
        )

        return cls.from_arrow(at)
    except (pa.ArrowNotImplementedError, NotImplementedError):
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)
def read_csv(self, *args, **kwargs):
    read_options = kwargs.setdefault('read_options', ReadOptions())
    read_options.use_threads = True
    table = read_csv(*args, **kwargs)
    table.validate(full=True)
    return table
def open_csv(self, *args, **kwargs):
    read_options = kwargs.setdefault('read_options', ReadOptions())
    read_options.use_threads = False
    return open_csv(*args, **kwargs)
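# Minimal standalone example of the streaming reader wrapped by the
# helpers above (pure pyarrow, no test fixtures assumed): a small
# block_size forces multiple batches, and use_threads=False keeps the
# batch boundaries deterministic, mirroring open_csv() above.
import io

from pyarrow.csv import open_csv, ReadOptions

buf = io.BytesIO(b"a,b\n1,2\n3,4\n")
reader = open_csv(buf, read_options=ReadOptions(use_threads=False,
                                                block_size=8))
while True:
    try:
        batch = reader.read_next_batch()
    except StopIteration:
        break
    print(batch.to_pydict())  # {'a': [1], 'b': [2]} then {'a': [3], 'b': [4]}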
def read_csv(
    cls,
    filepath_or_buffer,
    sep=",",
    delimiter=None,
    header="infer",
    names=lib.no_default,
    index_col=None,
    usecols=None,
    squeeze=False,
    prefix=lib.no_default,
    mangle_dupe_cols=True,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=False,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression="infer",
    thousands=None,
    decimal=".",
    lineterminator=None,
    quotechar='"',
    quoting=0,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors="strict",
    dialect=None,
    error_bad_lines=None,
    warn_bad_lines=None,
    on_bad_lines=None,
    skipfooter=0,
    doublequote=True,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options=None,
):  # noqa: PR01
    """
    Read data from `filepath_or_buffer` according to the passed `kwargs` parameters.

    For a description of the parameters, please refer to the pandas API.

    Returns
    -------
    BaseQueryCompiler
        Query compiler with imported data for further processing.

    Notes
    -----
    Reading is performed with the `pyarrow.read_csv` function.
    """
    items = locals().copy()
    mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
    eng = str(engine).lower().strip()
    try:
        if eng in ["pandas", "c"]:
            return cls._read(**mykwargs)

        cls._validate_read_csv_kwargs(mykwargs)
        use_modin_impl, error_message = cls._read_csv_check_support(
            mykwargs,
        )
        if not use_modin_impl:
            raise ArrowEngineException(error_message)

        if isinstance(dtype, dict):
            column_types = {c: cls._dtype_to_arrow(t) for c, t in dtype.items()}
        else:
            column_types = cls._dtype_to_arrow(dtype)

        if (type(parse_dates) is list) and type(column_types) is dict:
            for c in parse_dates:
                column_types[c] = pa.timestamp("s")

        if names not in [lib.no_default, None] and header == 0:
            skiprows = skiprows + 1 if skiprows is not None else 1

        if delimiter is None and sep is not lib.no_default:
            delimiter = sep

        usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

        po = ParseOptions(
            delimiter="\\s+" if delim_whitespace else delimiter,
            quote_char=quotechar,
            double_quote=doublequote,
            escape_char=escapechar,
            newlines_in_values=False,
            ignore_empty_lines=skip_blank_lines,
        )
        co = ConvertOptions(
            check_utf8=None,
            column_types=column_types,
            null_values=None,
            true_values=None,
            false_values=None,
            # timestamp fields should be handled as strings if parse_dates
            # wasn't passed explicitly as an array or a dict
            timestamp_parsers=[""] if isinstance(parse_dates, bool) else None,
            strings_can_be_null=None,
            include_columns=usecols_md,
            include_missing_columns=None,
            auto_dict_encode=None,
            auto_dict_max_cardinality=None,
        )
        ro = ReadOptions(
            use_threads=True,
            block_size=None,
            skip_rows=skiprows,
            column_names=names if names is not lib.no_default else None,
            autogenerate_column_names=None,
        )

        at = read_csv(
            filepath_or_buffer,
            read_options=ro,
            parse_options=po,
            convert_options=co,
        )

        return cls.from_arrow(at)
    except (
        pa.ArrowNotImplementedError,
        pa.ArrowInvalid,
        NotImplementedError,
        ArrowEngineException,
    ):
        if eng in ["arrow"]:
            raise

        ErrorMessage.default_to_pandas("`read_csv`")
        return cls._read(**mykwargs)
def test_column_options(self):
    # With column_names
    rows = b"1,2,3\n4,5,6"
    read_options = ReadOptions()
    read_options.column_names = ['d', 'e', 'f']
    reader = self.open_bytes(rows, read_options=read_options)
    expected_schema = pa.schema([('d', pa.int64()),
                                 ('e', pa.int64()),
                                 ('f', pa.int64())])
    self.check_reader(reader, expected_schema,
                      [{'d': [1, 4],
                        'e': [2, 5],
                        'f': [3, 6]}])

    # With include_columns
    convert_options = ConvertOptions()
    convert_options.include_columns = ['f', 'e']
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('f', pa.int64()),
                                 ('e', pa.int64())])
    self.check_reader(reader, expected_schema,
                      [{'e': [2, 5],
                        'f': [3, 6]}])

    # With column_types
    convert_options.column_types = {'e': pa.string()}
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('f', pa.int64()),
                                 ('e', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'e': ["2", "5"],
                        'f': [3, 6]}])

    # Missing columns in include_columns
    convert_options.include_columns = ['g', 'f', 'e']
    with pytest.raises(KeyError,
                       match="Column 'g' in include_columns does not exist"):
        reader = self.open_bytes(rows, read_options=read_options,
                                 convert_options=convert_options)

    convert_options.include_missing_columns = True
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('g', pa.null()),
                                 ('f', pa.int64()),
                                 ('e', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'g': [None, None],
                        'e': ["2", "5"],
                        'f': [3, 6]}])

    convert_options.column_types = {'e': pa.string(), 'g': pa.float64()}
    reader = self.open_bytes(rows, read_options=read_options,
                             convert_options=convert_options)
    expected_schema = pa.schema([('g', pa.float64()),
                                 ('f', pa.int64()),
                                 ('e', pa.string())])
    self.check_reader(reader, expected_schema,
                      [{'g': [None, None],
                        'e': ["2", "5"],
                        'f': [3, 6]}])