def test_byte_stream_split(use_legacy_dataset):
    # This is only a smoke test.
    arr_float = pa.array(list(map(float, range(100))))
    arr_int = pa.array(list(map(int, range(100))))
    data_float = [arr_float, arr_float]
    table = pa.Table.from_arrays(data_float, names=['a', 'b'])

    # Check with byte_stream_split for both columns.
    _check_roundtrip(table, expected=table, compression="gzip",
                     use_dictionary=False, use_byte_stream_split=True)

    # Check with byte_stream_split for column 'b' and dictionary
    # for column 'a'.
    _check_roundtrip(table, expected=table, compression="gzip",
                     use_dictionary=['a'], use_byte_stream_split=['b'])

    # Check with a collision for both columns.
    _check_roundtrip(table, expected=table, compression="gzip",
                     use_dictionary=['a', 'b'],
                     use_byte_stream_split=['a', 'b'])

    # Check with mixed column types.
    mixed_table = pa.Table.from_arrays([arr_float, arr_int],
                                       names=['a', 'b'])
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=['b'], use_byte_stream_split=['a'])

    # Try to use the wrong data type with the byte_stream_split encoding.
    # This should throw an exception.
    table = pa.Table.from_arrays([arr_int], names=['tmp'])
    with pytest.raises(IOError):
        _check_roundtrip(table, expected=table, use_byte_stream_split=True,
                         use_dictionary=False,
                         use_legacy_dataset=use_legacy_dataset)

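# The options exercised above map directly onto pq.write_table. Below is a
# minimal sketch, illustrative only and not collected by pytest, assuming a
# float64 column (BYTE_STREAM_SPLIT only applies to FLOAT and DOUBLE here).
def _example_byte_stream_split():
    t = pa.table({'x': pa.array(np.random.rand(100))})
    buf = io.BytesIO()
    # Dictionary encoding must be disabled for byte_stream_split to apply.
    pq.write_table(t, buf, use_dictionary=False, use_byte_stream_split=True)
    buf.seek(0)
    return pq.read_table(buf)
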
def test_compression_level(use_legacy_dataset):
    arr = pa.array(list(map(int, range(1000))))
    data = [arr, arr]
    table = pa.Table.from_arrays(data, names=['a', 'b'])

    # Check one compression level.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=1,
                     use_legacy_dataset=use_legacy_dataset)

    # Check another one to make sure that compression_level=1 does not
    # coincide with the default one in Arrow.
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level=5,
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression per column
    _check_roundtrip(table, expected=table,
                     compression={'a': "gzip", 'b': "snappy"},
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression level per column
    _check_roundtrip(table, expected=table, compression="gzip",
                     compression_level={'a': 2, 'b': 3},
                     use_legacy_dataset=use_legacy_dataset)

    # Check that specifying a compression level for a codec which does not
    # allow one results in an error.
    # Uncompressed, snappy, lz4 and lzo do not support specifying a
    # compression level.
    # GZIP (zlib) allows for specifying a compression level but as of
    # zlib version 1.2.11 the valid range is [-1, 9].
    invalid_combinations = [("snappy", 4),
                            ("lz4", 5),
                            ("gzip", -1337),
                            ("None", 444),
                            ("lzo", 14)]
    buf = io.BytesIO()
    for (codec, level) in invalid_combinations:
        with pytest.raises((ValueError, OSError)):
            _write_table(table, buf, compression=codec,
                         compression_level=level)

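# The per-column dicts above are passed straight through to pq.write_table.
# A minimal sketch, illustrative only, assuming the local Arrow build was
# compiled with gzip and snappy support:
def _example_per_column_compression():
    t = pa.table({'a': list(range(1000)), 'b': list(range(1000))})
    buf = io.BytesIO()
    # Codec and level can each be given per column; columns omitted from a
    # dict fall back to the writer defaults.
    pq.write_table(t, buf,
                   compression={'a': 'gzip', 'b': 'snappy'},
                   compression_level={'a': 2})
    buf.seek(0)
    return pq.read_table(buf)
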
def test_writing_empty_lists():
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    arr1 = pa.array([[], []], pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr1], ['list(int32)'])
    _check_roundtrip(table)

def test_empty_lists_table_roundtrip(use_legacy_dataset):
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    arr = pa.array([[], []], type=pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr], ["A"])
    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)

def test_parquet_version_timestamp_differences():
    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000

    d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
    d_ms = d_s * 1000
    d_us = d_ms * 1000
    d_ns = d_us * 1000

    a_s = pa.array(d_s, type=pa.timestamp('s'))
    a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
    a_us = pa.array(d_us, type=pa.timestamp('us'))
    a_ns = pa.array(d_ns, type=pa.timestamp('ns'))

    names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

    # Using Parquet version 1.0, seconds should be coerced to milliseconds
    # and nanoseconds should be coerced to microseconds by default
    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
    _check_roundtrip(table, expected)

    # Using Parquet version 2.6, seconds should be coerced to milliseconds
    # and nanoseconds should be retained by default
    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
    _check_roundtrip(table, expected, version='2.6')

    # Using Parquet version 1.0, coercing to milliseconds or microseconds
    # is allowed
    expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
    _check_roundtrip(table, expected, coerce_timestamps='ms')

    # Using Parquet version 2.6, coercing to milliseconds or microseconds
    # is allowed
    expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
    _check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')

    # TODO: after pyarrow allows coerce_timestamps='ns', tests like the
    # following should pass ...

    # Using Parquet version 1.0, coercing to nanoseconds is not allowed
    # expected = None
    # with pytest.raises(NotImplementedError):
    #     _roundtrip_table(table, coerce_timestamps='ns')

    # Using Parquet version 2.6, coercing to nanoseconds is allowed
    # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    # _check_roundtrip(table, expected, version='2.6',
    #                  coerce_timestamps='ns')

    # For either Parquet version, coercing to nanoseconds is allowed
    # if Int96 storage is used
    expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    _check_roundtrip(table, expected,
                     use_deprecated_int96_timestamps=True)
    _check_roundtrip(table, expected, version='2.6',
                     use_deprecated_int96_timestamps=True)

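# The coercion rules above come from pq.write_table's `version` and
# `coerce_timestamps` arguments. A minimal sketch, illustrative only:
# coercing nanoseconds down to milliseconds truncates sub-millisecond
# detail, so `allow_truncated_timestamps=True` is required to suppress the
# resulting error.
def _example_timestamp_coercion():
    t = pa.table({'ts': pa.array([1, 1000, 2000], type=pa.timestamp('ns'))})
    buf = io.BytesIO()
    pq.write_table(t, buf, coerce_timestamps='ms',
                   allow_truncated_timestamps=True)
    buf.seek(0)
    # The column reads back with millisecond resolution.
    return pq.read_table(buf)
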
def test_timestamp_restore_timezone():
    # ARROW-5888, restore timezone from serialized metadata
    ty = pa.timestamp('ms', tz='America/New_York')
    arr = pa.array([1, 2, 3], type=ty)
    t = pa.table([arr], names=['f0'])
    _check_roundtrip(t)

def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]', 'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]', 'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2],
                                 ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

def test_column_encoding(use_legacy_dataset):
    arr_float = pa.array(list(map(float, range(100))))
    arr_int = pa.array(list(map(int, range(100))))
    mixed_table = pa.Table.from_arrays([arr_float, arr_int],
                                       names=['a', 'b'])

    # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding
    # for column 'b'.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding={'a': "BYTE_STREAM_SPLIT",
                                      'b': "PLAIN"},
                     use_legacy_dataset=use_legacy_dataset)

    # Check "PLAIN" for all columns.
    _check_roundtrip(mixed_table, expected=mixed_table,
                     use_dictionary=False,
                     column_encoding="PLAIN",
                     use_legacy_dataset=use_legacy_dataset)

    # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'.
    # This should throw an error as it only supports FLOAT and DOUBLE.
    with pytest.raises(IOError,
                       match="BYTE_STREAM_SPLIT only supports FLOAT and"
                             " DOUBLE"):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'b': "BYTE_STREAM_SPLIT"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass "DELTA_BINARY_PACKED".
    # This should throw an error as it is only supported for reading.
    with pytest.raises(IOError,
                       match="Not yet implemented: Selected encoding is"
                             " not supported."):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'b': "DELTA_BINARY_PACKED"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass "RLE_DICTIONARY".
    # This should throw an error as dictionary encoding is already used by
    # default and cannot be specified as a "fallback" encoding.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding="RLE_DICTIONARY",
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass an unsupported encoding.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "MADE_UP_ENCODING"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_dictionary.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=['b'],
                         column_encoding={'b': "PLAIN"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_dictionary=True (default value).
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         column_encoding={'b': "PLAIN"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_byte_stream_split on same column.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=['a'],
                         column_encoding={'a': "RLE",
                                          'b': "BYTE_STREAM_SPLIT"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_byte_stream_split=True.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=True,
                         column_encoding={'a': "RLE",
                                          'b': "BYTE_STREAM_SPLIT"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding=True.
    # This should throw an error.
    with pytest.raises(TypeError):
        _check_roundtrip(mixed_table, expected=mixed_table,
                         use_dictionary=False,
                         column_encoding=True,
                         use_legacy_dataset=use_legacy_dataset)

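# `column_encoding` is the more general spelling of `use_byte_stream_split`
# and friends, and, as the cases above show, it is mutually exclusive with
# `use_dictionary` and `use_byte_stream_split`. A minimal sketch,
# illustrative only:
def _example_column_encoding():
    t = pa.table({'a': pa.array(np.random.rand(100)),
                  'b': pa.array(list(range(100)))})
    buf = io.BytesIO()
    # Dictionary encoding must be disabled before per-column encodings can
    # be specified.
    pq.write_table(t, buf, use_dictionary=False,
                   column_encoding={'a': 'BYTE_STREAM_SPLIT', 'b': 'PLAIN'})
    buf.seek(0)
    return pq.read_table(buf)
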
def test_empty_table_no_columns(use_legacy_dataset):
    df = pd.DataFrame()
    empty = pa.Table.from_pandas(df, preserve_index=False)
    _check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset)