def test_decimal_array_with_none_and_nan():
    """None, np.nan and Decimal('nan') must all become nulls in a decimal array."""
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]

    # Type inference picks the narrowest decimal that fits: 4 digits, scale 3.
    inferred = pa.array(values)
    assert inferred.type == pa.decimal128(4, 3)
    assert inferred.to_pylist() == values[:2] + [None, None]

    # An explicit wider type rescales the valid entry and keeps the nulls.
    explicit = pa.array(values, type=pa.decimal128(10, 4))
    assert explicit.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
def test_sequence_decimal_different_precisions():
    """Decimals of differing precision round-trip through one wide decimal type."""
    values = [
        decimal.Decimal('1234234983.183'),
        decimal.Decimal('80943244.234'),
    ]
    arr = pa.array(values, type=pa.decimal128(precision=13, scale=3))
    assert arr.to_pylist() == values
def test_cast_from_null():
    """Casting an all-null array works for supported targets and raises otherwise."""
    in_data = [None] * 3
    in_type = pa.null()

    # Targets for which null -> type casting is implemented.
    supported_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for target in supported_targets:
        _check_cast_case((in_data, in_type, in_data, target))

    # Targets for which the cast is not implemented yet.
    unsupported_targets = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    ]
    null_arr = pa.array(in_data, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_arr.cast(target)
def field(jvm_field):
    """
    Construct a Field from a org.apache.arrow.vector.types.pojo.Field
    instance.

    Parameters
    ----------
    jvm_field: org.apache.arrow.vector.types.pojo.Field

    Returns
    -------
    pyarrow.Field

    Raises
    ------
    NotImplementedError
        For complex JVM types (Struct, List, FixedSizeList, Union,
        Dictionary) and for primitive type IDs with no converter.
    """
    name = jvm_field.getName()
    jvm_type = jvm_field.getType()

    if jvm_type.isComplex():
        # TODO: The following JVM types are not implemented:
        #       Struct, List, FixedSizeList, Union, Dictionary
        raise NotImplementedError(
            "JVM field conversion only implemented for primitive types.")

    # Primitive JVM type-ID string -> converter producing the pyarrow type.
    converters = {
        'Null': lambda t: pa.null(),
        'Int': _from_jvm_int_type,
        'FloatingPoint': _from_jvm_float_type,
        'Utf8': lambda t: pa.string(),
        'Binary': lambda t: pa.binary(),
        'FixedSizeBinary': lambda t: pa.binary(t.getByteWidth()),
        'Bool': lambda t: pa.bool_(),
        'Time': _from_jvm_time_type,
        'Timestamp': _from_jvm_timestamp_type,
        'Date': _from_jvm_date_type,
        'Decimal': lambda t: pa.decimal128(t.getPrecision(), t.getScale()),
    }
    type_str = jvm_type.getTypeID().toString()
    converter = converters.get(type_str)
    if converter is None:
        raise NotImplementedError(
            "Unsupported JVM type: {}".format(type_str))
    typ = converter(jvm_type)

    nullable = jvm_field.isNullable()
    jvm_metadata = jvm_field.getMetadata()
    metadata = None if jvm_metadata.isEmpty() else dict(jvm_metadata)
    return pa.field(name, typ, nullable, metadata)
def test_decimal_64_from_pandas(self):
    """A pandas column of 12-digit decimals converts to decimal128(12, 6)."""
    df = pd.DataFrame({
        'decimals': [
            decimal.Decimal('-129934.123331'),
            decimal.Decimal('129534.123731'),
        ]
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    expected_schema = pa.schema([pa.field('decimals', pa.decimal128(12, 6))])
    assert table.schema.equals(expected_schema)
def test_decimal_128_from_pandas(self):
    """A pandas column of 26-digit decimals converts to decimal128(26, 11)."""
    df = pd.DataFrame({
        'decimals': [
            decimal.Decimal('394092382910493.12341234678'),
            -decimal.Decimal('314292388910493.12343437128'),
        ]
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    expected_schema = pa.schema([pa.field('decimals', pa.decimal128(26, 11))])
    assert table.schema.equals(expected_schema)
def test_bit_width():
    """bit_width is defined for fixed-width types and raises for variable ones."""
    fixed_width_cases = {
        pa.bool_(): 1,
        pa.int8(): 8,
        pa.uint32(): 32,
        pa.float16(): 16,
        pa.decimal128(19, 4): 128,
        pa.binary(42): 42 * 8,
    }
    for ty, expected_bits in fixed_width_cases.items():
        assert ty.bit_width == expected_bits

    # Variable-width types have no meaningful bit width.
    for ty in (pa.binary(), pa.string(), pa.list_(pa.int16())):
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
def test_type_schema_pickling():
    """Types, fields and schemas survive a pickle round trip unchanged."""
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.union([
            pa.field('a', pa.int8()),
            pa.field('b', pa.int16())
        ], pa.lib.UnionMode_SPARSE),
        pa.union([
            pa.field('a', pa.int8()),
            pa.field('b', pa.int16())
        ], pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    def roundtrip(obj):
        # One helper so both loops pickle identically.
        return pickle.loads(pickle.dumps(obj))

    for val in cases:
        assert val == roundtrip(val)

    # Wrap bare types in auto-named fields; keep pre-built fields as-is.
    fields = [
        f if isinstance(f, pa.Field) else pa.field('_f{}'.format(i), f)
        for i, f in enumerate(cases)
    ]
    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    assert schema == roundtrip(schema)
def get_many_types():
    """Return one instance of (almost) every pyarrow DataType for reuse in tests."""
    # returning them from a function is required because of pa.dictionary
    # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        # Same struct twice: once all-nullable, once with non-nullable fields.
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def test_sequence_decimal_different_precisions():
    """A 13-digit decimal type accommodates values of differing precision."""
    expected = [decimal.Decimal('1234234983.183'),
                decimal.Decimal('80943244.234')]
    result = pa.array(expected, type=pa.decimal128(precision=13, scale=3))
    assert result.to_pylist() == expected
def test_sequence_decimal_from_integers():
    """Plain Python ints (including very large ones) fill a decimal128 array."""
    ints = [0, 1, -39402950693754869342983]
    arr = pa.array(ints, type=pa.decimal128(precision=28, scale=5))
    assert arr.to_pylist() == [decimal.Decimal(x) for x in ints]
def test_decimal_overflow():
    """decimal128 accepts precisions 1..38 and rejects values outside that range.

    Bug fix: the loop previously ignored its variable and called
    ``pa.decimal128(39, 0)`` on every iteration, so precisions 0 and -1
    were never actually exercised.
    """
    # Boundary precisions are valid.
    pa.decimal128(1, 0)
    pa.decimal128(38, 0)
    # Zero, negative and too-large precisions must raise.
    for invalid_precision in (0, -1, 39):
        with pytest.raises(ValueError):
            pa.decimal128(invalid_precision, 0)
def test_is_decimal():
    """types.is_decimal recognises decimal128 types and rejects others."""
    decimal_type = pa.decimal128(19, 4)
    assert types.is_decimal(decimal_type)
    assert not types.is_decimal(pa.int32())
def test_sequence_decimal_large_integer():
    """Decimals with large integral parts round-trip through pa.array."""
    values = [decimal.Decimal('-394029506937548693.42983'),
              decimal.Decimal('32358695912932.01033')]
    ty = pa.decimal128(precision=23, scale=5)
    assert pa.array(values, type=ty).to_pylist() == values
(pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",' '"timezone":null}'), (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",' '"timezone":null}'), (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",' '"timezone":null}'), (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",' '"timezone":null}'), (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"' ',"timezone":"UTC"}'), (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",' '"unit":"NANOSECOND","timezone":"Europe/Paris"}'), (pa.date32(), '{"name":"date","unit":"DAY"}'), (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'), (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'), (pa.string(), '{"name":"utf8"}'), (pa.binary(), '{"name":"binary"}'), (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'), # TODO(ARROW-2609): complex types that have children # pa.list_(pa.int32()), # pa.struct([pa.field('a', pa.int32()), # pa.field('b', pa.int8()), # pa.field('c', pa.string())]), # pa.union([pa.field('a', pa.binary(10)), # pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), # pa.union([pa.field('a', pa.binary(10)), # pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), # TODO: DictionaryType requires a vector in the type # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])), ])
def precision(self, value):
    """Set the precision, re-validating it against the current scale.

    NOTE(review): presumably a ``@precision.setter`` on a decimal-type
    wrapper — the decorator is not visible in this chunk; confirm.
    """
    # Validation runs before the type is rebuilt, so an invalid precision
    # leaves self._typ untouched.
    self._validate(value, self.scale)
    self._typ = pa.decimal128(precision=value, scale=self.scale)
def generate_test_parquet():
    """Build a Parquet file exercising (nearly) every Arrow type for OGR tests.

    Writes ogr/data/parquet/test.parquet next to this script, uncompressed,
    with row groups of 3 rows, and attaches "geo" metadata describing the
    WKB "geometry" column.  NOTE(review): relies on a module-level
    ``wkt_epsg_4326`` constant not visible in this chunk — confirm it is
    defined at import time.
    """
    import pyarrow as pa
    import datetime
    import decimal
    import json
    import pandas as pd
    import pathlib
    import pyarrow.parquet as pq
    import struct

    # --- scalar columns; index 2 is null in most of them ---
    boolean = pa.array([True, False, None, False, True], type=pa.bool_())
    uint8 = pa.array([None if i == 2 else 1 + i for i in range(5)],
                     type=pa.uint8())
    int8 = pa.array([None if i == 2 else -2 + i for i in range(5)],
                    type=pa.int8())
    uint16 = pa.array([None if i == 2 else 1 + i * 10000 for i in range(5)],
                      type=pa.uint16())
    int16 = pa.array(
        [None if i == 2 else -20000 + i * 10000 for i in range(5)],
        type=pa.int16())
    uint32 = pa.array(
        [None if i == 2 else 1 + i * 1000000000 for i in range(5)],
        type=pa.uint32())
    int32 = pa.array(
        [None if i == 2 else -2000000000 + i * 1000000000 for i in range(5)],
        type=pa.int32())
    uint64 = pa.array(
        [None if i == 2 else 1 + i * 100000000000 for i in range(5)],
        type=pa.uint64())
    int64 = pa.array([
        None if i == 2 else -200000000000 + i * 100000000000
        for i in range(5)
    ], type=pa.int64())
    float32 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float32())
    float64 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float64())
    string = pa.array(["abcd", "", None, "c", "d"], type=pa.string())
    large_string = pa.array(["abcd", "", None, "c", "d"],
                            type=pa.large_string())

    # --- millisecond timestamps at several fixed UTC offsets ---
    gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2))
    timestamp_ms_gmt_plus_2 = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_plus_2)
    ] * 5, type=pa.timestamp('ms', tz=gmt_plus_2))
    gmt = datetime.timezone(datetime.timedelta(hours=0))
    timestamp_ms_gmt = pa.array([
        pd.Timestamp(
            year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6, tz=gmt)
    ] * 5, type=pa.timestamp('ms', tz=gmt))
    gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25))
    timestamp_ms_gmt_minus_0215 = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_minus_0215)
    ] * 5, type=pa.timestamp('ms', tz=gmt_minus_0215))
    timestamp_s_no_tz = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6)
    ] * 5, type=pa.timestamp('s'))

    # --- time / date / duration columns ---
    time32_s = pa.array([3600 + 120 + 3, None, 3, 4, 5], type=pa.time32('s'))
    time32_ms = pa.array([(3600 + 120 + 3) * 1000 + 456, 2, 3, 4, 5],
                         type=pa.time32('ms'))
    time64_us = pa.array([(3600 + 120 + 3) * 1e6, None, 3, 4, 5],
                         type=pa.time64('us'))
    time64_ns = pa.array([(3600 + 120 + 3) * 1e9 + 456, 2, 3, 4, 5],
                         type=pa.time64('ns'))
    date32 = pa.array([1, 2, 3, 4, 5], type=pa.date32())
    date64 = pa.array([86400 * 1000, 2, 3, 4, 5], type=pa.date64())
    # Durations are built but commented out of the names list below.
    duration_s = pa.array([1, 2, 3, 4, 5], type=pa.duration('s'))
    duration_ms = pa.array([1, 2, 3, 4, 5], type=pa.duration('ms'))

    # --- binary and decimal columns ---
    binary = pa.array([b'\x00\x01'] * 5, type=pa.binary())
    large_binary = pa.array([b'\x00\x01'] * 5, type=pa.large_binary())
    fixed_size_binary = pa.array([b'\x00\x01'] * 5, type=pa.binary(2))
    decimal128 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ], type=pa.decimal128(7, 3))
    decimal256 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ], type=pa.decimal256(7, 3))

    # --- variable-size lists: row 2 is null, first element of each list null ---
    list_boolean = pa.array([
        None if i == 2 else [
            None if j == 0 else True if (j % 2) == 0 else False
            for j in range(i)
        ] for i in range(5)
    ], type=pa.list_(pa.bool_()))
    list_uint8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint8()))
    list_int8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int8()))
    list_uint16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint16()))
    list_int16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int16()))
    list_uint32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint32()))
    list_int32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int32()))
    list_uint64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint64()))
    list_int64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int64()))
    list_float32 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.float32()))
    list_float64 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.float64()))
    list_string = pa.array([
        None if i == 2 else [
            "".join(["%c" % (65 + j + k) for k in range(1 + j)])
            for j in range(i)
        ] for i in range(5)
    ])

    # --- fixed-size (length 2) lists ---
    fixed_size_list_boolean = pa.array(
        [[True, False], [False, True], [True, False], [False, True],
         [True, False]],
        type=pa.list_(pa.bool_(), 2))
    fixed_size_list_uint8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.uint8(), 2))
    fixed_size_list_int8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                    type=pa.list_(pa.int8(), 2))
    fixed_size_list_uint16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint16(), 2))
    fixed_size_list_int16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int16(), 2))
    fixed_size_list_uint32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint32(), 2))
    fixed_size_list_int32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int32(), 2))
    fixed_size_list_uint64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint64(), 2))
    fixed_size_list_int64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int64(), 2))
    fixed_size_list_float32 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float32(), 2))
    fixed_size_list_float64 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float64(), 2))
    fixed_size_list_string = pa.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]],
        type=pa.list_(pa.string(), 2))

    # --- struct column (same value repeated on every row) ---
    struct_field = pa.array([{
        "a": 1,
        "b": 2.5,
        "c": {
            "d": "e",
            "f": "g"
        },
        "h": [5, 6],
        "i": 3
    }] * 5)

    #struct_val = { "a": 5 }
    #for i in range(123):
    #    struct_val = { "a": struct_val }
    #struct_field = pa.array([struct_val] * 5)

    # --- map columns: string keys, value type varies; row 2 is null ---
    map_boolean = pa.array([[('x', None), ('y', True)], [('z', True)], None,
                            [], []],
                           type=pa.map_(pa.string(), pa.bool_()))
    map_uint8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.uint8()))
    map_int8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                        type=pa.map_(pa.string(), pa.int8()))
    map_uint16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint16()))
    map_int16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int16()))
    map_uint32 = pa.array([[('x', 4 * 1000 * 1000 * 1000), ('y', None)],
                           [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint32()))
    map_int32 = pa.array([[('x', 2 * 1000 * 1000 * 1000), ('y', None)],
                          [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int32()))
    map_uint64 = pa.array([[('x', 4 * 1000 * 1000 * 1000 * 1000), ('y', None)],
                           [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint64()))
    map_int64 = pa.array([[('x', -2 * 1000 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int64()))
    map_float32 = pa.array([[('x', 1.5), ('y', None)], [('z', 3)], None, [],
                            []],
                           type=pa.map_(pa.string(), pa.float32()))
    map_float64 = pa.array([[('x', 1.5), ('y', None)], [('z', 3)], None, [],
                            []],
                           type=pa.map_(pa.string(), pa.float64()))
    map_string = pa.array([[('x', 'x_val'), ('y', None)], [('z', 'z_val')],
                           None, [], []],
                          type=pa.map_(pa.string(), pa.string()))

    # --- dictionary-encoded column ---
    indices = pa.array([0, 1, 2, None, 2])
    dictionary = pa.array(['foo', 'bar', 'baz'])
    dict = pa.DictionaryArray.from_arrays(indices, dictionary)

    # map-of-list is built but excluded from names below.
    map_list = pa.array([[('x', []), ('y', [])], [('z', [])], None, [], []],
                        type=pa.map_(pa.string(), pa.list_(pa.uint32())))

    # --- geometry: WKB little-endian points POINT(i 2), row 1 null ---
    geometry = pa.array([
        None if i == 1 else
        (b'\x01\x01\x00\x00\x00' + struct.pack('<dd', i, 2))
        for i in range(5)
    ], type=pa.binary())

    # Column order of the output table; entries are looked up in locals().
    names = [
        "boolean", "uint8", "int8", "uint16", "int16", "uint32", "int32",
        "uint64", "int64", "float32", "float64", "string", "large_string",
        "timestamp_ms_gmt", "timestamp_ms_gmt_plus_2",
        "timestamp_ms_gmt_minus_0215", "timestamp_s_no_tz", "time32_s",
        "time32_ms", "time64_us", "time64_ns", "date32", "date64",
        # "duration_s",
        # "duration_ms",
        "binary", "large_binary", "fixed_size_binary", "decimal128",
        "decimal256", "list_boolean", "list_uint8", "list_int8",
        "list_uint16", "list_int16", "list_uint32", "list_int32",
        "list_uint64", "list_int64", "list_float32", "list_float64",
        "list_string", "fixed_size_list_boolean", "fixed_size_list_uint8",
        "fixed_size_list_int8", "fixed_size_list_uint16",
        "fixed_size_list_int16", "fixed_size_list_uint32",
        "fixed_size_list_int32", "fixed_size_list_uint64",
        "fixed_size_list_int64", "fixed_size_list_float32",
        "fixed_size_list_float64", "fixed_size_list_string", "struct_field",
        "map_boolean", "map_uint8", "map_int8", "map_uint16", "map_int16",
        "map_uint32", "map_int32", "map_uint64", "map_int64", "map_float32",
        "map_float64", "map_string",
        # "map_list",
        "dict", "geometry",
    ]

    locals_ = locals()
    table = pa.table([locals_[x] for x in names], names=names)

    # Attach GeoParquet-style "geo" metadata pointing at the WKB column.
    my_schema = table.schema.with_metadata({
        "geo": json.dumps({
            "version": "0.1.0",
            "primary_column": "geometry",
            "columns": {
                "geometry": {
                    'crs': wkt_epsg_4326,
                    'bbox': [0, 2, 4, 2],
                    'encoding': 'WKB'
                }
            }
        })
    })

    table = table.cast(my_schema)
    HERE = pathlib.Path(__file__).parent
    pq.write_table(table, HERE / "ogr/data/parquet/test.parquet",
                   compression='NONE', row_group_size=3)
"FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES", pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() pyarrow.decimal128(38, scale=9).id: "NUMERIC", # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal128 instances. } else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER def bq_to_arrow_struct_data_type(field): arrow_fields = [] for subfield in field.fields: arrow_subfield = bq_to_arrow_field(subfield) if arrow_subfield:
_supported_pyarrow_types = [ pa.null(), pa.bool_(), pa.int32(), pa.time32("s"), pa.time64("us"), pa.date32(), pa.timestamp("us"), pa.timestamp("us", tz="UTC"), pa.timestamp("us", tz="Europe/Paris"), pa.float16(), pa.float32(), pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(), pa.binary(10), pa.large_string(), pa.large_binary(), pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), pa.struct( [ pa.field("a", pa.int32()), pa.field("b", pa.int8()), pa.field("c", pa.string()), ] ),
(pa.int32(), 'int32'), (pa.int64(), 'int64'), (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'), (pa.time64('us'), 'time') ] ) def test_logical_type(type, expected): assert get_logical_type(type) == expected def test_array_uint64_from_py_over_range(): arr = pa.array([2 ** 63], type=pa.uint64()) expected = pa.array(np.array([2 ** 63], dtype='u8')) assert arr.equals(expected)
class TestAbstractFileParserStatics:
    """Unit tests for AbstractFileParser's static JSON <-> PyArrow type and
    schema mapping helpers, in both directions."""

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        # Json -> PyArrow direction
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        # PyArrow -> Json direction (reverse=True); every member of the
        # tuple must collapse to the same JSON type.
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        # Json -> PyArrow direction; pyarrow_schema=None marks an expected failure.
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
                LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        # PyArrow -> Json direction (reverse=True); json_schema=None marks an expected failure.
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
                LOGGER.debug(str(e_info))
def test_sequence_decimal_negative():
    """Negative decimals round-trip through pa.array unchanged."""
    values = [decimal.Decimal('-1234.234983'), decimal.Decimal('-8.094324')]
    arr = pa.array(values, type=pa.decimal128(precision=10, scale=6))
    assert arr.to_pylist() == values
def _parquet_schema(dataframe: pd.DataFrame, custom_redshift_columns: dict = None):
    """ Translates pandas dtypes to PyArrow types and creates a Schema from them

    Args:
        dataframe (pd.DataFrame): Dataframe to pull the schema of
        custom_redshift_columns (dict, Optional):
            This dictionary contains custom column data type definitions for
            redshift. The params should be formatted as follows:
                - column name (str)
                - data type (str)

    Returns:
        PyArrow Schema of the given dataframe
        Potentially modified Dataframe
    """
    fields = []
    # NOTE(review): branch order below is load-bearing — 'datetime' must be
    # tested before 'date', and lowercase 'int32'/'int64' before the
    # nullable 'Int32'/'Int64' dtypes.  int16/uint dtypes fall through to
    # the NotImplementedError — presumably out of scope; confirm.
    for col, dtype in dataframe.dtypes.items():
        dtype = dtype.name
        if dtype == 'object':
            if custom_redshift_columns:
                # Detect if the Pandas object column contains Python decimal
                # objects (str() of such a column starts with "[Decimal(",
                # exactly 9 characters).
                if "[Decimal(" in str(dataframe[col].values)[:9]:
                    # If Python decimal objects are present, parse out the precision and scale
                    # from the custom_redshift_columns dictionary to use when converting
                    # to PyArrow's decimal128 data type.
                    s = custom_redshift_columns[col]
                    precision = int(s[s.find('DECIMAL(') + len('DECIMAL('):s.rfind(',')].strip())
                    scale = int(s[s.find(',') + len(','):s.rfind(')')].strip())
                    pa_type = pa.decimal128(precision=precision, scale=scale)
                else:
                    pa_type = pa.string()
            else:
                pa_type = pa.string()
        elif dtype.startswith('int32'):
            pa_type = pa.int32()
        elif dtype.startswith('int64'):
            pa_type = pa.int64()
        elif dtype.startswith('int8'):
            pa_type = pa.int8()
        elif dtype.startswith('Int32'):
            # Pandas nullable integer dtype: downcast the column to object so
            # nulls survive the Arrow conversion.
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int32()
        elif dtype.startswith('Int64'):
            dataframe = dataframe.astype({col: 'object'})
            pa_type = pa.int64()
        elif dtype.startswith('float32'):
            pa_type = pa.float32()
        elif dtype.startswith('float64'):
            pa_type = pa.float64()
        elif dtype.startswith('float16'):
            pa_type = pa.float16()
        elif dtype.startswith('datetime'):
            pa_type = pa.timestamp('ns')
        elif dtype.startswith('date'):
            pa_type = pa.date64()
        elif dtype.startswith('category'):
            pa_type = pa.string()
        elif dtype == 'bool':
            pa_type = pa.bool_()
        else:
            raise NotImplementedError(
                f"Error: {dtype} is not a datatype which can be mapped to Parquet using s3parq."
            )
        fields.append(pa.field(col, pa_type))
    return (pa.schema(fields=fields), dataframe)
def test_from_arrow_max_precision():
    """Arrow decimals with precision beyond the supported maximum are rejected."""
    overflowing = pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19))
    with pytest.raises(ValueError):
        DecimalColumn.from_arrow(overflowing)
def pyarrow_numeric():
    """Return the Arrow type used for BigQuery NUMERIC: 38 digits, scale 9.

    NOTE(review): presumably a fixture/factory used by the surrounding
    test module; no decorator is visible in this chunk.
    """
    return pyarrow.decimal128(38, 9)
def test_decimal_properties():
    """decimal128(19, 4) exposes its byte width, precision and scale."""
    ty = pa.decimal128(19, 4)
    assert (ty.byte_width, ty.precision, ty.scale) == (16, 19, 4)
def test_complex_unload_as_arrow(self, arrow_cursor):
    """Unload one row containing every supported Hive type via the Arrow
    cursor and verify shape, schema and materialised values."""
    # NOT_SUPPORTED: Unsupported Hive type: time
    # NOT_SUPPORTED: Unsupported Hive type: json
    table = arrow_cursor.execute("""
        SELECT
            col_boolean
            ,col_tinyint
            ,col_smallint
            ,col_int
            ,col_bigint
            ,col_float
            ,col_double
            ,col_string
            ,col_varchar
            ,col_timestamp
            ,col_date
            ,col_binary
            ,col_array
            ,col_map
            ,col_struct
            ,col_decimal
        FROM one_row_complex
        """).as_arrow()
    # One row, sixteen columns.
    assert table.shape[0] == 1
    assert table.shape[1] == 16
    assert table.schema == pa.schema([
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int32()),
        pa.field("col_smallint", pa.int32()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ns")),
        pa.field("col_date", pa.date32()),
        pa.field("col_binary", pa.binary()),
        pa.field("col_array", pa.list_(pa.field("array_element", pa.int32()))),
        pa.field("col_map", pa.map_(pa.int32(), pa.field("entries",
                                                         pa.int32()))),
        pa.field(
            "col_struct",
            pa.struct(
                [pa.field("a", pa.int32()), pa.field("b", pa.int32())]),
        ),
        pa.field("col_decimal", pa.decimal128(10, 1)),
    ])
    # Transpose the column dict into row tuples and compare the single row.
    assert [row for row in zip(*table.to_pydict().values())] == [(
        True,
        127,
        32767,
        2147483647,
        9223372036854775807,
        0.5,
        0.25,
        "a string",
        "varchar",
        pd.Timestamp(2017, 1, 1, 0, 0, 0),
        datetime(2017, 1, 2).date(),
        b"123",
        [1, 2],
        [(1, 2), (3, 4)],
        {
            "a": 1,
            "b": 2
        },
        Decimal("0.1"),
    )]
@pytest.mark.parametrize( argnames="meta_type,arrow_type", argvalues=[ ("bool_", pa.bool_()), ("int8", pa.int8()), ("int16", pa.int16()), ("int32", pa.int32()), ("int64", pa.int64()), ("uint8", pa.uint8()), ("uint16", pa.uint16()), ("uint32", pa.uint32()), ("uint64", pa.uint64()), ("float16", pa.float16()), ("float32", pa.float32()), ("float64", pa.float64()), ("decimal128(38,1)", pa.decimal128(38, 1)), ("decimal128(1,2)", pa.decimal128(1, 2)), ("time32(s)", pa.time32("s")), ("time32(ms)", pa.time32("ms")), ("time64(us)", pa.time64("us")), ("time64(ns)", pa.time64("ns")), ("timestamp(s)", pa.timestamp("s")), ("timestamp(ms)", pa.timestamp("ms")), ("timestamp(us)", pa.timestamp("us")), ("timestamp(ns)", pa.timestamp("ns")), ("date32", pa.date32()), ("date64", pa.date64()), ("string", pa.string()), ("large_string", pa.large_string()), ("utf8", pa.utf8()), ("large_utf8", pa.large_utf8()),
def test_decimal_byte_width():
    """A 128-bit decimal occupies exactly 16 bytes."""
    decimal_type = pa.decimal128(19, 4)
    assert decimal_type.byte_width == 16
def test_generate_from_meta():
    """ArrowConverter.generate_from_meta builds a pa.Schema from Metadata:
    partition columns are dropped by default and kept when the second
    argument is False."""
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })
    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    # schema1 drops the partition column; schema2 keeps it.
    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)
    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(), pa.float64(), pa.date64(), pa.decimal128(10, 2)
    ]
    assert schema1.names == expected_names
    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions: the partition column appears at the end.
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))
    assert schema2.names == expected_names
    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
(pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'), (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",' '"timezone":null}'), (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",' '"timezone":null}'), (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",' '"timezone":null}'), (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",' '"timezone":null}'), (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"' ',"timezone":"UTC"}'), (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",' '"unit":"NANOSECOND","timezone":"Europe/Paris"}'), (pa.date32(), '{"name":"date","unit":"DAY"}'), (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'), (pa.decimal128(19, 4), '{"name":"decimal","precision":19,"scale":4}'), (pa.string(), '{"name":"utf8"}'), (pa.binary(), '{"name":"binary"}'), (pa.binary(10), '{"name":"fixedsizebinary","byteWidth":10}'), # TODO(ARROW-2609): complex types that have children # pa.list_(pa.int32()), # pa.struct([pa.field('a', pa.int32()), # pa.field('b', pa.int8()), # pa.field('c', pa.string())]), # pa.union([pa.field('a', pa.binary(10)), # pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), # pa.union([pa.field('a', pa.binary(10)), # pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), # TODO: DictionaryType requires a vector in the type # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])), ])
def test_sequence_decimal_no_whole_part():
    """Decimals with no whole part (scale == precision) survive conversion."""
    expected = [decimal.Decimal('-.4234983'), decimal.Decimal('.0103943')]
    ty = pa.decimal128(precision=7, scale=7)
    result = pa.array(expected, type=ty)
    assert result.to_pylist() == expected
def test_sequence_decimal():
    """Decimals with both whole and fractional parts survive conversion."""
    expected = [decimal.Decimal('1234.183'), decimal.Decimal('8094.234')]
    result = pa.array(expected, type=pa.decimal128(precision=7, scale=3))
    assert result.to_pylist() == expected
def test_sequence_decimal_no_scale():
    """Whole-number decimals convert with the default (zero) scale."""
    expected = [decimal.Decimal('1234234983'), decimal.Decimal('8094324')]
    result = pa.array(expected, type=pa.decimal128(precision=10))
    assert result.to_pylist() == expected
def get_type_and_builtins(self, n, type_name):
    """
    Return a `(arrow type, list)` tuple where the arrow type
    corresponds to the given logical *type_name*, and the list
    is a list of *n* random-generated Python objects compatible
    with the arrow type.
    """
    size = None

    # Normalize the logical name into one of the generator "kinds".
    if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list'):
        kind = type_name
    elif type_name == 'binary':
        kind = 'varying binary'
    elif type_name.startswith(('int', 'uint')):
        kind = 'int'
    elif type_name.startswith('float'):
        kind = 'float'
    elif type_name.startswith('struct'):
        kind = 'struct'
    elif type_name.startswith('binary'):
        # e.g. "binary12" -> fixed-size binary of width 12
        kind = 'fixed binary'
        size = int(type_name[6:])
        assert size > 0
    else:
        raise ValueError("unrecognized type %r" % (type_name,))

    # Numeric names map straight onto pa factories ("int32" -> pa.int32());
    # every other kind is built through a thunk table.
    if kind in ('int', 'float'):
        ty = getattr(pa, type_name)()
    else:
        ty = {
            'bool': pa.bool_,
            'decimal': lambda: pa.decimal128(9, 9),
            'fixed binary': lambda: pa.binary(size),
            'varying binary': pa.binary,
            'ascii': pa.string,
            'unicode': pa.string,
            'int64 list': lambda: pa.list_(pa.int64()),
            'struct': lambda: pa.struct([pa.field('u', pa.int64()),
                                         pa.field('v', pa.float64()),
                                         pa.field('w', pa.bool_())]),
        }[kind]()

    # Pick the matching random-data generator.
    generators = {
        'int': self.generate_int_list,
        'float': self.generate_float_list,
        'bool': self.generate_bool_list,
        'decimal': self.generate_decimal_list,
        'fixed binary': partial(self.generate_fixed_binary_list, size=size),
        'varying binary': partial(self.generate_varying_binary_list,
                                  min_size=3, max_size=40),
        'ascii': partial(self.generate_ascii_string_list,
                         min_size=3, max_size=40),
        'unicode': partial(self.generate_unicode_string_list,
                           min_size=3, max_size=40),
        'int64 list': partial(self.generate_int_list_list,
                              min_size=0, max_size=20),
        'struct': self.generate_dict_list,
        'struct from tuples': self.generate_tuple_list,
    }
    return ty, generators[kind](n)
    [
        # Tail of a @pytest.mark.parametrize("data", ...) decorator whose
        # opening lies above this chunk: decimal, int, null-bearing and
        # empty payloads for the round-trip test below.
        [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")],
        [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")],
        [1],
        [-1],
        [1, 2, 3, 4],
        [42, 1729, 4104],
        [1, 2, None, 4],    # nulls must survive the round trip
        [None, None, None],
        [],                 # empty column
    ],
)
@pytest.mark.parametrize(
    "typ",
    [
        pa.decimal128(precision=4, scale=2),
        pa.decimal128(precision=5, scale=3),
        pa.decimal128(precision=6, scale=4),
    ],
)
def test_round_trip_decimal_column(data, typ):
    # Arrow -> DecimalColumn -> Arrow must be lossless.
    pa_arr = pa.array(data, type=typ)
    col = DecimalColumn.from_arrow(pa_arr)
    assert pa_arr.equals(col.to_arrow())


def test_from_arrow_max_precision():
    # Precision 19 exceeds DecimalColumn's supported maximum and must raise.
    with pytest.raises(ValueError):
        DecimalColumn.from_arrow(
            pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19))
        )
"time": { "type": "long", "logicalType": "time-micros" }, "timestamp": { "type": "long", "logicalType": "timestamp-micros" }, } # This dictionary is duplicated in bigquery/google/cloud/bigquery/_pandas_helpers.py # When modifying it be sure to update it there as well. BQ_TO_ARROW_TYPES = { "int64": pyarrow.int64(), "float64": pyarrow.float64(), "bool": pyarrow.bool_(), "numeric": pyarrow.decimal128(38, 9), "string": pyarrow.utf8(), "bytes": pyarrow.binary(), "date": pyarrow.date32(), # int32 days since epoch "datetime": pyarrow.timestamp("us"), "time": pyarrow.time64("us"), "timestamp": pyarrow.timestamp("us", tz="UTC"), } SCALAR_COLUMNS = [ { "name": "int_col", "type": "int64" }, { "name": "float_col", "type": "float64"
def test_sql(redshift_table, postgresql_table, mysql_table, databases_parameters, db_type):
    """Round-trip a typed DataFrame through wr.db.to_sql / read_sql_query
    against the engine selected by *db_type* (postgresql, mysql, redshift),
    then exercise chunked reads with explicit pyarrow dtypes and a
    catalog-resolved engine."""
    # Pick the table fixture matching the engine under test.
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        # The binary column is dropped for redshift before loading.
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}", echo=False)
    index = True if engine.name == "redshift" else False
    # Write the fixture DataFrame, forcing iint32 to a SQL Integer column.
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    # Read everything back and verify dtypes survived the round trip.
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}", con=engine)
    ensure_data_types(df, has_list=False)
    # Rebuild the engine from raw connection parameters this time.
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        # NOTE(review): user/password are read from the top level of
        # databases_parameters, unlike host/port/database which are keyed
        # by db_type -- confirm this matches the fixture layout.
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    # Chunked read (chunksize=1) with an explicit pyarrow dtype per column.
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        # Exercise catalog engine lookup with an explicit AWS account id,
        # then verify a small nullable-Int32 frame written with an index
        # column can be read back by table name.
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine, table=table, schema=schema, index_col="index")
        assert df.shape == (3, 1)
def test_iterate_over_decimal_chunk():
    """Round-trip a two-column FIXED (decimal) chunk through the Arrow
    chunk iterator.

    A random precision/scale pair is drawn each run so that, across runs,
    all five storage types are covered: precision <= 2/4/9/19 is stored as
    int8/int16/int32/int64 respectively, and anything wider as decimal128.
    """
    # Seed from OS entropy/time.  The previous
    # random.seed(datetime.datetime.now()) relied on hashing an arbitrary
    # object, which Python 3.11+ rejects with a TypeError.
    random.seed()
    precision = random.randint(1, 38)
    scale = random.randint(0, precision)

    # Choose the physical Arrow storage type for the drawn precision.
    if precision <= 2:
        datatype = pyarrow.int8()
    elif precision <= 4:
        datatype = pyarrow.int16()
    elif precision <= 9:
        datatype = pyarrow.int32()
    elif precision <= 19:
        datatype = pyarrow.int64()
    else:
        datatype = pyarrow.decimal128(precision, scale)

    def decimal_generator(_precision, _scale):
        # Produce one random value that fits the column's storage type.
        def decimal128_generator(precision, scale):
            digits = [str(random.randint(0, 9)) for _ in range(precision)]
            if scale:
                digits.insert(-scale, '.')
            return decimal.Decimal("".join(digits))

        def int64_generator(precision):
            data = random.randint(-9223372036854775808, 9223372036854775807)
            # Negative values keep one extra character for the '-' sign.
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int32_generator(precision):
            # Upper bound fixed: was 2147483637 (typo), which silently
            # excluded the top ten values of the int32 range.
            data = random.randint(-2147483648, 2147483647)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int16_generator(precision):
            data = random.randint(-32768, 32767)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int8_generator(precision):
            data = random.randint(-128, 127)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        if _precision <= 2:
            return int8_generator(_precision)
        elif _precision <= 4:
            return int16_generator(_precision)
        elif _precision <= 9:
            return int32_generator(_precision)
        elif _precision <= 19:
            return int64_generator(_precision)
        else:
            return decimal128_generator(_precision, _scale)

    def expected_data_transform_decimal(_precision, _scale):
        # Integer-backed columns come back scaled down by 10**scale;
        # decimal128-backed columns are returned unchanged.
        def expected_data_transform_decimal_impl(data, precision=_precision,
                                                 scale=_scale):
            if precision <= 19:
                return decimal.Decimal(data).scaleb(-scale)
            else:
                return data

        return expected_data_transform_decimal_impl

    column_meta = {
        "logicalType": "FIXED",
        "precision": str(precision),
        "scale": str(scale)
    }
    iterate_over_test_chunk([datatype, datatype],
                            [column_meta, column_meta],
                            lambda: decimal_generator(precision, scale),
                            expected_data_transform_decimal(precision, scale))
# Tail of a to_numpy slicing test whose opening lies above this chunk.
np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'),
     (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'),
     (pa.int16(), 'int16'),
     (pa.int32(), 'int32'),
     (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'),
     (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'),
     (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'),
     (pa.date64(), 'date'),
     (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    # Each Arrow type maps to the expected logical-type name string.
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    # Python ints above the int64 range must still convert as uint64.
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    # NOTE(review): this test continues past the end of this chunk.
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
def get_type_and_builtins(self, n, type_name):
    """
    Return a `(arrow type, list)` tuple where the arrow type
    corresponds to the given logical *type_name*, and the list
    is a list of *n* random-generated Python objects compatible
    with the arrow type.
    """
    size = None

    # Resolve the logical name to a generator kind: exact names first,
    # then prefix matches, with "binary<N>" (fixed width) handled last.
    if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list'):
        kind = type_name
    elif type_name == 'binary':
        kind = 'varying binary'
    else:
        for prefix, matched_kind in (('int', 'int'), ('uint', 'int'),
                                     ('float', 'float'),
                                     ('struct', 'struct')):
            if type_name.startswith(prefix):
                kind = matched_kind
                break
        else:
            if type_name.startswith('binary'):
                kind = 'fixed binary'
                size = int(type_name[6:])  # width from e.g. "binary12"
                assert size > 0
            else:
                raise ValueError("unrecognized type %r" % (type_name, ))

    # Build the arrow type for the resolved kind.
    if kind in ('int', 'float'):
        # Numeric names match pa factory names exactly ("int32" -> pa.int32).
        ty = getattr(pa, type_name)()
    elif kind == 'bool':
        ty = pa.bool_()
    elif kind == 'decimal':
        ty = pa.decimal128(9, 9)
    elif kind == 'fixed binary':
        ty = pa.binary(size)
    elif kind == 'varying binary':
        ty = pa.binary()
    elif kind in ('ascii', 'unicode'):
        ty = pa.string()
    elif kind == 'int64 list':
        ty = pa.list_(pa.int64())
    else:  # 'struct' is the only remaining kind
        ty = pa.struct([
            pa.field('u', pa.int64()),
            pa.field('v', pa.float64()),
            pa.field('w', pa.bool_())
        ])

    # Dispatch to the matching random-data generator.
    make_data = {
        'int': self.generate_int_list,
        'float': self.generate_float_list,
        'bool': self.generate_bool_list,
        'decimal': self.generate_decimal_list,
        'fixed binary': partial(self.generate_fixed_binary_list, size=size),
        'varying binary': partial(self.generate_varying_binary_list,
                                  min_size=3, max_size=40),
        'ascii': partial(self.generate_ascii_string_list,
                         min_size=3, max_size=40),
        'unicode': partial(self.generate_unicode_string_list,
                           min_size=3, max_size=40),
        'int64 list': partial(self.generate_int_list_list,
                              min_size=0, max_size=20),
        'struct': self.generate_dict_list,
        'struct from tuples': self.generate_tuple_list,
    }[kind]
    return ty, make_data(n)
def to_arrow_type(dt: DataType) -> "pa.DataType":
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa

    # Scalar Spark types map one-to-one onto arrow types; keys are matched
    # by exact class (same semantics as the original type(dt) == X chain).
    scalar_builders = {
        BooleanType: lambda t: pa.bool_(),
        ByteType: lambda t: pa.int8(),
        ShortType: lambda t: pa.int16(),
        IntegerType: lambda t: pa.int32(),
        LongType: lambda t: pa.int64(),
        FloatType: lambda t: pa.float32(),
        DoubleType: lambda t: pa.float64(),
        DecimalType: lambda t: pa.decimal128(t.precision, t.scale),
        StringType: lambda t: pa.string(),
        BinaryType: lambda t: pa.binary(),
        DateType: lambda t: pa.date32(),
        # Timestamps should be in UTC, JVM Arrow timestamps require a
        # timezone to be read
        TimestampType: lambda t: pa.timestamp('us', tz='UTC'),
        TimestampNTZType: lambda t: pa.timestamp('us', tz=None),
        NullType: lambda t: pa.null(),
    }

    builder = scalar_builders.get(type(dt))
    if builder is not None:
        return builder(dt)

    if type(dt) == ArrayType:
        # Struct and timestamp elements are not supported inside arrays.
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        return pa.list_(to_arrow_type(dt.elementType))

    if type(dt) == MapType:
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or \
                type(dt.valueType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        return pa.map_(to_arrow_type(dt.keyType),
                       to_arrow_type(dt.valueType))

    if type(dt) == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError(
                "Nested StructType not supported in conversion to Arrow")
        return pa.struct([
            pa.field(field.name, to_arrow_type(field.dataType),
                     nullable=field.nullable)
            for field in dt
        ])

    raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
# Representative sample of Arrow types, from null through nested and
# union types, used to exercise type-level operations across the suite.
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),  # fixed-size binary, width 10
    pa.list_(pa.int32()),
    pa.struct([pa.field('a', pa.int32()),
               pa.field('b', pa.int8()),
               pa.field('c', pa.string())]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]