def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
def test_field_metadata():
    f1 = pa.field('a', pa.int8())
    f2 = pa.field('a', pa.int8(), metadata={})
    f3 = pa.field('a', pa.int8(), metadata={b'bizz': b'bazz'})

    assert f1.metadata is None
    assert f2.metadata == {}
    assert f3.metadata[b'bizz'] == b'bazz'
def test_field_equality_operators():
    f1 = pa.field('a', pa.int8(), nullable=True)
    f2 = pa.field('a', pa.int8(), nullable=True)
    f3 = pa.field('b', pa.int8(), nullable=True)
    f4 = pa.field('b', pa.int8(), nullable=False)

    assert f1 == f2
    assert f1 != f3
    assert f3 != f4
    assert f1 != 'foo'
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.int8()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
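
# A quick sanity check for the converter above, using a stand-in object in
# place of a real org.apache.arrow.vector.types.pojo.ArrowType$Int instance
# (a sketch; in practice the POJO would arrive through a JVM bridge such as
# jpype):
from types import SimpleNamespace

fake_signed = SimpleNamespace(isSigned=True, bitWidth=32)
assert _from_jvm_int_type(fake_signed) == pa.int32()

fake_unsigned = SimpleNamespace(isSigned=False, bitWidth=16)
assert _from_jvm_int_type(fake_unsigned) == pa.uint16()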
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
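
# Sketch (not part of the original test): dictionary types most often arise
# by dictionary-encoding an existing array; pyarrow picks int32 indices by
# default, matching ty0 above.
encoded = pa.array(['a', 'b', 'a']).dictionary_encode()
assert encoded.type == pa.dictionary(pa.int32(), pa.string())
assert encoded.dictionary.to_pylist() == ['a', 'b']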
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()),
                    pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5},
                                   {'a': 5, 'b': 6.5}]
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def test_array_from_pandas_type_cast(self):
    arr = np.arange(10, dtype='int64')

    target_type = pa.int8()

    result = pa.array(arr, type=target_type)
    expected = pa.array(arr.astype('int8'))
    assert result.equals(expected)
def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
def dataframe_with_arrays(include_index=False):
    """
    DataFrame with columns of numpy arrays for every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected
    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
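
# Sketch (not part of the original test): these aliases are accepted anywhere
# a DataType is expected, e.g. in pa.field and pa.array.
assert pa.field('x', 'i4').type == pa.int32()
assert pa.array([1, 2], type='u1').type == pa.uint8()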
def test_schema_duplicate_fields():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('foo', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'foo']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
foo: list<item: int8>
  child 0, item: int8"""

    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('bar') == fields[1]
    assert sch.field_by_name('xxx') is None
    with pytest.warns(UserWarning):
        assert sch.field_by_name('foo') is None
def test_recordbatch_pickle():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]
    schema = pa.schema([pa.field('ints', pa.int8()),
                        pa.field('floats', pa.float32()),
                        ]).add_metadata({b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)

    result = pickle.loads(pickle.dumps(batch))
    assert result.equals(batch)
    assert result.schema == schema
def test_schema_equals():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    assert sch1.equals(sch2)

    del fields[-1]
    sch3 = pa.schema(fields)
    assert not sch1.equals(sch3)
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and
    # test_array.py::test_total_bytes_allocated checks that the default
    # memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
def test_schema_serialization_with_metadata():
    field_metadata = {b'foo': b'bar', b'kind': b'field'}
    schema_metadata = {b'foo': b'bar', b'kind': b'schema'}

    f0 = pa.field('a', pa.int8())
    f1 = pa.field('b', pa.string(), metadata=field_metadata)

    schema = pa.schema([f0, f1], metadata=schema_metadata)

    s_schema = schema.serialize()
    recons_schema = pa.read_schema(s_schema)

    assert recons_schema.equals(schema)
    assert recons_schema.metadata == schema_metadata
    assert recons_schema[0].metadata is None
    assert recons_schema[1].metadata == field_metadata
def test_column_of_arrays_to_py(self):
    # Test regression in ARROW-1199 not caught in above test
    dtype = 'i1'
    arr = np.array([
        np.arange(10, dtype=dtype),
        np.arange(5, dtype=dtype),
        None,
        np.arange(1, dtype=dtype)
    ])
    type_ = pa.list_(pa.int8())
    parr = pa.array(arr, type=type_)

    assert parr[0].as_py() == list(range(10))
    assert parr[1].as_py() == list(range(5))
    assert parr[2].as_py() is None
    assert parr[3].as_py() == [0]
def test_union_array_slice():
    # ARROW-2314
    arr = pa.UnionArray.from_sparse(pa.array([0, 0, 1, 1], type=pa.int8()),
                                    [pa.array(["a", "b", "c", "d"]),
                                     pa.array([1, 2, 3, 4])])
    assert arr[1:].to_pylist() == ["b", 3, 4]

    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
    int64 = pa.array([1, 2, 3], type='int64')
    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
    value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')

    arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
    lst = arr.to_pylist()
    for i in range(len(arr)):
        for j in range(i, len(arr)):
            assert arr[i:j].to_pylist() == lst[i:j]
def test_is_integer():
    signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for t in signed_ints + unsigned_ints:
        assert types.is_integer(t)

    for t in signed_ints:
        assert types.is_signed_integer(t)
        assert not types.is_unsigned_integer(t)

    for t in unsigned_ints:
        assert types.is_unsigned_integer(t)
        assert not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
def test_schema_negative_indexing():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    schema = pa.schema(fields)

    assert schema[-1].equals(schema[2])
    assert schema[-2].equals(schema[1])
    assert schema[-3].equals(schema[0])

    with pytest.raises(IndexError):
        schema[-4]

    with pytest.raises(IndexError):
        schema[3]
def test_sql_types(redshift_table, redshift_con):
    table = redshift_table
    df = get_df()
    df.drop(["binary"], axis=1, inplace=True)
    wr.redshift.to_sql(
        df=df,
        con=redshift_con,
        table=table,
        schema="public",
        mode="overwrite",
        index=True,
        dtype={"iint32": "INTEGER"},
    )
    df = wr.redshift.read_sql_query(f"SELECT * FROM public.{table}", redshift_con)
    ensure_data_types(df, has_list=False)
    dfs = wr.redshift.read_sql_query(
        sql=f"SELECT * FROM public.{table}",
        con=redshift_con,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "ddouble": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
def test_union():
    # sparse
    arr = pa.UnionArray.from_sparse(
        pa.array([0, 0, 1, 1], type=pa.int8()),
        [pa.array(["a", "b", "c", "d"]), pa.array([1, 2, 3, 4])]
    )
    for s in arr:
        assert isinstance(s, pa.UnionScalar)
        assert s.type.equals(arr.type)
        assert s.is_valid is True
        with pytest.raises(pa.ArrowNotImplementedError):
            pickle.loads(pickle.dumps(s))

    assert arr[0].as_py() == "a"
    assert arr[1].as_py() == "b"
    assert arr[2].as_py() == 3
    assert arr[3].as_py() == 4

    # dense
    arr = pa.UnionArray.from_dense(
        types=pa.array([0, 1, 0, 0, 1, 1, 0], type='int8'),
        value_offsets=pa.array([0, 0, 2, 1, 1, 2, 3], type='int32'),
        children=[
            pa.array([b'a', b'b', b'c', b'd'], type='binary'),
            pa.array([1, 2, 3], type='int64')
        ]
    )
    for s in arr:
        assert isinstance(s, pa.UnionScalar)
        assert s.type.equals(arr.type)
        assert s.is_valid is True
        with pytest.raises(pa.ArrowNotImplementedError):
            pickle.loads(pickle.dumps(s))

    assert arr[0].as_py() == b'a'
    assert arr[5].as_py() == 3
def test_literals():
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    builder.make_literal(True, pa.bool_())
    builder.make_literal(0, pa.uint8())
    builder.make_literal(1, pa.uint16())
    builder.make_literal(2, pa.uint32())
    builder.make_literal(3, pa.uint64())
    builder.make_literal(4, pa.int8())
    builder.make_literal(5, pa.int16())
    builder.make_literal(6, pa.int32())
    builder.make_literal(7, pa.int64())
    builder.make_literal(8.0, pa.float32())
    builder.make_literal(9.0, pa.float64())
    builder.make_literal("hello", pa.string())
    builder.make_literal(b"world", pa.binary())

    builder.make_literal(True, "bool")
    builder.make_literal(0, "uint8")
    builder.make_literal(1, "uint16")
    builder.make_literal(2, "uint32")
    builder.make_literal(3, "uint64")
    builder.make_literal(4, "int8")
    builder.make_literal(5, "int16")
    builder.make_literal(6, "int32")
    builder.make_literal(7, "int64")
    builder.make_literal(8.0, "float32")
    builder.make_literal(9.0, "float64")
    builder.make_literal("hello", "string")
    builder.make_literal(b"world", "binary")

    with pytest.raises(TypeError):
        builder.make_literal("hello", pa.int64())
    with pytest.raises(TypeError):
        builder.make_literal(True, None)
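
# A minimal end-to-end sketch of how such literal nodes get used: build a
# projection computing `a + 2` and evaluate it over a record batch. The
# function name 'add' is from the Gandiva function registry; the column and
# result-field names here are illustrative, not from the original test.
def example_literal_projection():
    import pyarrow.gandiva as gandiva

    field_a = pa.field('a', pa.int32())
    schema = pa.schema([field_a])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(field_a)
    two = builder.make_literal(2, pa.int32())
    add = builder.make_function('add', [node_a, two], pa.int32())
    expr = builder.make_expression(add, pa.field('a_plus_2', pa.int32()))

    projector = gandiva.make_projector(schema, [expr],
                                       pa.default_memory_pool())
    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3], type=pa.int32())], ['a'])
    result, = projector.evaluate(batch)
    assert result.to_pylist() == [3, 4, 5]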
def test_dictionary_delta(stream_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],         # potential delta
        ["foo", "bar"],
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],               # replacement
    ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data
    ]
    schema = batches[0].schema

    def write_batches():
        with stream_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            for batch in batches:
                writer.write_batch(batch)
        return writer.stats

    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 3
    assert st.num_dictionary_deltas == 0

    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)
    st = write_batches()
    assert st.num_record_batches == 5
    assert st.num_dictionary_batches == 4
    assert st.num_replaced_dictionaries == 1
    assert st.num_dictionary_deltas == 2
def athena2pyarrow(dtype: str) -> pa.DataType:  # pylint: disable=too-many-return-statements
    """Athena to PyArrow data types conversion."""
    dtype = dtype.lower().replace(" ", "")
    if dtype == "tinyint":
        return pa.int8()
    if dtype == "smallint":
        return pa.int16()
    if dtype in ("int", "integer"):
        return pa.int32()
    if dtype == "bigint":
        return pa.int64()
    if dtype in ("float", "real"):
        return pa.float32()
    if dtype == "double":
        return pa.float64()
    if dtype == "boolean":
        return pa.bool_()
    if (dtype == "string") or dtype.startswith("char") or dtype.startswith("varchar"):
        return pa.string()
    if dtype == "timestamp":
        return pa.timestamp(unit="ns")
    if dtype == "date":
        return pa.date32()
    if dtype in ("binary", "varbinary"):
        return pa.binary()
    if dtype.startswith("decimal"):
        precision, scale = dtype.replace("decimal(", "").replace(")", "").split(sep=",")
        return pa.decimal128(precision=int(precision), scale=int(scale))
    if dtype.startswith("array"):
        return pa.list_(value_type=athena2pyarrow(dtype=dtype[6:-1]), list_size=-1)
    if dtype.startswith("struct"):
        return pa.struct(
            [(f.split(":", 1)[0], athena2pyarrow(f.split(":", 1)[1]))
             for f in _split_struct(dtype[7:-1])]
        )
    if dtype.startswith("map"):
        parts: List[str] = _split_map(s=dtype[4:-1])
        return pa.map_(athena2pyarrow(parts[0]), athena2pyarrow(parts[1]))
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")
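
# A few illustrative conversions (a sketch assuming athena2pyarrow is in
# scope; note the function lowercases and strips spaces before dispatching,
# so mixed-case Athena DDL types work too):
assert athena2pyarrow("tinyint") == pa.int8()
assert athena2pyarrow("DECIMAL(12, 2)") == pa.decimal128(12, 2)
assert athena2pyarrow("array<int>") == pa.list_(pa.int32())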
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string())
        ])
    ]
    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
def test_integer_no_nulls(self):
    data = OrderedDict()
    fields = []

    numpy_dtypes = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('longlong', pa.int64()), ('ulonglong', pa.uint64())
    ]
    num_values = 100

    for dtype, arrow_dtype in numpy_dtypes:
        info = np.iinfo(dtype)
        values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                   min(info.max, np.iinfo(np.int_).max),
                                   size=num_values)
        data[dtype] = values.astype(dtype)
        fields.append(pa.field(dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = pa.schema(fields)
    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_field_equals():
    meta1 = {b'foo': b'bar'}
    meta2 = {b'bizz': b'bazz'}

    f1 = pa.field('a', pa.int8(), nullable=True)
    f2 = pa.field('a', pa.int8(), nullable=True)
    f3 = pa.field('a', pa.int8(), nullable=False)
    f4 = pa.field('a', pa.int16(), nullable=False)
    f5 = pa.field('b', pa.int16(), nullable=False)
    f6 = pa.field('a', pa.int8(), nullable=True, metadata=meta1)
    f7 = pa.field('a', pa.int8(), nullable=True, metadata=meta1)
    f8 = pa.field('a', pa.int8(), nullable=True, metadata=meta2)

    assert f1.equals(f2)
    assert f6.equals(f7)
    assert not f1.equals(f3)
    assert not f1.equals(f4)
    assert not f3.equals(f4)
    assert not f1.equals(f6)
    assert not f4.equals(f5)
    assert not f7.equals(f8)
def test_sql(redshift_table, postgresql_table, mysql_table, databases_parameters, db_type):
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}", echo=False)
    index = True if engine.name == "redshift" else False
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}", con=engine
    )
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id
        )
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine, table=table, schema=schema, index_col="index")
        assert df.shape == (3, 1)
def test_argparse_types():
    _map = {}
    csv2parquet.main_with_args(capture_args(_map),
                               ['foo.csv', '--type', '0=string', '0=int8?'])
    assert _map['raw_types'] == [('0', pa.string(), False),
                                 ('0', pa.int8(), True)]
import numpy as np

import hypothesis.strategies as st

import pyarrow as pa


# TODO(kszucs): alphanum_text, surrogate_text
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=0, max_value=38),
    scale=st.integers(min_value=0, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
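
# Sketch of how these strategies are typically consumed (assumes the
# hypothesis package is installed; the property checked here is
# illustrative, not from the original file):
import hypothesis as h


@h.given(signed_integer_types)
def check_signed_bit_widths(ty):
    assert ty.bit_width in (8, 16, 32, 64)


check_signed_bit_widths()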
def test_dictionary_delta(format_fixture):
    ty = pa.dictionary(pa.int8(), pa.utf8())
    data = [
        ["foo", "foo", None],
        ["foo", "bar", "foo"],         # potential delta
        ["foo", "bar"],                # nothing new
        ["foo", None, "bar", "quux"],  # potential delta
        ["bar", "quux"],               # replacement
    ]
    batches = [
        pa.RecordBatch.from_arrays([pa.array(v, type=ty)], names=['dicts'])
        for v in data
    ]
    batches_delta_only = batches[:4]
    schema = batches[0].schema

    def write_batches(batches, as_table=False):
        with format_fixture._get_writer(pa.MockOutputStream(),
                                        schema) as writer:
            if as_table:
                table = pa.Table.from_batches(batches)
                writer.write_table(table)
            else:
                for batch in batches:
                    writer.write_batch(batch)
        return writer.stats

    if format_fixture.is_file:
        # File format cannot handle replacement
        with pytest.raises(pa.ArrowInvalid):
            write_batches(batches)
        # File format cannot handle delta if emit_deltas
        # is not provided
        with pytest.raises(pa.ArrowInvalid):
            write_batches(batches_delta_only)
    else:
        st = write_batches(batches)
        assert st.num_record_batches == 5
        assert st.num_dictionary_batches == 4
        assert st.num_replaced_dictionaries == 3
        assert st.num_dictionary_deltas == 0

    format_fixture.use_legacy_ipc_format = None
    format_fixture.options = pa.ipc.IpcWriteOptions(
        emit_dictionary_deltas=True)

    if format_fixture.is_file:
        # File format cannot handle replacement
        with pytest.raises(pa.ArrowInvalid):
            write_batches(batches)
    else:
        st = write_batches(batches)
        assert st.num_record_batches == 5
        assert st.num_dictionary_batches == 4
        assert st.num_replaced_dictionaries == 1
        assert st.num_dictionary_deltas == 2

    st = write_batches(batches_delta_only)
    assert st.num_record_batches == 4
    assert st.num_dictionary_batches == 3
    assert st.num_replaced_dictionaries == 0
    assert st.num_dictionary_deltas == 2

    format_fixture.options = pa.ipc.IpcWriteOptions(unify_dictionaries=True)
    st = write_batches(batches, as_table=True)
    assert st.num_record_batches == 5
    if format_fixture.is_file:
        assert st.num_dictionary_batches == 1
        assert st.num_replaced_dictionaries == 0
        assert st.num_dictionary_deltas == 0
    else:
        assert st.num_dictionary_batches == 4
        assert st.num_replaced_dictionaries == 3
        assert st.num_dictionary_deltas == 0
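
# Standalone sketch of the delta mechanism without the test fixtures
# (assumes a pyarrow recent enough to expose IpcWriteOptions and
# writer.stats): when the second batch's dictionary extends the first
# batch's, only the new entries are written as a delta batch.
def example_dictionary_delta():
    ty = pa.dictionary(pa.int8(), pa.utf8())
    b1 = pa.RecordBatch.from_arrays(
        [pa.array(["foo", "bar"], type=ty)], ['d'])
    b2 = pa.RecordBatch.from_arrays(
        [pa.array(["foo", "bar", "quux"], type=ty)], ['d'])

    sink = pa.BufferOutputStream()
    opts = pa.ipc.IpcWriteOptions(emit_dictionary_deltas=True)
    with pa.ipc.new_stream(sink, b1.schema, options=opts) as writer:
        writer.write_batch(b1)
        writer.write_batch(b2)
    assert writer.stats.num_dictionary_deltas == 1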
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary,
                                         dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use cuda array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate a new output array using rmm and copy the numba device
        # array to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn("Date32 values are not yet supported so this will "
                          "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [as_column(chunk, dtype=dtype)
                    for chunk in arbitrary.chunks]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(
                                np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary, type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
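
# Pure-pyarrow sketch of the buffer layout that the pa.StringArray branch
# above unpacks (no cudf required): buffers() yields the null bitmap, the
# int32 offsets, and the concatenated utf8 data.
import struct

example = pa.array(["ab", None, "c"])
nbuf, obuf, sbuf = example.buffers()
assert struct.unpack('4i', obuf.to_pybytes()) == (0, 2, 2, 3)
assert sbuf.to_pybytes() == b'abc'
assert bytearray(nbuf.to_pybytes())[0] == 0b00000101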
@pytest.mark.parametrize('narr', [
    np.arange(10, dtype=np.float64),
    np.arange(10, dtype=np.float32),
    np.arange(10, dtype=np.float16),
])
def test_to_numpy_roundtrip(narr):
    arr = pa.array(narr)
    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
from pyarrow.pandas_compat import _pandas_api  # noqa
import pyarrow as pa

import collections
import datetime
import decimal
import itertools
import math
import traceback

import numpy as np
import pytz


int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "int", "bitWidth": 8, "isSigned": True},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "timestamp"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "timestamp", "unit": "MICROSECOND"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "date", "unit": "DAY"},
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "list"},
            "children": [{"type": {"name": "int", "bitWidth": 32, "isSigned": True}}],
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32())
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "int", "bitWidth": 32, "isSigned": True},
            "children": [],
            "dictionary": {
                "id": 0,
                "indexType": {"name": "int", "bitWidth": 16, "isSigned": True},
            },
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "nullable": False,
            "metadata": metadata,
            "type": {"name": "list"},
            "children": [],
            "dictionary": {
                "id": 0,
                "indexType": {"name": "int", "bitWidth": 32, "isSigned": True},
            },
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False, metadata)]
                ),
            )
        ),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "metadata": {"metadata_k": "metadata_v"},
            "nullable": False,
            "type": {"name": "dictionary"},
            "dictionary": {"indexType": {"type": {"name": "int", "bitWidth": 8}}},
            "children": [{"type": {"name": "int", "bitWidth": 32}}],
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(), pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict(
        {
            "name": field_name,
            "type": {"name": "struct"},
            "children": [
                {
                    "name": "x",
                    "type": {"name": "int", "bitWidth": 64},
                    "nullable": True,
                    "metadata": {},
                }
            ],
            "metadata": {"metadata_k": "metadata_v"},
            "nullable": False,
        }
    )
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})]
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False