def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0

    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
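# Sketch for context (not part of the suite above): _check_cast_case is a
# helper defined elsewhere; casting a null-typed array directly shows what it
# verifies -- the result takes the target type with every slot null.
_null_arr = pa.array([None] * 3, type=pa.null())
_casted = _null_arr.cast(pa.int64())
assert _casted.type == pa.int64()
assert _casted.to_pylist() == [None, None, None]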
def test_sequence_nesting_levels():
    data = [1, 2, None]
    arr = pa.array(data)
    assert arr.type == pa.int64()
    assert arr.to_pylist() == data

    data = [[1], [2], None]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    data = [[1], [2, 3, 4], [None]]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    data = [None, [[None, 1]], [[2, 3, 4], None], [None]]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.list_(pa.int64()))
    assert arr.to_pylist() == data

    exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)

    # Mixed nesting levels are rejected
    with pytest.raises(exceptions):
        pa.array([1, 2, [1]])

    with pytest.raises(exceptions):
        pa.array([1, 2, []])

    with pytest.raises(exceptions):
        pa.array([[1], [2], [None, [1]]])
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'
def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
    m = np.array([True, False, True])
    s = pd.Series(data)
    result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
    assert pa.Array.from_pandas(expected,
                                type=pa.list_(t())).equals(result)
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2, dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def test_nested_lists(seq):
    data = [[], [1, 2], None]
    arr = pa.array(seq(data))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    # With explicit type
    arr = pa.array(seq(data), type=pa.list_(pa.int32()))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int32())
    assert arr.to_pylist() == data
def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
    # ARROW-2806: numpy.nan is a double value and thus should produce
    # a double array.
    _, pa_type = np_scalar_pa_type
    with pytest.raises(ValueError):
        pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)

    arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
    expected = [[None]]
    assert len(arr) == 1
    assert arr.null_count == 0
    assert arr.type == pa.list_(pa_type)
    assert arr.to_pylist() == expected
def test_list_array_flatten():
    typ2 = pa.list_(pa.list_(pa.int64()))
    arr2 = pa.array([
        None,
        [
            [1, None, 2],
            None,
            [3, 4]
        ],
        [],
        [
            [],
            [5, 6],
            None
        ],
        [
            [7, 8]
        ]
    ])
    assert arr2.type.equals(typ2)

    typ1 = pa.list_(pa.int64())
    arr1 = pa.array([
        [1, None, 2],
        None,
        [3, 4],
        [],
        [5, 6],
        None,
        [7, 8]
    ])
    assert arr1.type.equals(typ1)

    typ0 = pa.int64()
    arr0 = pa.array([
        1, None, 2,
        3, 4,
        5, 6,
        7, 8
    ])
    assert arr0.type.equals(typ0)

    assert arr2.flatten().equals(arr1)
    assert arr1.flatten().equals(arr0)
    assert arr2.flatten().flatten().equals(arr0)
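# A related detail worth noting (a sketch, separate from the test above):
# ListArray.flatten() accounts for the array's offset and nulls, whereas the
# .values attribute exposes the raw child array, so the two differ on slices.
_arr = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64()))
_sliced = _arr.slice(1)  # logical contents: [[3, 4]]
assert _sliced.flatten().to_pylist() == [3, 4]
assert _sliced.values.to_pylist() == [1, 2, 3, 4]  # offsets are ignored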
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
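# The offsets checked above via struct.unpack can also be read through the
# typed accessor (sketch): ListArray exposes its offsets as an Int32Array.
_list_arr = pa.array([[1, 2], None, [3, None, 4, 5]],
                     type=pa.list_(pa.int64()))
assert _list_arr.offsets.to_pylist() == [0, 2, 2, 6]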
def test_list_of_int(self):
    data = [[1, 2, 3], [], None, [1, 2]]
    arr = pyarrow.from_pylist(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pyarrow.list_(pyarrow.int64())
    assert arr.to_pylist() == data
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
def test_nested_arrays(seq):
    arr = pa.array(seq([np.array([], dtype=np.int64),
                        np.array([1, 2], dtype=np.int64), None]))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == [[], [1, 2], None]
def test_nested_ndarray_different_dtypes():
    data = [
        np.array([1, 2, 3], dtype='int64'),
        None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    arr = pa.array(data)
    expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert arr.equals(expected)

    t2 = pa.list_(pa.uint32())
    arr2 = pa.array(data, type=t2)
    expected2 = expected.cast(t2)
    assert arr2.equals(expected2)
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_infer_lists(self):
    data = OrderedDict([
        ('nan_ints', [[None, 1], [2, 3]]),
        ('ints', [[0, 1], [2, 3]]),
        ('strs', [[None, u'b'], [u'c', u'd']]),
        ('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
    ])
    df = pd.DataFrame(data)

    expected_schema = pa.schema([
        pa.field('nan_ints', pa.list_(pa.int64())),
        pa.field('ints', pa.list_(pa.int64())),
        pa.field('strs', pa.list_(pa.string())),
        pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
    ])

    self._check_pandas_roundtrip(df, expected_schema=expected_schema)
def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
def test_list_metadata(self):
    df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
    schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
    table = pa.Table.from_pandas(df, schema=schema)
    metadata = table.schema.metadata
    assert b'mixed' not in metadata[b'pandas']

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    data_column = js['columns'][0]
    assert data_column['pandas_type'] == 'list[int64]'
    assert data_column['numpy_type'] == 'object'
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected

    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
def test_schema_duplicate_fields():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('foo', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'foo']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
foo: list<item: int8>
  child 0, item: int8"""

    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('bar') == fields[1]
    assert sch.field_by_name('xxx') is None
    with pytest.warns(UserWarning):
        assert sch.field_by_name('foo') is None
def test_infer_numpy_array(self):
    data = OrderedDict([
        ('ints', [
            np.array([0, 1], dtype=np.int64),
            np.array([2, 3], dtype=np.int64)
        ])
    ])
    df = pd.DataFrame(data)
    expected_schema = pa.schema([
        pa.field('ints', pa.list_(pa.int64()))
    ])

    self._check_pandas_roundtrip(df, expected_schema=expected_schema)
def dataframe_with_lists():
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [0]
    ]
    fields.append(pa.field('double', pa.list_(pa.double())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [0.]
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"]
    ]

    df = pd.DataFrame(arrays)
    schema = pa.Schema.from_fields(fields)
    return df, schema
def test_schema_equals():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]

    sch1 = pa.schema(fields)
    sch2 = pa.schema(fields)
    assert sch1.equals(sch2)

    del fields[-1]
    sch3 = pa.schema(fields)
    assert not sch1.equals(sch3)
def test_nested_lists_all_none(self):
    data = np.array([[None, None], None], dtype=object)

    arr = pa.array(data)
    expected = pa.array(list(data))
    assert arr.equals(expected)
    assert arr.type == pa.list_(pa.null())

    data2 = np.array([None, None, [None, None],
                      np.array([None, None], dtype=object)],
                     dtype=object)
    arr = pa.array(data2)
    expected = pa.array([None, None, [None, None], [None, None]])
    assert arr.equals(expected)
def test_array_from_py_float32():
    data = [[1.2, 3.4], [9.0, 42.0]]

    t = pa.float32()

    arr1 = pa.array(data[0], type=t)
    arr2 = pa.array(data, type=pa.list_(t))

    expected1 = np.array(data[0], dtype=np.float32)
    expected2 = pd.Series([np.array(data[0], dtype=np.float32),
                           np.array(data[1], dtype=np.float32)])

    assert arr1.type == t
    assert arr1.equals(pa.array(expected1))
    assert arr2.equals(pa.array(expected2))
def test_column_of_arrays_to_py(self):
    # Test regression in ARROW-1199 not caught in above test
    dtype = 'i1'
    arr = np.array([
        np.arange(10, dtype=dtype),
        np.arange(5, dtype=dtype),
        None,
        np.arange(1, dtype=dtype)
    ])
    type_ = pa.list_(pa.int8())
    parr = pa.array(arr, type=type_)

    assert parr[0].as_py() == list(range(10))
    assert parr[1].as_py() == list(range(5))
    assert parr[2].as_py() is None
    assert parr[3].as_py() == [0]
def test_is_primitive():
    assert types.is_primitive(pa.int32())
    assert not types.is_primitive(pa.list_(pa.int32()))
        ]),
    ],
    "type_schema": OrderedDict([
        ("a", int),
        ("b", float),
        ("c", str),
        ("d", np.ndarray),
        ("e", bytes),
    ]),
    "pyarrow_schema": pa.schema([
        ("a", pa.int64()),
        ("b", pa.float64()),
        ("c", pa.string()),
        ("d", pa.list_(pa.int64())),
        ("e", pa.binary()),
    ]) if pa is not None else None,
    "avro_schema": {
        "namespace": "example.avro",
        "name": "User",
        "type": "record",
        "fields": [
            {"name": "a", "type": "int"},
            {
    buf = s.as_buffer()
    assert isinstance(buf, pa.Buffer)
    assert buf.to_pybytes() == value


def test_fixed_size_binary():
    s = pa.scalar(b'foof', type=pa.binary(4))
    assert isinstance(s, pa.FixedSizeBinaryScalar)
    assert s.as_py() == b'foof'

    with pytest.raises(pa.ArrowInvalid):
        pa.scalar(b'foof5', type=pa.binary(4))


@pytest.mark.parametrize(('ty', 'klass'), [
    (pa.list_(pa.string()), pa.ListScalar),
    (pa.large_list(pa.string()), pa.LargeListScalar)
])
def test_list(ty, klass):
    v = ['foo', None]
    s = pa.scalar(v, type=ty)
    assert s.type == ty
    assert len(s) == 2
    assert isinstance(s.values, pa.Array)
    assert s.values == v
    assert isinstance(s, klass)
    assert repr(v) in repr(s)
    assert s.as_py() == v
    assert s[0].as_py() == 'foo'
    assert s[1].as_py() is None
    assert s[-1] == s[1]
    assert s[-2] == s[0]
def test_arrow_list_functions():
    lst = np.array([['a, bc'], ['de'], ['e', 'ee'], ['中文', '中文2']],
                   dtype=object)
    has_na_lst = lst.copy()
    has_na_lst[1] = None

    for pandas_only in [False, True]:
        with option_context({'dataframe.arrow_array.pandas_only':
                             pandas_only}):
            arrow_array = ArrowListArray(lst)
            has_na_arrow_array = ArrowListArray(has_na_lst)

            # getitem, scalar
            assert arrow_array[1] == lst[1]
            assert list(arrow_array[-1]) == lst[-1]
            # getitem, slice
            np.testing.assert_array_equal(arrow_array[:2].to_numpy(), lst[:2])

            # setitem
            arrow_array2 = arrow_array.copy()
            lst2 = lst.copy()
            for s in [['ss'], pd.Series(['ss'])]:
                arrow_array2[0] = s
                lst2[0] = ['ss']
                np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2)
            arrow_array2[0] = None
            lst2[0] = None
            np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2)
            with pytest.raises(ValueError):
                # must set list like object
                arrow_array2[0] = 'ss'

            # test to_numpy
            np.testing.assert_array_equal(arrow_array.to_numpy(), lst)
            np.testing.assert_array_equal(arrow_array.to_numpy(copy=True),
                                          lst)
            np.testing.assert_array_equal(
                has_na_arrow_array.to_numpy(na_value=1),
                pd.Series(has_na_lst).fillna(1).to_numpy())

            # test fillna
            if not pandas_only:
                arrow_array3 = has_na_arrow_array.fillna(lst[1])
                np.testing.assert_array_equal(arrow_array3.to_numpy(), lst)

            # test astype
            with pytest.raises(TypeError):
                arrow_array.astype(np.int64)
            with pytest.raises(TypeError):
                arrow_array.astype(ArrowListDtype(np.int64))
            arrow_array4 = ArrowListArray([[1, 2], [3]])
            expected = np.array([['1', '2'], ['3']], dtype=object)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(str)), expected)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(arrow_array4.dtype)),
                arrow_array4)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(arrow_array4.dtype),
                                    copy=False),
                arrow_array4)

            # test nbytes
            assert arrow_array.nbytes < pd.Series(lst).memory_usage(deep=True)

            # test memory_usage
            if not pandas_only:
                assert arrow_array.memory_usage(deep=True) == \
                    arrow_array.nbytes

            # test isna
            np.testing.assert_array_equal(has_na_arrow_array.isna(),
                                          pd.Series(has_na_lst).isna())

            # test take
            assert list(arrow_array.take([1, 2, -1])) == \
                list(pd.Series(lst).take([1, 2, -1]))

            # test shift
            assert list(arrow_array.shift(2, fill_value=['aa'])) == \
                [['aa']] * 2 + lst[:-2].tolist()

            # test all any
            if _use_bool_any_all:
                assert arrow_array.all() == pd.array(lst).all()
                assert arrow_array.any() == pd.array(lst).any()
            else:
                assert arrow_array.all() == lst.all()
                assert arrow_array.any() == lst.any()

            # test repr
            assert 'ArrowListArray' in repr(arrow_array)

            # test concat empty
            arrow_array5 = ArrowListArray(
                pa.chunked_array([], type=pa.list_(pa.string())))
            concatenated = ArrowListArray._concat_same_type(
                [arrow_array5, arrow_array5])
            if not pandas_only:
                assert len(concatenated._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(arrow_array5),
                                           pd.Series(concatenated))
all_array_types = [
    ('bool', [True, False, False, True, True]),
    ('uint8', np.arange(5)),
    ('int8', np.arange(5)),
    ('uint16', np.arange(5)),
    ('int16', np.arange(5)),
    ('uint32', np.arange(5)),
    ('int32', np.arange(5)),
    ('uint64', np.arange(5, 10)),
    ('int64', np.arange(5, 10)),
    ('float', np.arange(0, 0.5, 0.1)),
    ('double', np.arange(0, 0.5, 0.1)),
    ('string', ['a', 'b', None, 'ddd', 'ee']),
    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (pa.struct([('a', pa.int8()), ('b', pa.int8())]),
     [{'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
]

exported_functions = [
    func for (name, func) in sorted(pc.__dict__.items())
def test_writing_empty_lists():
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    arr1 = pa.array([[], []], pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr1], ['list(int32)'])
    _check_roundtrip(table)
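# _check_roundtrip is a shared helper defined elsewhere in this suite; a
# minimal in-memory equivalent (a sketch, assuming default writer settings)
# would write the table to a Parquet buffer and read it back:
def _roundtrip_sketch(table):
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    result = pq.read_table(pa.BufferReader(buf.getvalue()))
    assert result.equals(table)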
    (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue),
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    (decimal.Decimal("1.1234567890123456789012345678901234567890"),
     None, pa.Decimal256Scalar, pa.Decimal256Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar,
     pa.LargeListValue),
    ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar,
     pa.FixedSizeListValue),
    (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value),
    (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value),
    (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue),
    (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'),
     pa.Time32Scalar, pa.Time32Value),
    (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value),
    (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue),
    ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue),
    ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar,
     pa.MapValue),
])
def test_basics(value, ty, klass, deprecated):
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([1, 2])
    schema = pa.schema([pa.field('f0', pa.list_(pa.utf8()))])
    with pytest.raises(NotImplementedError):
        pa.record_batch([arr], schema=schema)
def test_list_type():
    ty = pa.list_(pa.int64())
    assert ty.value_type == pa.int64()
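# The element is also exposed as a full field (a sketch, not from the
# original test): the default child field of pa.list_ is named 'item'.
assert pa.list_(pa.int64()).value_field == pa.field('item', pa.int64())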
def test_schema_pyarrow_types():
    field_name = "column1"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "int", "bitWidth": 8, "isSigned": True},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.int8()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_no_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "timestamp"},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("ns")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "column_timestamp_with_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "timestamp", "unit": "MICROSECOND"},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.timestamp("us")
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "date_with_day_unit"
    metadata = {b"metadata_k": b"metadata_v"}
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "date", "unit": "DAY"},
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.date32()
    assert dict(pyarrow_field.metadata) == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_list"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "list"},
        "children": [{"type": {"name": "int", "bitWidth": 32,
                               "isSigned": True}}],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.list_(
        pyarrow.field("element", pyarrow.int32()))
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "int", "bitWidth": 32, "isSigned": True},
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {"name": "int", "bitWidth": 16, "isSigned": True},
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int16(),
                                              pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "struct_array"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "nullable": False,
        "metadata": metadata,
        "type": {"name": "list"},
        "children": [],
        "dictionary": {
            "id": 0,
            "indexType": {"name": "int", "bitWidth": 32, "isSigned": True},
        },
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(
        pyarrow.int32(),
        pyarrow.list_(
            pyarrow.field(
                "element",
                pyarrow.struct(
                    [pyarrow.field("val", pyarrow.int32(), False,
                                   metadata)]),
            )),
    )
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    field_name = "simple_dictionary"
    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "metadata": {"metadata_k": "metadata_v"},
        "nullable": False,
        "type": {"name": "dictionary"},
        "dictionary": {"indexType": {"type": {"name": "int",
                                              "bitWidth": 8}}},
        "children": [{"type": {"name": "int", "bitWidth": 32}}],
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.map_(pyarrow.int8(),
                                              pyarrow.int32())
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False

    pyarrow_field = pyarrow_field_from_dict({
        "name": field_name,
        "type": {"name": "struct"},
        "children": [{
            "name": "x",
            "type": {"name": "int", "bitWidth": 64},
            "nullable": True,
            "metadata": {},
        }],
        "metadata": {"metadata_k": "metadata_v"},
        "nullable": False,
    })
    assert pyarrow_field.name == field_name
    assert pyarrow_field.type == pyarrow.struct(
        [pyarrow.field("x", pyarrow.int64(), True, {})])
    assert pyarrow_field.metadata == metadata
    assert pyarrow_field.nullable is False
def test_list_with_non_list(seq):
    # List types don't accept non-sequences
    with pytest.raises(pa.ArrowTypeError):
        pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64()))
def test_is_null():
    assert types.is_null(pa.null())
    assert not types.is_null(pa.list_(pa.int32()))
    )),
    type_fails_on_stable_pandas(
        FletcherTestType(
            pa.float64(),
            [2.5, 1.0, -1.0, 0, 66.6] * 20,
            [None, 1.1],
            [2.5, 2.5, None, None, -100.1, -100.1, 2.5, 100.1],
            [2.5, 100.99, -10.1],
            [2.5, None, -10.1],
            lambda: choices([2.5, 1.0, -1.0, 0, 66.6], k=10),
        )),
    # Most of the tests fail as assert_extension_array_equal casts to numpy
    # object arrays and on them equality is not defined.
    pytest.param(
        FletcherTestType(
            pa.list_(pa.string()),
            [["B", "C"], ["A"], [None], ["A", "A"], []],
            [None, ["A"]],
            [["B"], ["B"], None, None, ["A"], ["A"], ["B"], ["C"]],
            [["B"], ["C"], ["A"]],
            [["B"], None, ["A"]],
            lambda: choices([["B", "C"], ["A"], [None], ["A", "A"]], k=10),
        ),
        marks=pytest.mark.xfail,
    ),
    FletcherTestType(
        pa.date64(),
        [
            datetime.date(2015, 1, 1),
            datetime.date(2010, 12, 31),
            datetime.date(1970, 1, 1),
def test_list_from_numpy():
    s = pa.scalar(np.array([1, 2, 3], dtype=np.int64()))
    assert s.type == pa.list_(pa.int64())
    assert s.as_py() == [1, 2, 3]
def test_empty_lists_table_roundtrip(use_legacy_dataset):
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    arr = pa.array([[], []], type=pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr], ["A"])
    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
def __arrow_array__(self, type=None):
    """This function is called when calling pa.array(typed_sequence)"""
    assert type is None, "TypedSequence is supposed to be used with pa.array(typed_sequence, type=None)"
    trying_type = False
    if type is not None:  # user explicitly passed the feature
        pass
    elif type is None and self.try_type:
        type = self.try_type
        trying_type = True
    else:
        type = self.type
    try:
        if isinstance(type, _ArrayXDExtensionType):
            if isinstance(self.data, np.ndarray):
                storage = numpy_to_pyarrow_listarray(self.data, type=type.value_type)
            else:
                storage = pa.array(self.data, type.storage_dtype)
            out = pa.ExtensionArray.from_storage(type, storage)
        elif isinstance(self.data, np.ndarray):
            out = numpy_to_pyarrow_listarray(self.data)
        elif isinstance(self.data, list) and self.data and isinstance(self.data[0], np.ndarray):
            out = list_of_np_array_to_pyarrow_listarray(self.data)
        else:
            out = pa.array(cast_to_python_objects(self.data, only_1d_for_numpy=True), type=type)
        if trying_type and out[0].as_py() != self.data[0]:
            raise TypeError(
                "Specified try_type alters data. Please check that the type/feature that you provided match the type/features of the data."
            )
        if self.optimized_int_type and self.type is None and self.try_type is None:
            if pa.types.is_int64(out.type):
                out = out.cast(self.optimized_int_type)
            elif pa.types.is_list(out.type):
                if pa.types.is_int64(out.type.value_type):
                    out = out.cast(pa.list_(self.optimized_int_type))
                elif pa.types.is_list(out.type.value_type) and pa.types.is_int64(out.type.value_type.value_type):
                    out = out.cast(pa.list_(pa.list_(self.optimized_int_type)))
        return out
    except (TypeError, pa.lib.ArrowInvalid) as e:  # handle type errors and overflows
        if trying_type:
            try:  # second chance
                if isinstance(self.data, np.ndarray):
                    return numpy_to_pyarrow_listarray(self.data, type=None)
                else:
                    return pa.array(self.data, type=None)
            except pa.lib.ArrowInvalid as e:
                if "overflow" in str(e):
                    raise OverflowError(
                        "There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
                            type_(self.data), e
                        )
                    ) from None
                else:
                    raise
        elif "overflow" in str(e):
            raise OverflowError(
                "There was an overflow with type {}. Try to reduce writer_batch_size to have batches smaller than 2GB.\n({})".format(
                    type_(self.data), e
                )
            ) from None
        else:
            raise
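# For context (a minimal sketch, separate from the class above): pa.array()
# recognizes the __arrow_array__ protocol on arbitrary objects and returns
# whatever the hook produces, which is what makes the method above work.
class _ArrowArrayProtocolSketch:
    def __init__(self, data):
        self.data = data

    def __arrow_array__(self, type=None):
        # pa.array(obj) calls obj.__arrow_array__(type=...) when present.
        return pa.array(self.data, type=type)

assert pa.array(_ArrowArrayProtocolSketch([1, 2, None])).to_pylist() == \
    [1, 2, None]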
def __call__(self):
    return pa.struct({
        "language": pa.list_(pa.string()),
        "translation": pa.list_(pa.string())
    })
def ArrowSchema(self):
    return pa.schema(
        [pa.field(c, pa.list_(pa.int32())) for c in self._columns])
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    if not ARROW_LARGER_EQ_0130:
        fields.append(pa.field("__index_level_0__", pa.int64()))
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
def test_field_id_metadata():
    # ARROW-7080
    field_id = b'PARQUET:field_id'
    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
    middle = pa.field('middle', pa.struct([inner]),
                      metadata={field_id: b'101'})
    fields = [
        pa.field('basic', pa.int32(),
                 metadata={b'other': b'abc', field_id: b'1'}),
        pa.field('list',
                 pa.list_(pa.field('list-inner', pa.int32(),
                                   metadata={field_id: b'10'})),
                 metadata={field_id: b'11'}),
        pa.field('struct', pa.struct([middle]),
                 metadata={field_id: b'102'}),
        pa.field('no-metadata', pa.int32()),
        pa.field('non-integral-field-id', pa.int32(),
                 metadata={field_id: b'xyz'}),
        pa.field('negative-field-id', pa.int32(),
                 metadata={field_id: b'-1000'})
    ]
    arrs = [[] for _ in fields]
    table = pa.table(arrs, schema=pa.schema(fields))

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    assert schema[0].metadata[field_id] == b'1'
    assert schema[0].metadata[b'other'] == b'abc'

    list_field = schema[1]
    assert list_field.metadata[field_id] == b'11'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_id] == b'10'

    struct_field = schema[2]
    assert struct_field.metadata[field_id] == b'102'

    struct_middle_field = struct_field.type[0]
    assert struct_middle_field.metadata[field_id] == b'101'

    struct_inner_field = struct_middle_field.type[0]
    assert struct_inner_field.metadata[field_id] == b'100'

    assert schema[3].metadata is None
    # Invalid input is passed through (ok) but does not
    # have field_id in parquet (not tested)
    assert schema[4].metadata[field_id] == b'xyz'
    assert schema[5].metadata[field_id] == b'-1000'
def from_ibis_set(dtype):
    return pa.list_(to_pyarrow_type(dtype.value_type))
def test_struct_from_dicts_inference():
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises(pa.ArrowInvalid):
        pa.array([1, {'a': 2}])
def test_query_indices_external(store, metadata_version):
    expected = {
        "dataset_metadata_version": metadata_version,
        "dataset_uuid": "uuid+namespace-attribute12_underscored",
        "partitions": {
            "part_1": {"files": {"core_data": "file.parquest"}},
            "part_2": {"files": {"core_data": "file2.parquest"}},
        },
        "indices": {
            "product_id": "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
            "location_id": {
                "1": ["part_1"],
                "2": ["part_2"],
                "3": ["part_1"],
                "4": ["part_2"],
            },
        },
    }
    store.put(
        "uuid+namespace-attribute12_underscored.by-dataset-metadata.json",
        simplejson.dumps(expected).encode("utf-8"),
    )
    df = pd.DataFrame({
        "product_id": [1, 2, 100, 34],
        "partition": [
            np.array(["part_1"], dtype=object),
            np.array(["part_2"], dtype=object),
            np.array(["part_1", "part_2"], dtype=object),
            np.array(["part_1"], dtype=object),
        ],
    })
    schema = pa.schema([
        pa.field("partition", pa.list_(pa.string())),
        pa.field("product_id", pa.int64()),
    ])
    table = pa.Table.from_pandas(df, schema=schema)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)
    store.put(
        "uuid+namespace-attribute12_underscored.product_id.by-dataset-index.parquet",
        buf.getvalue().to_pybytes(),
    )
    store_schema_metadata(
        make_meta(df, origin="core"),
        "uuid+namespace-attribute12_underscored",
        store,
        "core_data",
    )

    dmd = DatasetMetadata.load_from_store(
        "uuid+namespace-attribute12_underscored", store)

    dmd = dmd.load_index("product_id", store)
    assert dmd.query(product_id=2) == ["part_2"]
    dmd = dmd.load_all_indices(store)
    assert dmd.query(product_id=2, location_id=2) == ["part_2"]
    assert dmd.query(product_id=100, location_id=3) == ["part_1"]
    assert dmd.query(product_id=2, location_id=2,
                     something_else="bla") == ["part_2"]

    additional_index = ExplicitSecondaryIndex.from_v2(
        "another_column", {"1": ["part_2", "part_3"]})
    assert dmd.query(indices=[additional_index],
                     another_column="1",
                     product_id=2,
                     location_id=2) == ["part_2"]
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
}

_string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}


class FletcherDtype(ExtensionDtype):

    def __init__(self, arrow_dtype):
        self.arrow_dtype = arrow_dtype

    def __hash__(self):
        return hash(self.arrow_dtype)

    def __str__(self):
        return "fletcher[{}]".format(self.arrow_dtype)
def test_nested_arrays(seq):
    arr = pa.array(seq([np.array([], dtype=int),
                        np.array([1, 2]), None]))
    assert len(arr) == 3
    assert arr.null_count == 1
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == [[], [1, 2], None]
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([pa.field('a', pa.int32()),
               pa.field('b', pa.int8()),
               pa.field('c', pa.string())]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]
def arrow_type(self):
    if isinstance(self._value_type, ArrowDtype):
        arrow_subdtype = self._value_type.arrow_type
    else:
        arrow_subdtype = pa.from_numpy_dtype(self._value_type)
    return pa.list_(arrow_subdtype)
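# A quick illustration of the mapping above (a sketch using only the public
# pyarrow API; ArrowDtype in the method refers to the surrounding library's
# dtype class, not a pyarrow type):
assert pa.from_numpy_dtype(np.dtype('int64')) == pa.int64()
assert pa.list_(pa.from_numpy_dtype(np.dtype('float32'))) == \
    pa.list_(pa.float32())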
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()),
     ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()),
     (['a', None, 'b'], pa.string()),
     ([], pa.list_(pa.uint8())),
     ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None],
      pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))]
)
def test_chunked_array_pickle(data, typ):
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    array = pa.chunked_array(arrays, type=typ)
    array.validate()
    result = pickle.loads(pickle.dumps(array))
    result.validate()
    assert result.equals(array)
def test_is_list():
    assert types.is_list(pa.list_(pa.int32()))
    assert not types.is_list(pa.int32())
# limitations under the License.
#
"""A DoFn that converts a batch of features into an Arrow table."""

import apache_beam as beam
import pyarrow as pa
from typing import Dict, List, Mapping, Text, Union

from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0 import statistics_pb2

# ColumnType is assumed to be defined (or imported) elsewhere in this module;
# it enumerates the feature types mapped below.
_ARROW_TYPE_MAP = {
    ColumnType.UNKNOWN: pa.null(),
    ColumnType.INT: pa.list_(pa.int64()),
    ColumnType.FLOAT: pa.list_(pa.float32()),
    ColumnType.STRING: pa.list_(pa.binary()),
}

SimpleFeatureList = List[Union[int, str, float, bool]]
ColumnName = Union[bytes, Text]


@beam.typehints.with_input_types(List[SimpleFeatureList])
@beam.typehints.with_output_types(pa.RecordBatch)
class BatchedFeatureListsToRecordBatch(beam.DoFn):
  """A DoFn to convert a batch of input instances in a feature list format
  to an Arrow table.
  """
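# A minimal sketch of the conversion this DoFn performs (hypothetical feature
# names; the real implementation derives names and types from the schema
# proto): each feature becomes a list-typed Arrow column, one list per
# input instance.
_example_batch = pa.RecordBatch.from_arrays(
    [pa.array([[1, 2], [3]], type=pa.list_(pa.int64())),
     pa.array([[0.5], None], type=pa.list_(pa.float32()))],
    ['int_feature', 'float_feature'])
assert _example_batch.num_rows == 2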