def test_cast_signed_to_unsigned():
    """Run ``_check_cast_case`` for int8 -> uint8 and int16 -> uint16 data.

    Every case uses the values 0..3 and passes the same unsigned Arrow type
    as both the input type and the output type of the case tuple.
    """
    values = [0, 1, 2, 3]
    # (signed numpy dtype, unsigned numpy dtype, unsigned arrow type)
    width_pairs = [
        ('i1', 'u1', pa.uint8()),
        ('i2', 'u2', pa.uint16()),
    ]
    for signed_code, unsigned_code, arrow_type in width_pairs:
        signed_data = np.array(values, dtype=signed_code)
        unsigned_data = np.array(values, dtype=unsigned_code)
        _check_cast_case((signed_data, arrow_type, unsigned_data, arrow_type))
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
        ``None`` is returned when the bit width is not 8, 16, 32 or 64.
    """
    # Signedness only selects the factory family: pa.intN versus pa.uintN.
    family = "int" if jvm_type.isSigned else "uint"
    for width in (8, 16, 32, 64):
        if jvm_type.bitWidth == width:
            # Look the factory up lazily so only the matching one is called.
            return getattr(pa, family + str(width))()
    return None
def test_cast_integers_safe():
    """Exercise integer casts that must succeed and casts that must be rejected.

    The first half feeds int8 data through ``_check_cast_case`` for several
    target types; the second half expects ``pa.ArrowInvalid`` when a value
    does not fit the target integer type.
    """
    small_values = [0, 1, 2, 3]
    # (numpy dtype of the expected data, arrow target type); input is int8.
    # NOTE(review): the second entry pairs uint32 expected data with a uint16
    # target type -- kept as in the original, confirm this is intended.
    fitting_targets = [
        ('i4', pa.int32()),
        ('u4', pa.uint16()),
        ('u1', pa.uint8()),
        ('f8', pa.float64()),
    ]
    for expected_code, target_type in fitting_targets:
        source = np.array(small_values, dtype='i1')
        expected = np.array(small_values, dtype=expected_code)
        _check_cast_case((source, 'int8', expected, target_type))

    # (single value, numpy dtype, arrow input alias, arrow output alias)
    overflowing = [
        (50000, 'i4', 'int32', 'int16'),
        (70000, 'i4', 'int32', 'uint16'),
        (-1, 'i4', 'int32', 'uint16'),
        (50000, 'u2', 'uint16', 'int16'),
    ]
    for value, numpy_code, in_type, out_type in overflowing:
        in_arr = pa.array(np.array([value], dtype=numpy_code), type=in_type)
        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
def test_type_to_pandas_dtype():
    """Check ``DataType.to_pandas_dtype()`` against the expected numpy types."""
    ns_datetime = np.dtype('datetime64[ns]')

    expectations = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
    ]
    # Numeric Arrow types are expected to map to the numpy type of the
    # same name.
    same_name = ('int8', 'int16', 'int32', 'int64',
                 'uint8', 'uint16', 'uint32', 'uint64',
                 'float16', 'float32', 'float64')
    for name in same_name:
        expectations.append((getattr(pa, name)(), getattr(np, name)))
    # Date and timestamp types are expected to become datetime64[ns].
    for temporal in (pa.date32(), pa.date64(), pa.timestamp('ms')):
        expectations.append((temporal, ns_datetime))
    # Binary, string and list types are expected to become object.
    for boxed in (pa.binary(), pa.binary(12), pa.string(),
                  pa.list_(pa.int8())):
        expectations.append((boxed, np.object_))

    for arrow_type, numpy_type in expectations:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_empty_cast():
    """Cast an empty array between every ordered pair of common types.

    Casts reported as not implemented are tolerated; any other failure
    propagates.
    """
    common_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(), pa.int16(), pa.int32(), pa.int64(),
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
        pa.float16(), pa.float32(), pa.float64(),
        pa.date32(), pa.date64(),
        pa.binary(), pa.binary(length=4),
        pa.string(),
    ]

    for source_type in common_types:
        for target_type in common_types:
            try:
                # ARROW-4766: Ensure that supported types conversion don't
                # segfault on empty arrays of common types
                pa.array([], type=source_type).cast(target_type)
            except pa.lib.ArrowNotImplementedError:
                pass
def test_cast_from_null():
    """Cast an all-null array to many types.

    Targets in the first list go through ``_check_cast_case``; dictionary
    and union targets are expected to raise ``NotImplementedError``.
    """
    nulls = [None] * 3
    null_type = pa.null()

    castable = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
    ]
    for target in castable:
        _check_cast_case((nulls, null_type, nulls, target))

    not_castable = [pa.dictionary(pa.int32(), pa.string())]
    # One union per mode, each built from a fresh pair of fields.
    for union_mode in (pa.lib.UnionMode_DENSE, pa.lib.UnionMode_SPARSE):
        not_castable.append(
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=union_mode))

    null_array = pa.array(nulls, type=pa.null())
    for target in not_castable:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
def test_invalid_table_construct():
    """Building a table from columns of different lengths must fail."""
    values = np.array([0, 1], dtype=np.uint8)
    uint8_type = pa.uint8()
    # The second column is one element shorter than the first.
    full_column = pa.array(values, type=uint8_type)
    short_column = pa.array(values[1:], type=uint8_type)

    with pytest.raises(pa.lib.ArrowInvalid):
        pa.Table.from_arrays([full_column, short_column],
                             names=["a1", "a2"])
def test_large_table_int32_overflow():
    """Write a single-column table with one more row than int32 can count."""
    row_count = np.iinfo('int32').max + 1
    ones = np.ones(row_count, dtype='uint8')
    column = pa.array(ones, type=pa.uint8())
    table = pa.Table.from_arrays([column], names=['one'])

    # Write into an in-memory buffer.
    sink = io.BytesIO()
    _write_table(table, sink)
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Parameters
    ----------
    include_index: bool, default False
        When true, an int64 ``__index_level_0__`` field is appended to the
        returned schema.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    numeric_kinds = [('i1', pa.int8()), ('i2', pa.int16()),
                     ('i4', pa.int32()), ('i8', pa.int64()),
                     ('u1', pa.uint8()), ('u2', pa.uint16()),
                     ('u4', pa.uint32()), ('u8', pa.uint64()),
                     ('f4', pa.float32()), ('f8', pa.float64())]

    columns = OrderedDict()
    schema_fields = []

    # Each numeric column has four rows: ranges of length 10, 5 and 1,
    # plus one missing row in third position.
    row_lengths = (10, 5, None, 1)
    for code, arrow_type in numeric_kinds:
        schema_fields.append(pa.field(code, pa.list_(arrow_type)))
        columns[code] = [
            None if length is None else np.arange(length, dtype=code)
            for length in row_lengths
        ]

    schema_fields.append(pa.field('str', pa.list_(pa.string())))
    columns['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object"),
    ]

    schema_fields.append(
        pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    last_stamp = '2010-08-13T05:46:57.437699912'
    columns['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789', None, last_stamp],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02', None, last_stamp],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(columns)
    schema = pa.schema(schema_fields)
    return df, schema
def test_type_for_alias():
    """Check that ``pa.type_for_alias`` resolves each string alias."""
    expected_by_alias = {
        'i1': pa.int8(), 'int8': pa.int8(),
        'i2': pa.int16(), 'int16': pa.int16(),
        'i4': pa.int32(), 'int32': pa.int32(),
        'i8': pa.int64(), 'int64': pa.int64(),
        'u1': pa.uint8(), 'uint8': pa.uint8(),
        'u2': pa.uint16(), 'uint16': pa.uint16(),
        'u4': pa.uint32(), 'uint32': pa.uint32(),
        'u8': pa.uint64(), 'uint64': pa.uint64(),
        'f4': pa.float32(), 'float32': pa.float32(),
        'f8': pa.float64(), 'float64': pa.float64(),
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
    }

    for alias, arrow_type in expected_by_alias.items():
        assert pa.type_for_alias(alias) == arrow_type
def test_is_integer():
    """Check the integer, signed-integer and unsigned-integer predicates."""
    signed = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for int_type in signed:
        assert types.is_integer(int_type)
        assert types.is_signed_integer(int_type)
        assert not types.is_unsigned_integer(int_type)

    for int_type in unsigned:
        assert types.is_integer(int_type)
        assert types.is_unsigned_integer(int_type)
        assert not types.is_signed_integer(int_type)

    # A floating point type is neither an integer nor a signed integer.
    non_integer = pa.float32()
    assert not types.is_integer(non_integer)
    assert not types.is_signed_integer(non_integer)
def test_integer_no_nulls(self):
    """Round-trip a DataFrame holding one random column per integer dtype."""
    dtype_pairs = [
        ('i1', A.int8()), ('i2', A.int16()),
        ('i4', A.int32()), ('i8', A.int64()),
        ('u1', A.uint8()), ('u2', A.uint16()),
        ('u4', A.uint32()), ('u8', A.uint64()),
    ]
    num_values = 100
    int64_max = np.iinfo('i8').max

    columns = {}
    schema_fields = []
    for dtype, arrow_dtype in dtype_pairs:
        limits = np.iinfo(dtype)
        # Cap the upper bound at the int64 maximum.
        upper = min(limits.max, int64_max)
        drawn = np.random.randint(limits.min, upper, size=num_values)
        columns[dtype] = drawn.astype(dtype)
        schema_fields.append(A.Field.from_py(dtype, arrow_dtype))

    self._check_pandas_roundtrip(
        pd.DataFrame(columns),
        expected_schema=A.Schema.from_fields(schema_fields))
def test_take(ty, values):
    """Check ``Array.take`` with valid, empty and out-of-range indices.

    ``values`` must hold at least five elements because position 4 is read.
    """
    arr = pa.array(values, type=ty)
    positions = [0, 4, 2]

    for indices_type in (pa.uint8(), pa.int64()):
        # A null index is expected to produce a null in the output.
        picked = arr.take(pa.array(positions + [None], type=indices_type))
        wanted = pa.array([values[pos] for pos in positions] + [None],
                          type=ty)
        assert picked.equals(wanted)

        # empty indices
        nothing = arr.take(pa.array([], type=indices_type))
        assert nothing.equals(pa.array([], type=ty))

    for bad_positions in ([2, 5], [2, -1]):
        bad_indices = pa.array(bad_positions)
        with pytest.raises(IndexError):
            arr.take(bad_indices)
def test_literals():
    """Build gandiva literals from Arrow type objects and from type aliases.

    Also checks that a value not matching the type, or a ``None`` type,
    raises ``TypeError``.
    """
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    # (python value, arrow type object, equivalent string alias)
    samples = [
        (True, pa.bool_(), "bool"),
        (0, pa.uint8(), "uint8"),
        (1, pa.uint16(), "uint16"),
        (2, pa.uint32(), "uint32"),
        (3, pa.uint64(), "uint64"),
        (4, pa.int8(), "int8"),
        (5, pa.int16(), "int16"),
        (6, pa.int32(), "int32"),
        (7, pa.int64(), "int64"),
        (8.0, pa.float32(), "float32"),
        (9.0, pa.float64(), "float64"),
        ("hello", pa.string(), "string"),
        (b"world", pa.binary(), "binary"),
    ]

    # First pass uses the type objects, second pass the aliases.
    for value, arrow_type, _ in samples:
        builder.make_literal(value, arrow_type)
    for value, _, alias in samples:
        builder.make_literal(value, alias)

    with pytest.raises(TypeError):
        builder.make_literal("hello", pa.int64())
    with pytest.raises(TypeError):
        builder.make_literal(True, None)
def test_cast_from_null():
    """Cast an all-null array to many types.

    Targets in the first list go through ``_check_cast_case``; dictionary
    and union targets are expected to raise ``NotImplementedError``.
    """
    nulls = [None] * 3
    null_type = pa.null()

    castable = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.list_(pa.int8())),
            pa.field('c', pa.string())
        ]),
    ]
    for target in castable:
        _check_cast_case((nulls, null_type, nulls, target))

    not_castable = [pa.dictionary(pa.int32(), pa.string())]
    # One union per mode, each built from a fresh pair of fields.
    for union_mode in (pa.lib.UnionMode_DENSE, pa.lib.UnionMode_SPARSE):
        not_castable.append(
            pa.union([pa.field('a', pa.binary(10)),
                      pa.field('b', pa.string())],
                     mode=union_mode))

    null_array = pa.array(nulls, type=pa.null())
    for target in not_castable:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for dtype objects and dtype-like inputs.

    Covers three groups: explicit ``np.dtype`` instances, objects that numpy
    can coerce to a dtype (strings, scalar types, builtins), and inputs that
    must be rejected (``NotImplementedError`` for the object dtype,
    ``TypeError`` for a string numpy cannot interpret).
    """
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]
    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # np.str_ replaces the former np.unicode alias, which was deprecated in
    # NumPy 1.20 and later removed; both denote the unicode string type.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def test_integer_no_nulls(self):
    """Round-trip a DataFrame holding one random column per integer dtype."""
    dtype_pairs = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('longlong', pa.int64()), ('ulonglong', pa.uint64())
    ]
    num_values = 100
    # Bounds passed to randint are clamped to the range of np.int_.
    default_int = np.iinfo(np.int_)

    columns = OrderedDict()
    schema_fields = []
    for dtype, arrow_dtype in dtype_pairs:
        limits = np.iinfo(dtype)
        lower = max(limits.min, default_int.min)
        upper = min(limits.max, default_int.max)
        drawn = np.random.randint(lower, upper, size=num_values)
        columns[dtype] = drawn.astype(dtype)
        schema_fields.append(pa.field(dtype, arrow_dtype))

    self._check_pandas_roundtrip(
        pd.DataFrame(columns),
        expected_schema=pa.schema(schema_fields))
def generate_type_mapper(
    pd_boolean=None,
    pd_integer=None,
    pd_string=None,
    pd_date_type=None,
    pd_timestamp_type=None,
):
    """Specifies the pyarrow data types mapping to corresponding Pandas data types.

    Args:
        pd_boolean: if truthy, map the Arrow bool type to the new Pandas
            bool type. Defaults to None.
        pd_integer: if truthy, map every Arrow integer type to the new Pandas
            nullable integer type rather than defaulting to floats.
            Defaults to None.
        pd_string: if truthy, map the Arrow string type to the new Pandas
            str type. Defaults to None.
        pd_date_type: when equal to "pd_period", map date64 to a millisecond
            Pandas period type. Defaults to None.
        pd_timestamp_type: when equal to "pd_period", map each timestamp unit
            to the Pandas period type of the same unit. Defaults to None.

    Returns:
        The ``get`` method of the assembled mapping between pyarrow and
        pandas data types, or None if the mapping is empty.
    """
    integer_factories = (
        pa.int8, pa.int16, pa.int32, pa.int64,
        pa.uint8, pa.uint16, pa.uint32, pa.uint64,
    )

    mapping = {}

    if pd_boolean:
        mapping[pa.bool_()] = pd.BooleanDtype()

    if pd_string:
        mapping[pa.string()] = pd.StringDtype()

    if pd_integer:
        for make_type in integer_factories:
            mapping[make_type()] = pd.Int64Dtype()
    else:
        # No brackets for either keys or values here: the keys are the
        # uncalled pyarrow factories and the values the bare numpy type.
        # This lets types_mapper understand the numpy data type.
        for make_type in integer_factories:
            mapping[make_type] = np.float64

    if pd_date_type == "pd_period":
        mapping[pa.date64()] = pd.PeriodDtype("ms")

    if pd_timestamp_type == "pd_period":
        for unit in ("s", "ms", "us", "ns"):
            mapping[pa.timestamp(unit)] = pd.PeriodDtype(unit)

    return mapping.get if mapping else None
d = pa.array([0, 2, 0, 3], type=pa.int32()) eq([a], [a]) ne([a], [b]) eq([a, c], [a, c]) eq([a, c], [d]) ne([c, a], [a, c]) assert not pa.chunked_array([], type=pa.int32()).equals(None) @pytest.mark.parametrize( ('data', 'typ'), [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None ], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))]) def test_chunked_array_pickle(data, typ): arrays = [] while data: arrays.append(pa.array(data[:2], type=typ)) data = data[2:] array = pa.chunked_array(arrays, type=typ) array.validate() result = pickle.loads(pickle.dumps(array)) result.validate() assert result.equals(array)
eq([a], [a]) ne([a], [b]) eq([a, c], [a, c]) eq([a, c], [d]) ne([c, a], [a, c]) @pytest.mark.parametrize( ('data', 'typ'), [ ([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) ] ) def test_chunked_array_pickle(data, typ): arrays = [] while data: arrays.append(pa.array(data[:2], type=typ)) data = data[2:] array = pa.chunked_array(arrays, type=typ) result = pickle.loads(pickle.dumps(array)) assert result.equals(array)
import numpy as np import pyarrow as pa import tensorflow as tf from tfx_bsl.tfxio import tensor_adapter from tfx_bsl.tfxio import tensor_to_arrow from google.protobuf import text_format from absl.testing import absltest from absl.testing import parameterized from tensorflow_metadata.proto.v0 import schema_pb2 _TF_TYPE_TO_ARROW_TYPE = { tf.int8: pa.int8(), tf.int16: pa.int16(), tf.int32: pa.int32(), tf.int64: pa.int64(), tf.uint8: pa.uint8(), tf.uint16: pa.uint16(), tf.uint32: pa.uint32(), tf.uint64: pa.uint64(), tf.float32: pa.float32(), tf.float64: pa.float64(), tf.string: pa.large_binary(), } _ROW_PARTITION_DTYPES = {"INT64": np.int64, "INT32": np.int32} def _make_2d_varlen_sparse_tensor_test_cases(): result = [] for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items(): if tf_type == tf.string:
np.arange(10, dtype=np.float16), ]) def test_to_numpy_roundtrip(narr): arr = pa.array(narr) assert narr.dtype == arr.to_numpy().dtype np.testing.assert_array_equal(narr, arr.to_numpy()) np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy()) np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy()) np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy()) @pytest.mark.parametrize( ('type', 'expected'), [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'), (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'), (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'), (pa.time64('us'), 'time')]) def test_logical_type(type, expected): assert get_logical_type(type) == expected def test_array_uint64_from_py_over_range(): arr = pa.array([2**63], type=pa.uint64()) expected = pa.array(np.array([2**63], dtype='u8'))
import collections
import datetime
import decimal
import itertools

import numpy as np
import six
import pytz


# Pairs of (NumPy integer scalar type, Arrow integer type of the same
# width and signedness).
int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


# Only the NumPy halves of the pairs are kept; the Arrow halves go to `_`.
np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Minimal iterable wrapper that defines ``__iter__`` and nothing else."""

    def __init__(self, lst):
        # The wrapped object is stored as-is.
        self.lst = lst

    def __iter__(self):
        """Return the iterator of the wrapped object."""
        return self.lst.__iter__()
import collections import datetime import decimal import itertools import math import traceback import sys import numpy as np import pytz import six int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int16()), (np.int32, pa.int32()), (np.int64, pa.int64()), (np.uint8, pa.uint8()), (np.uint16, pa.uint16()), (np.uint32, pa.uint32()), (np.uint64, pa.uint64())] np_int_types, _ = zip(*int_type_pairs) class StrangeIterable: def __init__(self, lst): self.lst = lst def __iter__(self): return self.lst.__iter__() class MyInt: def __init__(self, value):
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar _NA_REP = "<NA>" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), np.int64: pa.int64(), np.longlong: pa.int64(), np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), np.bool_: pa.int8(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), np.uint8: pa.uint8(), np.datetime64: pa.date64(), np.object_: pa.string(), np.str_: pa.string(), } cudf_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), np.dtype("uint64"): pd.UInt64Dtype(), np.dtype("int8"): pd.Int8Dtype(), np.dtype("int16"): pd.Int16Dtype(), np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(),
pa.field('b', pa.int8()), pa.field('c', pa.string())]) ] in_dict = {} for i, type_ in enumerate(types): assert hash(type_) == hash(type_) in_dict[type_] = i assert in_dict[type_] == i @pytest.mark.parametrize('t,check_func', [ (pa.date32(), types.is_date32), (pa.date64(), types.is_date64), (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64), (pa.int8(), types.is_int8), (pa.int16(), types.is_int16), (pa.int32(), types.is_int32), (pa.int64(), types.is_int64), (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16), (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64), (pa.float16(), types.is_float16), (pa.float32(), types.is_float32), (pa.float64(), types.is_float64) ]) def test_exact_primitive_types(t, check_func): assert check_func(t)
def read_type(doc):
    """Build the pyarrow data type described by a type document.

    The type tag is read from ``doc[TYPE]`` and the optional type parameter
    from ``doc[PARAM]``. Parameterless tags map directly to a pyarrow
    factory call; ``factor``/``ordered``, ``opaque``, ``list`` and ``struct``
    use the parameter and may recurse into nested type documents.

    Raises:
        ValueError: if the type tag is not one of the supported tags.
    """
    t = doc[TYPE]
    tp = doc[PARAM] if PARAM in doc else None

    # Parameterless tags: tag -> (pyarrow factory name, positional args).
    # The factory is looked up only for the matching tag.
    plain_types = {
        'null': ('null', ()),
        'bool': ('bool_', ()),
        'int8': ('int8', ()),
        'int16': ('int16', ()),
        'int32': ('int32', ()),
        'int64': ('int64', ()),
        'uint8': ('uint8', ()),
        'uint16': ('uint16', ()),
        'uint32': ('uint32', ()),
        'uint64': ('uint64', ()),
        'float16': ('float16', ()),
        'float32': ('float32', ()),
        'float64': ('float64', ()),
        'date[d]': ('date32', ()),
        'date[ms]': ('date64', ()),
        'timestamp[s]': ('timestamp', ('s',)),
        'timestamp[ms]': ('timestamp', ('ms',)),
        'timestamp[us]': ('timestamp', ('us',)),
        'timestamp[ns]': ('timestamp', ('ns',)),
        'time[s]': ('time32', ('s',)),
        'time[ms]': ('time32', ('ms',)),
        'time[us]': ('time64', ('us',)),
        'time[ns]': ('time64', ('ns',)),
        'utf8': ('utf8', ()),
        'bytes': ('binary', ()),
    }
    if isinstance(t, str) and t in plain_types:
        factory_name, factory_args = plain_types[t]
        return getattr(pyarrow, factory_name)(*factory_args)

    if t in ('factor', 'ordered'):
        # Both tags build a dictionary type; only the ordered flag differs.
        if tp is None:
            index_type, dict_type = pyarrow.int32(), pyarrow.utf8()
        else:
            index_type = read_type(tp[INDEX])
            dict_type = read_type(tp[DICT])
        return pyarrow.dictionary(index_type, dict_type, t == 'ordered')

    if t == 'opaque':
        return pyarrow.binary(tp)

    if t == 'list':
        return pyarrow.list_(read_type(tp))

    if t == 'struct':
        members = []
        for f in tp:
            members.append(pyarrow.field(f[NAME], read_type(f)))
        return pyarrow.struct(members)

    raise ValueError(f'{t} is not supported BSON DataFrame type')
ne([a], [b]) eq([a, c], [a, c]) eq([a, c], [d]) ne([c, a], [a, c]) assert not pa.chunked_array([], type=pa.int32()).equals(None) @pytest.mark.parametrize( ('data', 'typ'), [ ([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) ] ) def test_chunked_array_pickle(data, typ): arrays = [] while data: arrays.append(pa.array(data[:2], type=typ)) data = data[2:] array = pa.chunked_array(arrays, type=typ) result = pickle.loads(pickle.dumps(array)) assert result.equals(array)
) null_type = st.just(pa.null()) bool_type = st.just(pa.bool_()) binary_type = st.just(pa.binary()) string_type = st.just(pa.string()) signed_integer_types = st.sampled_from([ pa.int8(), pa.int16(), pa.int32(), pa.int64() ]) unsigned_integer_types = st.sampled_from([ pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64() ]) integer_types = st.one_of(signed_integer_types, unsigned_integer_types) floating_types = st.sampled_from([ pa.float16(), pa.float32(), pa.float64() ]) decimal_type = st.builds( pa.decimal128, precision=st.integers(min_value=1, max_value=38), scale=st.integers(min_value=1, max_value=38)
# them using Java code as well as enables us to define them as parameters # without to invoke the JVM. # # The specifications were created using: # # om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')() # field = … # Code to instantiate the field # jvm_spec = om.writeValueAsString(field) @pytest.mark.parametrize('pa_type,jvm_spec', [ (pa.null(), '{"name":"null"}'), (pa.bool_(), '{"name":"bool"}'), (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'), (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'), (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'), (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'), (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'), (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'), (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'), (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'), (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'), (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'), (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'), (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'), (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'), (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'), (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'), (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",' '"timezone":null}'), (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",' '"timezone":null}'), (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
Args: meta_type ([type]): str """ ac = ArrowConverter() _ = ac.convert_col_type(meta_type) @pytest.mark.parametrize( argnames="meta_type,arrow_type", argvalues=[ ("bool_", pa.bool_()), ("int8", pa.int8()), ("int16", pa.int16()), ("int32", pa.int32()), ("int64", pa.int64()), ("uint8", pa.uint8()), ("uint16", pa.uint16()), ("uint32", pa.uint32()), ("uint64", pa.uint64()), ("float16", pa.float16()), ("float32", pa.float32()), ("float64", pa.float64()), ("decimal128(38,1)", pa.decimal128(38, 1)), ("decimal128(1,2)", pa.decimal128(1, 2)), ("time32(s)", pa.time32("s")), ("time32(ms)", pa.time32("ms")), ("time64(us)", pa.time64("us")), ("time64(ns)", pa.time64("ns")), ("timestamp(s)", pa.timestamp("s")), ("timestamp(ms)", pa.timestamp("ms")), ("timestamp(us)", pa.timestamp("us")),
# under the License. import os import sys import pytest import numpy as np import pyarrow as pa tensor_type_pairs = [ ('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()), ('i8', pa.int64()), ('u1', pa.uint8()), ('u2', pa.uint16()), ('u4', pa.uint32()), ('u8', pa.uint64()), ('f2', pa.float16()), ('f4', pa.float32()), ('f8', pa.float64()) ] def test_tensor_attrs(): data = np.random.randn(10, 4) tensor = pa.Tensor.from_numpy(data) assert tensor.ndim == 2
# This is needed to ensure that allocator tests can be reliable. array = pa.array(data, type=typ) result = pickle.loads(pickle.dumps(array)) assert array.equals(result) @pytest.mark.parametrize( ('type', 'expected'), [ (pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'), (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'), (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'),
def test_get_eq_func():
    """Check the comparison functions returned by ``get_eq_func`` per type.

    For each Arrow type the function obtained from ``get_eq_func`` is called
    on pairs of Python values and the result is asserted to be truthy or
    falsy.
    """
    # Integers: equal values and (None, None) match; None never matches a
    # value.
    for t in [
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
    ]:
        assert not get_eq_func(t)(0, 1)
        assert not get_eq_func(t)(None, 1)
        assert get_eq_func(t)(1, 1)
        assert get_eq_func(t)(None, None)
    # Null type: every pair is expected to compare equal.
    t = pa.null()
    assert get_eq_func(t)("0", "1")
    assert get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    t = pa.string()
    assert not get_eq_func(t)("0", "1")
    assert not get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    t = pa.bool_()
    assert not get_eq_func(t)(False, True)
    assert not get_eq_func(t)(None, False)
    assert not get_eq_func(t)(None, True)
    assert get_eq_func(t)(True, True)
    assert get_eq_func(t)(False, False)
    assert get_eq_func(t)(None, None)
    # Floats: None and NaN are expected to compare equal to each other.
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert not get_eq_func(t)(0.0, 1.1)
        assert get_eq_func(t)(1.1, 1.1)
        assert get_eq_func(t)(None, float("nan"))
        # NOTE(review): the loop variable `n` is never used in the body, so
        # the same two assertions run four times -- confirm whether `n` was
        # meant to replace the `None` arguments.
        for n in [None, float("nan"), float("inf"), float("-inf")]:
            assert not get_eq_func(t)(None, 1.1)
            assert get_eq_func(t)(None, None)
    # Timestamps: None and pd.NaT each equal themselves and each other.
    for t in [pa.timestamp("ns")]:
        for n in [None, pd.NaT]:
            assert not get_eq_func(t)(datetime(2020, 1, 1, 0), datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1, 1), datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    assert get_eq_func(pa.timestamp("ns"))(None, pd.NaT)
    # Dates: values on the same calendar day are expected to compare equal
    # even when the time of day differs or one side is a date object.
    for t in [pa.date32()]:
        for n in [None, pd.NaT]:
            assert get_eq_func(t)(datetime(2020, 1, 1, 0), datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(datetime(2020, 1, 1), datetime(
                2020, 1, 2).date())
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1).date(), datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    # Nested types are compared by value.
    t = pa.struct([pa.field("a", pa.int32())])
    assert not get_eq_func(t)(dict(a=0), dict(a=1))
    assert not get_eq_func(t)(None, dict(a=1))
    assert get_eq_func(t)(dict(a=1), dict(a=1))
    assert get_eq_func(t)(None, None)
    t = pa.list_(pa.int32())
    assert not get_eq_func(t)([0], [1])
    assert not get_eq_func(t)(None, [1])
    assert get_eq_func(t)([1], [1])
    assert get_eq_func(t)(None, None)
# TODO(kszucs): alphanum_text, surrogate_text
# Text strategy restricted to code points 0x41 through 0x7E.
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

# Strategies that always yield one fixed Arrow type.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

# Strategies that sample from a fixed family of Arrow types.
signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
# pa.decimal128 only accepts a precision between 1 and 38, so the precision
# strategy starts at 1; a precision of 0 would make the build step raise.
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
field = pa.field('a', pa.int32()) wr = weakref.ref(field) assert wr() is not None del field assert wr() is None @pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32), (pa.date64(), types.is_date64), (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64), (pa.int8(), types.is_int8), (pa.int16(), types.is_int16), (pa.int32(), types.is_int32), (pa.int64(), types.is_int64), (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16), (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64), (pa.float16(), types.is_float16), (pa.float32(), types.is_float32), (pa.float64(), types.is_float64)]) def test_exact_primitive_types(t, check_func): assert check_func(t) def test_type_id(): # enum values are not exposed publicly for ty in get_many_types(): assert isinstance(ty.id, int)
import datetime
import decimal
import itertools
import math
import traceback

import numpy as np
import pytz


# Pairs of (NumPy integer scalar type, Arrow integer type of the same
# width and signedness).
int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


# Only the NumPy halves of the pairs are kept; the Arrow halves go to `_`.
np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Minimal iterable wrapper that defines ``__iter__`` and nothing else."""

    def __init__(self, lst):
        # The wrapped object is stored as-is.
        self.lst = lst

    def __iter__(self):
        """Return the iterator of the wrapped object."""
        return self.lst.__iter__()
"TIME": pyarrow_time, "TIMESTAMP": pyarrow_timestamp, } ARROW_SCALAR_IDS_TO_BQ = { # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes pyarrow.bool_().id: "BOOL", pyarrow.int8().id: "INT64", pyarrow.int16().id: "INT64", pyarrow.int32().id: "INT64", pyarrow.int64().id: "INT64", pyarrow.uint8().id: "INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME",
def cast_to_compatible_types(table):
    """
    Convert a PyArrow table so that all of its column types are accepted by OmniSci.

    Dictionary columns are turned into strings, date columns into
    ``timestamp[s]`` and unsigned integer columns into a signed integer type.

    Parameters
    ----------
    table : pyarrow.Table
        Table to convert.

    Returns
    -------
    pyarrow.Table
        The converted table, or `table` unchanged when no column required
        a conversion.

    Raises
    ------
    OverflowError
        If the final cast fails and an unsigned integer column was converted.
    RuntimeError
        If the final cast fails and no unsigned integer column was involved.
    """
    # Signed replacement for every unsigned integer type.
    signed_replacement = {
        pa.uint8(): pa.int16(),
        pa.uint16(): pa.int32(),
        pa.uint32(): pa.int64(),
        pa.uint64(): pa.int64(),  # May cause overflow
    }
    source_schema = table.schema
    target_schema = source_schema
    rebuilt_columns = {}
    cast_required = False
    has_unsigned = False

    for position, column_field in enumerate(source_schema):
        source_type = column_field.type
        if pa.types.is_dictionary(source_type):
            # OmniSci can't import Arrow tables with dictionary columns yet,
            # so they are converted to strings:
            # https://github.com/modin-project/modin/issues/1738
            if pa.types.is_null(source_type.value_type):
                # Arrow can't cast a dictionary of nulls to string, so an
                # all-null string column is built by hand instead.
                null_mask = np.full(table.num_rows, True, dtype=bool)
                placeholder = np.empty(table.num_rows, dtype=str)
                rebuilt_columns[position] = pa.array(
                    placeholder, pa.string(), null_mask
                )
            else:
                cast_required = True
            target_type = pa.string()
        elif pa.types.is_date(source_type):
            # OmniSci can't import Arrow's date type either:
            # https://github.com/omnisci/omniscidb/issues/678
            target_type = pa.timestamp("s")
            cast_required = True
        elif pa.types.is_unsigned_integer(source_type):
            # Unsigned types are not supported by OmniSci.
            target_type = signed_replacement[source_type]
            cast_required = True
            has_unsigned = True
        else:
            # Column is already compatible.
            continue

        target_schema = target_schema.set(
            position,
            pa.field(
                column_field.name,
                target_type,
                column_field.nullable,
                column_field.metadata,
            ),
        )

    # The unsigned -> signed conversion may affect the data, so warn about it.
    if has_unsigned:
        ErrorMessage.single_warning(
            "OmniSci does not support unsigned integer types, such types will be rounded up to the signed equivalent."
        )

    # Install hand-built columns first so the cast below sees string data.
    for position, replacement in rebuilt_columns.items():
        table = table.set_column(position, target_schema[position], replacement)

    if not cast_required:
        return table

    try:
        return table.cast(target_schema)
    except pa.lib.ArrowInvalid as err:
        error_type = OverflowError if has_unsigned else RuntimeError
        raise error_type(
            "An error occurred when trying to convert unsupported by OmniSci 'dtypes' "
            + f"to the supported ones, the schema to cast was: \n{target_schema}."
        ) from err
def test_index_store_roundtrip_ts(store, dtype, timestamps):
    """Store an index keyed by `timestamps` and compare it with its reload."""
    partitions = [["part_1", "part_2"], ["part_3"]]
    original = ExplicitSecondaryIndex(
        column="col",
        index_dct=dict(zip(timestamps, partitions)),
        index_storage_key="dataset_uuid/some_index.parquet",
        dtype=dtype,
    )
    stored_key = original.store(store, "dataset_uuid")
    # Load through a fresh index object that only knows the storage key.
    reloaded = ExplicitSecondaryIndex(
        column="col", index_storage_key=stored_key
    ).load(store)
    assert original == reloaded


@pytest.mark.parametrize(
    "dtype,expected", [(pa.int8(), pa.int64()), (pa.uint8(), pa.uint64()), (None, None)]
)
def test_index_normalize_dtype(dtype, expected):
    """Check the `dtype` the index reports for each constructor `dtype`."""
    normalized = ExplicitSecondaryIndex(
        column="col", dtype=dtype, index_storage_key="dataset_uuid/some_index.parquet"
    ).dtype
    assert normalized == expected


def test_index_raises_nested_dtype():
    """Constructing an index with a list dtype raises NotImplementedError."""
    with pytest.raises(NotImplementedError) as exc:
        ExplicitSecondaryIndex(
            column="col",
            dtype=pa.list_(pa.int8()),
            index_storage_key="dataset_uuid/some_index.parquet",
        )
# Copyright (c) 2020, NVIDIA CORPORATION. import random import pandas as pd import pyarrow as pa pyarrow_dtypes_to_pandas_dtypes = { pa.uint8(): pd.UInt8Dtype(), pa.uint16(): pd.UInt16Dtype(), pa.uint32(): pd.UInt32Dtype(), pa.uint64(): pd.UInt64Dtype(), pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.int32(): pd.Int32Dtype(), pa.int64(): pd.Int64Dtype(), pa.bool_(): pd.BooleanDtype(), pa.string(): pd.StringDtype(), } def _generate_rand_meta(obj, dtypes_list): obj._current_params = {} num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) dtypes_meta = [] for _ in range(num_cols): dtype = random.choice(dtypes_list) null_frequency = random.uniform(0, 1)
storage_key = "dataset_uuid/some_index.parquet" index1 = ExplicitSecondaryIndex( column="col", index_dct=dict(zip(timestamps, [["part_1", "part_2"], ["part_3"]])), index_storage_key=storage_key, dtype=dtype, ) key1 = index1.store(store, "dataset_uuid") index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store) assert index1 == index2 @pytest.mark.parametrize("dtype,expected", [(pa.int8(), pa.int64()), (pa.uint8(), pa.uint64()), (None, None)]) def test_index_normalize_dtype(dtype, expected): index = ExplicitSecondaryIndex( column="col", dtype=dtype, index_storage_key="dataset_uuid/some_index.parquet") assert index.dtype == expected def test_index_raises_nested_dtype(): with pytest.raises(NotImplementedError) as exc: ExplicitSecondaryIndex( column="col", dtype=pa.list_(pa.int8()), index_storage_key="dataset_uuid/some_index.parquet",
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import numpy as np import pytest import pyarrow as pa @pytest.mark.parametrize('arrow_type', [ pa.int8(), pa.int16(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint64(), pa.float32(), pa.float64() ]) def test_sum(arrow_type): arr = pa.array([1, 2, 3, 4], type=arrow_type) assert arr.sum() == 10 @pytest.mark.parametrize( ('ty', 'values'), [('bool', [True, False, False, True, True]), ('uint8', np.arange(5)), ('int8', np.arange(5)), ('uint16', np.arange(5)), ('int16', np.arange(5)), ('uint32', np.arange(5)), ('int32', np.arange(5)),
class MisraGriesSketchTest(parameterized.TestCase):
  """Tests adding values to, merging and serializing `MisraGriesSketch`."""

  @parameterized.named_parameters(
      ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
      ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
      ("string", ["a", "a", "b", "c", None], pa.string()),
      ("large_string", ["a", "a", "b", "c"], pa.large_string()),
  )
  def test_add_binary_like(self, values, binary_like_type):
    # Every binary-like type must yield the same bytes-valued estimate.
    sketch = _create_basic_sketch(pa.array(values, type=binary_like_type))
    result = sketch.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"a", "counts": 2.0},
        {"values": b"b", "counts": 1.0},
        {"values": b"c", "counts": 1.0},
    ])

  @parameterized.named_parameters(
      ("int8", [1, 1, 2, 3, None], pa.int8()),
      ("int16", [1, 1, 2, 3], pa.int16()),
      ("int32", [1, 1, 2, 3, None], pa.int32()),
      ("int64", [1, 1, 2, 3], pa.int64()),
      ("uint8", [1, 1, 2, 3], pa.uint8()),
      ("uint16", [1, None, 1, 2, 3], pa.uint16()),
      ("uint32", [1, 1, 2, 3], pa.uint32()),
      ("uint64", [1, 1, 2, 3, None], pa.uint64()),
  )
  def test_add_integer(self, values, integer_type):
    # Integers are expected back as their decimal representation in bytes.
    sketch = _create_basic_sketch(pa.array(values, type=integer_type))
    result = sketch.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"1", "counts": 2.0},
        {"values": b"2", "counts": 1.0},
        {"values": b"3", "counts": 1.0},
    ])

  def test_add_weighted_values(self):
    # The weights of equal items are summed up in the estimate.
    weighted = _create_basic_sketch(
        pa.array(["a", "a", "b", "c"], type=pa.string()),
        weights=pa.array([4, 3, 2, 1], type=pa.float32()))
    result = weighted.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"a", "counts": 7.0},
        {"values": b"b", "counts": 2.0},
        {"values": b"c", "counts": 1.0},
    ])

  def test_add_invalid_weights(self):
    # Integer weights must be rejected.
    items = pa.array(["a", "a", "b", "c"], type=pa.string())
    int_weights = pa.array([4, 3, 2, 1], type=pa.int64())
    with self.assertRaisesRegex(
        RuntimeError, "INVALID_ARGUMENT: Weight array must be float type."):
      _create_basic_sketch(items, weights=int_weights)

  def test_add_unsupported_type(self):
    # Boolean arrays are not accepted by `AddValues`.
    bool_values = pa.array([True, False], pa.bool_())
    sketch = sketches.MisraGriesSketch(_NUM_BUCKETS)
    with self.assertRaisesRegex(RuntimeError, "UNIMPLEMENTED: bool"):
      sketch.AddValues(bool_values)

  def test_replace_invalid_utf8(self):
    first_batch = pa.array([
        b"a",
        b"\x80",  # invalid
        b"\xC1",  # invalid
    ])
    second_batch = pa.array([
        b"\xc0\x80",  # invalid
        b"a"
    ])
    config = {"invalid_utf8_placeholder": b"<BYTES>"}

    left = sketches.MisraGriesSketch(_NUM_BUCKETS, **config)
    left.AddValues(first_batch)
    right = sketches.MisraGriesSketch(_NUM_BUCKETS, **config)
    right.AddValues(second_batch)

    # Round-trip both sketches through serialization before using them again.
    left_bytes = left.Serialize()
    right_bytes = right.Serialize()
    left = sketches.MisraGriesSketch.Deserialize(left_bytes)
    right = sketches.MisraGriesSketch.Deserialize(right_bytes)

    left.AddValues(second_batch)
    left.Merge(right)

    result = left.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"<BYTES>", "counts": 4.0},
        {"values": b"a", "counts": 3.0},
    ])

  def test_no_replace_invalid_utf8(self):
    # Without a placeholder the invalid byte string is reported as is.
    sketch = sketches.MisraGriesSketch(_NUM_BUCKETS)
    sketch.AddValues(pa.array([b"\x80"]))
    result = sketch.Estimate()
    self.assertEqual(result.to_pylist(), [
        {"values": b"\x80", "counts": 1.0},
    ])

  def test_large_string_threshold(self):
    first_batch = pa.array(["a", "bbb", "c", "d", "eeff"])
    second_batch = pa.array(["a", "gghh"])
    config = {
        "large_string_threshold": 2,
        "large_string_placeholder": b"<LARGE>",
    }

    left = sketches.MisraGriesSketch(_NUM_BUCKETS, **config)
    left.AddValues(first_batch)
    right = sketches.MisraGriesSketch(_NUM_BUCKETS, **config)
    right.AddValues(second_batch)

    # Round-trip both sketches through serialization before using them again.
    left_bytes = left.Serialize()
    right_bytes = right.Serialize()
    left = sketches.MisraGriesSketch.Deserialize(left_bytes)
    right = sketches.MisraGriesSketch.Deserialize(right_bytes)

    left.AddValues(second_batch)
    left.Merge(right)

    result = left.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"<LARGE>", "counts": 4.0},
        {"values": b"a", "counts": 3.0},
        {"values": b"c", "counts": 1.0},
        {"values": b"d", "counts": 1.0},
    ])

  def test_invalid_large_string_replacing_config(self):
    # Supplying only one of the two large-string options must fail.
    message = ("Must provide both or neither large_string_threshold and "
               "large_string_placeholder")
    incomplete_configs = (
        {"large_string_threshold": 1024},
        {"large_string_placeholder": b"<L>"},
    )
    for incomplete in incomplete_configs:
      with self.assertRaisesRegex(RuntimeError, message):
        _ = sketches.MisraGriesSketch(_NUM_BUCKETS, **incomplete)

  def test_many_uniques(self):
    # Tail elements with equal counts must survive the `AddValues` call.
    sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]), num_buckets=2)
    result = sketch.Estimate()
    result.validate(full=True)
    # "b" and "c" tie and no token exceeds a count of 4/2, so any pair drawn
    # (in order) from the full list is an acceptable answer.
    candidates = [
        {"values": b"a", "counts": 2.0},
        {"values": b"b", "counts": 1.0},
        {"values": b"c", "counts": 1.0},
    ]
    acceptable = list(itertools.combinations(candidates, 2))
    self.assertIn(tuple(result.to_pylist()), acceptable)

  def test_merge(self):
    target = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    other = _create_basic_sketch(pa.array(["d", "a"]))
    target.Merge(other)

    result = target.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"a", "counts": 3.0},
        {"values": b"b", "counts": 1.0},
        {"values": b"c", "counts": 1.0},
        {"values": b"d", "counts": 1.0},
    ])

  def test_merge_equal_to_kth_weights(self):
    # Tail elements with equal counts must survive the `Compress` call.
    target = _create_basic_sketch(
        pa.array(["a"] * 5 + ["b"] * 5 + ["c"] * 4 + ["a"] * 4), num_buckets=3)
    other = _create_basic_sketch(
        pa.array(["d"] * 4 + ["a"] * 2), num_buckets=3)
    target.Merge(other)

    result = target.Estimate()
    result.validate(full=True)
    # "c" and "d" tie, so either of them may take the last slot.
    leaders = [
        {"values": b"a", "counts": 11.0},
        {"values": b"b", "counts": 5.0},
    ]
    acceptable = [
        leaders + [{"values": last, "counts": 4.0}] for last in (b"c", b"d")
    ]
    self.assertIn(result.to_pylist(), acceptable)

  def test_merge_with_extra_items(self):
    # Every sketch below receives more distinct values than `num_buckets`,
    # which moves the less frequent ones from the main buffer into a buffer
    # of extra elements. The test verifies that merging such sketches is
    # correct and still produces the requested number of elements.
    first = _create_basic_sketch(
        pa.array(["a"] * 3 + ["b"] * 2 + ["c", "d"]), num_buckets=3)
    second = _create_basic_sketch(
        pa.array(["e"] * 3 + ["f"] * 2 + ["g", "h"]), num_buckets=3)
    third = _create_basic_sketch(
        pa.array(["i"] * 2 + ["j", "k", "l"]), num_buckets=3)
    first.Merge(second)
    first.Merge(third)

    result = first.Estimate()
    result.validate(full=True)

    # With this many unique elements (relative to `num_buckets`) the total
    # estimated count error is 5; the third slot may hold any of b/f/i.
    acceptable = [
        [
            {"values": b"a", "counts": 5.0},
            {"values": b"e", "counts": 5.0},
            {"values": least_frequent_item, "counts": 5.0},
        ]
        for least_frequent_item in [b"b", b"f", b"i"]
    ]
    self.assertIn(result.to_pylist(), acceptable)

  def test_picklable(self):
    sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    payload = pickle.dumps(sketch, 2)
    self.assertIsInstance(payload, bytes)
    restored = pickle.loads(payload)
    self.assertIsInstance(restored, sketches.MisraGriesSketch)

    result = restored.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"a", "counts": 2.0},
        {"values": b"b", "counts": 1.0},
        {"values": b"c", "counts": 1.0},
    ])

  def test_serialization(self):
    sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
    payload = sketch.Serialize()
    self.assertIsInstance(payload, bytes)
    restored = sketches.MisraGriesSketch.Deserialize(payload)
    self.assertIsInstance(restored, sketches.MisraGriesSketch)

    result = restored.Estimate()
    result.validate(full=True)
    self.assertEqual(result.to_pylist(), [
        {"values": b"a", "counts": 2.0},
        {"values": b"b", "counts": 1.0},
        {"values": b"c", "counts": 1.0},
    ])

  def test_deserialize_fails_with_exception(self):
    # Input that is not a serialized sketch must raise instead of crashing.
    with self.assertRaisesRegex(RuntimeError,
                                "Failed to parse MisraGries sketch"):
      sketches.MisraGriesSketch.Deserialize("I am no proto")
import pytest from pyarrow.compat import unittest, u # noqa import pyarrow as pa import collections import datetime import decimal import itertools import numpy as np import six import pytz int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int64()), (np.int32, pa.int32()), (np.int64, pa.int64()), (np.uint8, pa.uint8()), (np.uint16, pa.uint64()), (np.uint32, pa.uint32()), (np.uint64, pa.uint64())] np_int_types, _ = zip(*int_type_pairs) class StrangeIterable: def __init__(self, lst): self.lst = lst def __iter__(self): return self.lst.__iter__() def check_struct_type(ty, expected): """
import six import tensorflow as tf from tfx_bsl.tfxio import tensor_adapter from google.protobuf import text_format from absl.testing import absltest from absl.testing import parameterized from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import from tensorflow_metadata.proto.v0 import schema_pb2 _ALL_SUPPORTED_INT_VALUE_TYPES = [ pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(), ] _ALL_SUPPORTED_FLOATING_VALUE_TYPES = [pa.float32(), pa.float64()] _ALL_SUPPORTED_STRING_VALUE_TYPES = [ pa.binary(), pa.large_binary(), pa.string(), pa.large_string() ] _ALL_SUPPORTED_VALUE_TYPES = (_ALL_SUPPORTED_INT_VALUE_TYPES + _ALL_SUPPORTED_FLOATING_VALUE_TYPES + _ALL_SUPPORTED_STRING_VALUE_TYPES) _ARROW_TYPE_TO_TF_TYPE = {
def test_simple_type_construction(): result = pa.lib.TimestampType() with pytest.raises(TypeError): str(result) @pytest.mark.parametrize( ('type', 'expected'), [ (pa.null(), 'float64'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'), (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'), (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'), (pa.uint64(), 'uint64'), (pa.float16(), 'float16'), (pa.float32(), 'float32'), (pa.float64(), 'float64'), (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'), (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'), (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'), (pa.decimal(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'), (pa.timestamp('us', 'UTC'), 'datetimetz'),
import numpy as np import pyarrow as pa try: from scipy.sparse import csr_matrix, coo_matrix except ImportError: coo_matrix = None csr_matrix = None try: import sparse except ImportError: sparse = None tensor_type_pairs = [('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()), ('i8', pa.int64()), ('u1', pa.uint8()), ('u2', pa.uint16()), ('u4', pa.uint32()), ('u8', pa.uint64()), ('f2', pa.float16()), ('f4', pa.float32()), ('f8', pa.float64())] @pytest.mark.parametrize('sparse_tensor_type', [ pa.SparseCSRMatrix, pa.SparseCOOTensor, ]) def test_sparse_tensor_attrs(sparse_tensor_type): data = np.array([ [8, 0, 2, 0, 0, 0], [0, 0, 0, 0, 0, 5], [3, 0, 0, 0, 0, 0], [0, 0, 0, 0, 4, 6],
import pickle import pytest import weakref import numpy as np import pyarrow as pa @pytest.mark.parametrize(['value', 'ty', 'klass', 'deprecated'], [ (False, None, pa.BooleanScalar, pa.BooleanValue), (True, None, pa.BooleanScalar, pa.BooleanValue), (1, None, pa.Int64Scalar, pa.Int64Value), (-1, None, pa.Int64Scalar, pa.Int64Value), (1, pa.int8(), pa.Int8Scalar, pa.Int8Value), (1, pa.uint8(), pa.UInt8Scalar, pa.UInt8Value), (1, pa.int16(), pa.Int16Scalar, pa.Int16Value), (1, pa.uint16(), pa.UInt16Scalar, pa.UInt16Value), (1, pa.int32(), pa.Int32Scalar, pa.Int32Value), (1, pa.uint32(), pa.UInt32Scalar, pa.UInt32Value), (1, pa.int64(), pa.Int64Scalar, pa.Int64Value), (1, pa.uint64(), pa.UInt64Scalar, pa.UInt64Value), (1.0, None, pa.DoubleScalar, pa.DoubleValue), (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue), (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue), (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value), ("string", None, pa.StringScalar, pa.StringValue), (b"bytes", None, pa.BinaryScalar, pa.BinaryValue), ("largestring", pa.large_string(), pa.LargeStringScalar, pa.LargeStringValue), (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
tensor = pa.Tensor.from_numpy(data2) assert not tensor.is_mutable def test_tensor_base_object(): tensor = pa.Tensor.from_numpy(np.random.randn(10, 4)) n = sys.getrefcount(tensor) array = tensor.to_numpy() assert sys.getrefcount(tensor) == n + 1 @pytest.mark.parametrize('dtype_str,arrow_type', [ ('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()), ('i8', pa.int64()), ('u1', pa.uint8()), ('u2', pa.uint16()), ('u4', pa.uint32()), ('u8', pa.uint64()), ('f2', pa.float16()), ('f4', pa.float32()), ('f8', pa.float64()) ]) def test_tensor_numpy_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) data = (100 * np.random.randn(10, 4)).astype(dtype) tensor = pa.Tensor.from_numpy(data) assert tensor.type == arrow_type repr(tensor)