예제 #1
0
def test_cast_signed_to_unsigned():
    """Cast small non-negative signed arrays to the same-width unsigned type.

    Each case is ``(in_data, in_type, out_data, out_type)``.  The input type
    is the *signed* Arrow type matching the numpy input, so that the cast
    under test really goes from signed to unsigned.
    """
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), pa.int8(),
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i2'), pa.int16(),
         np.array([0, 1, 2, 3], dtype='u2'), pa.uint16())
    ]

    for case in safe_cases:
        _check_cast_case(case)
예제 #2
0
파일: jvm.py 프로젝트: rok/arrow
def _from_jvm_int_type(jvm_type):
    """
    Map a JVM Arrow integer type onto the matching pyarrow integer type.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
        ``None`` is returned when the bit width is not 8, 16, 32 or 64.
    """
    if jvm_type.isSigned:
        candidates = ((8, pa.int8), (16, pa.int16),
                      (32, pa.int32), (64, pa.int64))
    else:
        candidates = ((8, pa.uint8), (16, pa.uint16),
                      (32, pa.uint32), (64, pa.uint64))

    # Compare with ``==`` rather than a dict lookup so that whatever object
    # the JVM bridge hands back for ``bitWidth`` is treated as before.
    for width, factory in candidates:
        if jvm_type.bitWidth == width:
            return factory()
    return None
예제 #3
0
def test_cast_integers_safe():
    """Check in-range integer casts succeed and out-of-range ones raise.

    Safe cases are ``(in_data, in_type, out_data, out_type)`` tuples handed
    to ``_check_cast_case``.  Unsafe cases are ``(in_data, in_type,
    out_type)`` and must raise ``pa.ArrowInvalid`` when cast.
    """
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
        # Expected data uses 'u2' so that it matches the uint16 output type.
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u2'), pa.uint16()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
    ]

    for case in safe_cases:
        _check_cast_case(case)

    # Values that do not fit the target type (too large, or negative for an
    # unsigned target).
    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32', 'int16'),
        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
        (np.array([50000], dtype='u2'), 'uint16', 'int16')
    ]
    for in_data, in_type, out_type in unsafe_cases:
        in_arr = pa.array(in_data, type=in_type)

        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
예제 #4
0
def test_type_to_pandas_dtype():
    """Check ``to_pandas_dtype()`` for a range of Arrow types."""
    datetime_ns = np.dtype('datetime64[ns]')

    expectations = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
    ]
    # Fixed-width numeric types pair with the numpy type of the same name.
    numeric_names = ('int8', 'int16', 'int32', 'int64',
                     'uint8', 'uint16', 'uint32', 'uint64',
                     'float16', 'float32', 'float64')
    expectations.extend(
        (getattr(pa, name)(), getattr(np, name)) for name in numeric_names
    )
    # Date and timestamp types are all expected as nanosecond datetime64.
    temporal_types = (pa.date32(), pa.date64(), pa.timestamp('ms'))
    expectations.extend((typ, datetime_ns) for typ in temporal_types)
    # Binary, string and list types are expected as object dtype.
    object_types = (pa.binary(), pa.binary(12), pa.string(),
                    pa.list_(pa.int8()))
    expectations.extend((typ, np.object_) for typ in object_types)

    for arrow_type, expected_dtype in expectations:
        assert arrow_type.to_pandas_dtype() == expected_dtype
예제 #5
0
def test_empty_cast():
    """Cast empty arrays between every pair of a set of common types.

    ARROW-4766: supported conversions must not segfault on empty arrays.
    Combinations that raise ``ArrowNotImplementedError`` are ignored.
    """
    common_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for source_type in common_types:
        for target_type in common_types:
            try:
                pa.array([], type=source_type).cast(target_type)
            except pa.lib.ArrowNotImplementedError:
                # Unsupported conversion: nothing to check for this pair.
                pass
예제 #6
0
파일: test_array.py 프로젝트: rok/arrow
def test_cast_from_null():
    """Cast an all-null array to many target types.

    The first group of targets goes through ``_check_cast_case``; the second
    group (dictionary and union types) must raise ``NotImplementedError``.
    """
    nulls = [None] * 3
    null_type = pa.null()

    struct_type = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.list_(pa.int8())),
        pa.field('c', pa.string()),
    ])
    supported_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        struct_type,
    ]
    for target in supported_targets:
        _check_cast_case((nulls, null_type, nulls, target))

    unsupported_targets = [pa.dictionary(pa.int32(), pa.string())]
    for union_mode in (pa.lib.UnionMode_DENSE, pa.lib.UnionMode_SPARSE):
        union_fields = [pa.field('a', pa.binary(10)),
                        pa.field('b', pa.string())]
        unsupported_targets.append(pa.union(union_fields, mode=union_mode))

    null_array = pa.array(nulls, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
예제 #7
0
def test_invalid_table_construct():
    """Building a table from columns of length 2 and 1 raises ArrowInvalid."""
    values = np.array([0, 1], dtype=np.uint8)
    column_type = pa.uint8()
    # The second column is deliberately one element shorter than the first.
    columns = [pa.array(chunk, type=column_type)
               for chunk in (values, values[1:])]

    with pytest.raises(pa.lib.ArrowInvalid):
        pa.Table.from_arrays(columns, names=["a1", "a2"])
예제 #8
0
def test_large_table_int32_overflow():
    """Write a one-column uint8 table with one more row than int32 can hold."""
    row_count = np.iinfo('int32').max + 1
    ones = np.ones(row_count, dtype='uint8')
    column = pa.array(ones, type=pa.uint8())
    table = pa.Table.from_arrays([column], names=['one'])

    # Only the write into an in-memory buffer is exercised; nothing is read
    # back or asserted afterwards.
    sink = io.BytesIO()
    _write_table(table, sink)
예제 #9
0
def dataframe_with_arrays(include_index=False):
    """
    Build a DataFrame whose cells are numpy arrays (or None), with one column
    per primitive numeric dtype plus a string and a datetime64 column.

    Parameters
    ----------
    include_index: bool, default False
        When true, an int64 ``__index_level_0__`` field is appended to the
        returned schema (the DataFrame itself is unchanged).

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    numeric_types = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('f4', pa.float32()), ('f8', pa.float64()),
    ]

    columns = OrderedDict()
    schema_fields = []

    # Numeric columns: four rows of lengths 10, 5, <missing>, 1.
    for np_code, arrow_type in numeric_types:
        schema_fields.append(pa.field(np_code, pa.list_(arrow_type)))
        columns[np_code] = [
            np.arange(10, dtype=np_code),
            np.arange(5, dtype=np_code),
            None,
            np.arange(1, dtype=np_code),
        ]

    # String column holding object arrays, including a non-ASCII value.
    schema_fields.append(pa.field('str', pa.list_(pa.string())))
    columns['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object"),
    ]

    # Timestamp column at millisecond resolution, with missing entries
    # inside the arrays as well as whole missing rows.
    first_stamps = np.array(['2007-07-13T01:23:34.123456789',
                             None,
                             '2010-08-13T05:46:57.437699912'],
                            dtype='datetime64[ms]')
    last_stamps = np.array(['2007-07-13T02',
                            None,
                            '2010-08-13T05:46:57.437699912'],
                           dtype='datetime64[ms]')
    schema_fields.append(
        pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    columns['datetime64'] = [first_stamps, None, None, last_stamps]

    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(columns)
    schema = pa.schema(schema_fields)
    return df, schema
예제 #10
0
def test_type_for_alias():
    """Check that ``pa.type_for_alias`` resolves each string alias."""
    # Numeric types have a short numpy-style code and a long name.
    numeric_aliases = [
        ('i1', 'int8', pa.int8()),
        ('i2', 'int16', pa.int16()),
        ('i4', 'int32', pa.int32()),
        ('i8', 'int64', pa.int64()),
        ('u1', 'uint8', pa.uint8()),
        ('u2', 'uint16', pa.uint16()),
        ('u4', 'uint32', pa.uint32()),
        ('u8', 'uint64', pa.uint64()),
        ('f4', 'float32', pa.float32()),
        ('f8', 'float64', pa.float64()),
    ]
    expectations = []
    for short_alias, long_alias, arrow_type in numeric_aliases:
        expectations.append((short_alias, arrow_type))
        expectations.append((long_alias, arrow_type))

    expectations += [
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
    ]
    # Unit-parametrized types spell the unit in square brackets.
    expectations += [('time32[%s]' % unit, pa.time32(unit))
                     for unit in ('s', 'ms')]
    expectations += [('time64[%s]' % unit, pa.time64(unit))
                     for unit in ('us', 'ns')]
    expectations += [('timestamp[%s]' % unit, pa.timestamp(unit))
                     for unit in ('s', 'ms', 'us', 'ns')]

    for alias, arrow_type in expectations:
        assert pa.type_for_alias(alias) == arrow_type
예제 #11
0
def test_is_integer():
    """Check the integer / signed / unsigned type predicates."""
    signed = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    assert all(types.is_integer(typ) for typ in signed + unsigned)

    for typ in signed:
        assert types.is_signed_integer(typ)
        assert not types.is_unsigned_integer(typ)

    for typ in unsigned:
        assert types.is_unsigned_integer(typ)
        assert not types.is_signed_integer(typ)

    # A floating point type is not an integer of any kind.
    non_integer = pa.float32()
    assert not types.is_integer(non_integer)
    assert not types.is_signed_integer(non_integer)
예제 #12
0
    def test_integer_no_nulls(self):
        """Round-trip a DataFrame of random, null-free integer columns."""
        integer_types = [
            ('i1', A.int8()), ('i2', A.int16()),
            ('i4', A.int32()), ('i8', A.int64()),
            ('u1', A.uint8()), ('u2', A.uint16()),
            ('u4', A.uint32()), ('u8', A.uint64()),
        ]
        num_values = 100

        data = {}
        fields = []
        for np_code, arrow_type in integer_types:
            limits = np.iinfo(np_code)
            # The upper bound handed to randint is capped at the int64
            # maximum.
            upper = min(limits.max, np.iinfo('i8').max)
            sample = np.random.randint(limits.min, upper, size=num_values)
            data[np_code] = sample.astype(np_code)
            fields.append(A.Field.from_py(np_code, arrow_type))

        frame = pd.DataFrame(data)
        expected = A.Schema.from_fields(fields)
        self._check_pandas_roundtrip(frame, expected_schema=expected)
예제 #13
0
파일: test_compute.py 프로젝트: rok/arrow
def test_take(ty, values):
    """Check ``Array.take`` with valid, null, empty and invalid indices."""
    source = pa.array(values, type=ty)

    for index_type in (pa.uint8(), pa.int64()):
        # A null index yields a null in the output.
        picked = source.take(pa.array([0, 4, 2, None], type=index_type))
        wanted = pa.array([values[0], values[4], values[2], None], type=ty)
        assert picked.equals(wanted)

        # empty indices
        picked = source.take(pa.array([], type=index_type))
        assert picked.equals(pa.array([], type=ty))

    # Index lists containing 5 or -1 are expected to raise IndexError.
    for bad_indices in ([2, 5], [2, -1]):
        with pytest.raises(IndexError):
            source.take(pa.array(bad_indices))
예제 #14
0
def test_literals():
    """Create Gandiva literals from pyarrow types and from type names."""
    import pyarrow.gandiva as gandiva

    builder = gandiva.TreeExprBuilder()

    # (python value, pyarrow type, equivalent type name)
    samples = [
        (True, pa.bool_(), "bool"),
        (0, pa.uint8(), "uint8"),
        (1, pa.uint16(), "uint16"),
        (2, pa.uint32(), "uint32"),
        (3, pa.uint64(), "uint64"),
        (4, pa.int8(), "int8"),
        (5, pa.int16(), "int16"),
        (6, pa.int32(), "int32"),
        (7, pa.int64(), "int64"),
        (8.0, pa.float32(), "float32"),
        (9.0, pa.float64(), "float64"),
        ("hello", pa.string(), "string"),
        (b"world", pa.binary(), "binary"),
    ]

    for value, arrow_type, _ in samples:
        builder.make_literal(value, arrow_type)

    for value, _, type_name in samples:
        builder.make_literal(value, type_name)

    # A value that does not fit the type, or a missing type, is a TypeError.
    for bad_value, bad_type in (("hello", pa.int64()), (True, None)):
        with pytest.raises(TypeError):
            builder.make_literal(bad_value, bad_type)
예제 #15
0
def test_cast_from_null():
    """Cast an all-null array to many target types.

    The first group of targets goes through ``_check_cast_case``; the second
    group (dictionary and union types) must raise ``NotImplementedError``.
    """
    nulls = [None] * 3
    null_type = pa.null()

    struct_type = pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.list_(pa.int8())),
        pa.field('c', pa.string()),
    ])
    supported_targets = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        struct_type,
    ]
    for target in supported_targets:
        _check_cast_case((nulls, null_type, nulls, target))

    unsupported_targets = [pa.dictionary(pa.int32(), pa.string())]
    for union_mode in (pa.lib.UnionMode_DENSE, pa.lib.UnionMode_SPARSE):
        union_fields = [pa.field('a', pa.binary(10)),
                        pa.field('b', pa.string())]
        unsupported_targets.append(pa.union(union_fields, mode=union_mode))

    null_array = pa.array(nulls, type=pa.null())
    for target in unsupported_targets:
        with pytest.raises(NotImplementedError):
            null_array.cast(target)
예제 #16
0
def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for dtypes and dtype-like inputs."""
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('uint64'), pa.uint64()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # ``np.unicode`` was a deprecated alias that newer NumPy releases no
    # longer provide; ``np.str_`` is the supported unicode scalar type.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    # Object dtype has no Arrow equivalent here.
    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    # Input that numpy cannot interpret as a dtype at all.
    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
    def test_integer_no_nulls(self):
        """Round-trip a DataFrame of random, null-free integer columns."""
        integer_types = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64()),
        ]
        num_values = 100

        data = OrderedDict()
        fields = []
        for np_name, arrow_type in integer_types:
            limits = np.iinfo(np_name)
            # Both randint bounds are clamped to the range of ``np.int_``.
            low = max(limits.min, np.iinfo(np.int_).min)
            high = min(limits.max, np.iinfo(np.int_).max)
            sample = np.random.randint(low, high, size=num_values)
            data[np_name] = sample.astype(np_name)
            fields.append(pa.field(np_name, arrow_type))

        frame = pd.DataFrame(data)
        expected = pa.schema(fields)
        self._check_pandas_roundtrip(frame, expected_schema=expected)
예제 #18
0
    def test_integer_no_nulls(self):
        """Round-trip a DataFrame of random, null-free integer columns."""
        integer_types = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64()),
        ]
        num_values = 100

        data = OrderedDict()
        fields = []
        for np_name, arrow_type in integer_types:
            limits = np.iinfo(np_name)
            # Both randint bounds are clamped to the range of ``np.int_``.
            low = max(limits.min, np.iinfo(np.int_).min)
            high = min(limits.max, np.iinfo(np.int_).max)
            sample = np.random.randint(low, high, size=num_values)
            data[np_name] = sample.astype(np_name)
            fields.append(pa.field(np_name, arrow_type))

        frame = pd.DataFrame(data)
        expected = pa.schema(fields)
        self._check_pandas_roundtrip(frame, expected_schema=expected)
예제 #19
0
def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for dtypes and dtype-like inputs."""
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('uint64'), pa.uint64()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # ``np.unicode`` was a deprecated alias that newer NumPy releases no
    # longer provide; ``np.str_`` is the supported unicode scalar type.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    # Object dtype has no Arrow equivalent here.
    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    # Input that numpy cannot interpret as a dtype at all.
    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def generate_type_mapper(
    pd_boolean=None,
    pd_integer=None,
    pd_string=None,
    pd_date_type=None,
    pd_timestamp_type=None,
):
    """Build a lookup from pyarrow data types to Pandas data types.

    Args:
        pd_boolean: if truthy, map Arrow booleans to the Pandas nullable
            boolean type. Defaults to None.
        pd_integer: if truthy, map Arrow integers to the Pandas nullable
            integer type rather than defaulting to floats. Defaults to None.
        pd_string: if truthy, map Arrow strings to the Pandas string type.
            Defaults to None.
        pd_date_type: when equal to "pd_period", map date64 to a
            millisecond Pandas period type. Defaults to None.
        pd_timestamp_type: when equal to "pd_period", map each timestamp
            unit to the Pandas period type of that unit. Defaults to None.

    Returns:
        The bound ``get`` method of the assembled mapping, or None when the
        mapping is empty.
    """
    integer_factories = (
        pa.int8, pa.int16, pa.int32, pa.int64,
        pa.uint8, pa.uint16, pa.uint32, pa.uint64,
    )

    mapping = {}
    if pd_boolean:
        mapping.update({pa.bool_(): pd.BooleanDtype()})
    if pd_string:
        mapping.update({pa.string(): pd.StringDtype()})

    if pd_integer:
        mapping.update(
            {factory(): pd.Int64Dtype() for factory in integer_factories}
        )
    else:
        # No brackets for either keys or values in this dictionary
        # This lets types_mapper understand the numpy data type
        # NOTE(review): the keys are the pyarrow factory functions
        # themselves, not type instances - confirm a lookup by DataType
        # instance is never expected to hit these entries.
        mapping.update({factory: np.float64 for factory in integer_factories})

    if pd_date_type == "pd_period":
        mapping.update({pa.date64(): pd.PeriodDtype("ms")})

    if pd_timestamp_type == "pd_period":
        mapping.update(
            {
                pa.timestamp(unit): pd.PeriodDtype(unit)
                for unit in ("s", "ms", "us", "ns")
            }
        )

    # One of the two integer branches always adds entries, so the mapping is
    # never empty here; the None fallback is kept for parity.
    return mapping.get if mapping else None
예제 #21
0
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], pa.list_(pa.uint8())),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
         pa.struct([pa.field('a', pa.int64()),
                    pa.field('b', pa.string())])),
    ]
)
def test_chunked_array_pickle(data, typ):
    """A chunked array must survive a pickle round trip unchanged."""
    # Split the input into chunks of at most two elements each.
    chunks = [pa.array(data[start:start + 2], type=typ)
              for start in range(0, len(data), 2)]
    original = pa.chunked_array(chunks, type=typ)
    original.validate()

    restored = pickle.loads(pickle.dumps(original))
    restored.validate()
    assert restored.equals(original)
예제 #22
0
파일: test_table.py 프로젝트: dremio/arrow
    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], pa.list_(pa.uint8())),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
         pa.struct([pa.field('a', pa.int64()),
                    pa.field('b', pa.string())])),
    ]
)
def test_chunked_array_pickle(data, typ):
    """A chunked array must survive a pickle round trip unchanged."""
    # Split the input into chunks of at most two elements each.
    chunks = [pa.array(data[start:start + 2], type=typ)
              for start in range(0, len(data), 2)]
    original = pa.chunked_array(chunks, type=typ)

    restored = pickle.loads(pickle.dumps(original))
    assert restored.equals(original)
예제 #23
0
import numpy as np
import pyarrow as pa
import tensorflow as tf
from tfx_bsl.tfxio import tensor_adapter
from tfx_bsl.tfxio import tensor_to_arrow
from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow_metadata.proto.v0 import schema_pb2

# Pairs each TensorFlow dtype with an Arrow type. Note that tf.string is
# paired with large_binary rather than with a string type.
_TF_TYPE_TO_ARROW_TYPE = {
    tf.int8: pa.int8(),
    tf.int16: pa.int16(),
    tf.int32: pa.int32(),
    tf.int64: pa.int64(),
    tf.uint8: pa.uint8(),
    tf.uint16: pa.uint16(),
    tf.uint32: pa.uint32(),
    tf.uint64: pa.uint64(),
    tf.float32: pa.float32(),
    tf.float64: pa.float64(),
    tf.string: pa.large_binary(),
}

# Row-partition dtype names paired with the numpy integer type of that width.
_ROW_PARTITION_DTYPES = {"INT64": np.int64, "INT32": np.int32}


def _make_2d_varlen_sparse_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
예제 #24
0
    np.arange(10, dtype=np.float16),
])
def test_to_numpy_roundtrip(narr):
    """numpy -> Arrow -> numpy keeps the dtype and values, also for slices."""
    converted = pa.array(narr)
    assert narr.dtype == converted.to_numpy().dtype
    np.testing.assert_array_equal(narr, converted.to_numpy())

    # The same slice applied on either side must give equal results.
    for window in (slice(None, 6), slice(2, None), slice(2, 6)):
        np.testing.assert_array_equal(narr[window],
                                      converted[window].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time'),
    ]
)
def test_logical_type(type, expected):
    """``get_logical_type`` must return the expected name for each type."""
    logical_name = get_logical_type(type)
    assert logical_name == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
예제 #25
0
import collections
import datetime
import decimal
import itertools
import numpy as np
import six
import pytz


# (numpy integer scalar type, pyarrow integer type of the same width and
# signedness) pairs.
int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


# Transpose the pairs and keep only the numpy side as a tuple.
np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Minimal iterable wrapper that only implements ``__iter__``.

    It exposes no length or indexing, so consumers can rely on nothing but
    iteration.
    """

    def __init__(self, lst):
        # The wrapped object; each iteration delegates to it.
        self.lst = lst

    def __iter__(self):
        """Return a fresh iterator over the wrapped object."""
        return iter(self.lst)
예제 #26
0
import collections
import datetime
import decimal
import itertools
import math
import traceback
import sys

import numpy as np
import pytz
import six

# (numpy integer scalar type, pyarrow integer type of the same width and
# signedness) pairs.
int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int16()),
                  (np.int32, pa.int32()), (np.int64, pa.int64()),
                  (np.uint8, pa.uint8()), (np.uint16, pa.uint16()),
                  (np.uint32, pa.uint32()), (np.uint64, pa.uint64())]

# Transpose the pairs and keep only the numpy side as a tuple.
np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Minimal iterable wrapper that only implements ``__iter__``.

    It exposes no length or indexing, so consumers can rely on nothing but
    iteration.
    """

    def __init__(self, lst):
        # The wrapped object; each iteration delegates to it.
        self.lst = lst

    def __iter__(self):
        """Return a fresh iterator over the wrapped object."""
        return iter(self.lst)


class MyInt:
    def __init__(self, value):
예제 #27
0
파일: dtypes.py 프로젝트: mikest18/cudf
from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar

# Text stand-in for missing values - presumably used when rendering output;
# confirm at the use sites.
_NA_REP = "<NA>"
# Lookup from numpy scalar types to pyarrow types.
# NOTE(review): np.bool_ is paired with pa.int8() (not a boolean type) and
# np.datetime64 with pa.date64(); both np.object_ and np.str_ map to
# pa.string(). Confirm these pairings are intentional before changing them.
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
예제 #28
0
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    # Date and time types.
    (pa.date32(), types.is_date32), (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64),
    # Signed integers.
    (pa.int8(), types.is_int8), (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32), (pa.int64(), types.is_int64),
    # Unsigned integers.
    (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64),
    # Floating point.
    (pa.float16(), types.is_float16), (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64),
])
def test_exact_primitive_types(t, check_func):
    """Each primitive type must satisfy its own exact-type predicate."""
    matches = check_func(t)
    assert matches
예제 #29
0
def read_type(doc):
    """Build the pyarrow type described by a type document.

    ``doc[TYPE]`` names the type.  ``doc[PARAM]``, when present, carries the
    extra information some types need: index/dictionary type documents for
    'factor' and 'ordered', a byte width for 'opaque', an element type
    document for 'list' and a sequence of field documents for 'struct'.

    Raises:
        ValueError: if the type name is not one of the supported names.
    """
    type_name = doc[TYPE]
    param = doc[PARAM] if PARAM in doc else None

    # Types that need nothing from ``param``: (name, factory, arguments).
    # Matching uses ``==`` in declaration order.
    fixed_types = (
        ('null', pyarrow.null, ()),
        ('bool', pyarrow.bool_, ()),
        ('int8', pyarrow.int8, ()),
        ('int16', pyarrow.int16, ()),
        ('int32', pyarrow.int32, ()),
        ('int64', pyarrow.int64, ()),
        ('uint8', pyarrow.uint8, ()),
        ('uint16', pyarrow.uint16, ()),
        ('uint32', pyarrow.uint32, ()),
        ('uint64', pyarrow.uint64, ()),
        ('float16', pyarrow.float16, ()),
        ('float32', pyarrow.float32, ()),
        ('float64', pyarrow.float64, ()),
        ('date[d]', pyarrow.date32, ()),
        ('date[ms]', pyarrow.date64, ()),
        ('timestamp[s]', pyarrow.timestamp, ('s',)),
        ('timestamp[ms]', pyarrow.timestamp, ('ms',)),
        ('timestamp[us]', pyarrow.timestamp, ('us',)),
        ('timestamp[ns]', pyarrow.timestamp, ('ns',)),
        ('time[s]', pyarrow.time32, ('s',)),
        ('time[ms]', pyarrow.time32, ('ms',)),
        ('time[us]', pyarrow.time64, ('us',)),
        ('time[ns]', pyarrow.time64, ('ns',)),
        ('utf8', pyarrow.utf8, ()),
        ('bytes', pyarrow.binary, ()),
    )
    for name, factory, args in fixed_types:
        if type_name == name:
            return factory(*args)

    # Dictionary-encoded types differ only in the ``ordered`` flag; without
    # parameters they default to int32 indices over utf8 values.
    for name, ordered in (('factor', False), ('ordered', True)):
        if type_name == name:
            if param is None:
                index_type = pyarrow.int32()
                dict_type = pyarrow.utf8()
            else:
                index_type = read_type(param[INDEX])
                dict_type = read_type(param[DICT])
            return pyarrow.dictionary(index_type, dict_type, ordered)

    if type_name == 'opaque':
        return pyarrow.binary(param)

    if type_name == 'list':
        return pyarrow.list_(read_type(param))

    if type_name == 'struct':
        # Each field document carries its own name and type description.
        members = [pyarrow.field(f[NAME], read_type(f)) for f in param]
        return pyarrow.struct(members)

    raise ValueError(f'{type_name} is not supported BSON DataFrame type')
예제 #30
0
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], pa.list_(pa.uint8())),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
         pa.struct([pa.field('a', pa.int64()),
                    pa.field('b', pa.string())])),
    ]
)
def test_chunked_array_pickle(data, typ):
    """A chunked array must survive a pickle round trip unchanged."""
    # Split the input into chunks of at most two elements each.
    chunks = [pa.array(data[start:start + 2], type=typ)
              for start in range(0, len(data), 2)]
    original = pa.chunked_array(chunks, type=typ)

    restored = pickle.loads(pickle.dumps(original))
    assert restored.equals(original)
예제 #31
0
)

# Strategies that each produce exactly one fixed Arrow type.
# NOTE(review): `st` is presumably hypothesis.strategies - confirm against
# the import block, which is outside this view.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

# Strategies that pick one type out of a fixed list.
signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
# Either a signed or an unsigned integer type.
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
예제 #32
0
# them using Java code as well as enables us to define them as parameters
# without having to invoke the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
    Args:
        meta_type ([type]): str
    """
    ac = ArrowConverter()
    _ = ac.convert_col_type(meta_type)


@pytest.mark.parametrize(
    argnames="meta_type,arrow_type",
    argvalues=[
        ("bool_", pa.bool_()),
        ("int8", pa.int8()),
        ("int16", pa.int16()),
        ("int32", pa.int32()),
        ("int64", pa.int64()),
        ("uint8", pa.uint8()),
        ("uint16", pa.uint16()),
        ("uint32", pa.uint32()),
        ("uint64", pa.uint64()),
        ("float16", pa.float16()),
        ("float32", pa.float32()),
        ("float64", pa.float64()),
        ("decimal128(38,1)", pa.decimal128(38, 1)),
        ("decimal128(1,2)", pa.decimal128(1, 2)),
        ("time32(s)", pa.time32("s")),
        ("time32(ms)", pa.time32("ms")),
        ("time64(us)", pa.time64("us")),
        ("time64(ns)", pa.time64("ns")),
        ("timestamp(s)", pa.timestamp("s")),
        ("timestamp(ms)", pa.timestamp("ms")),
        ("timestamp(us)", pa.timestamp("us")),
예제 #34
0
# under the License.

import os
import sys
import pytest

import numpy as np
import pyarrow as pa


# (dtype code string, Arrow data type) pairs. The codes follow the NumPy
# dtype-string spelling: kind letter plus byte width (e.g. 'i1', 'u4', 'f8').
# NOTE(review): the consumers of this list are outside this view.
tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
]


def test_tensor_attrs():
    data = np.random.randn(10, 4)

    tensor = pa.Tensor.from_numpy(data)

    assert tensor.ndim == 2
예제 #35
0
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
예제 #36
0
def test_get_eq_func():
    """Check the comparators returned by ``get_eq_func`` for several types.

    For every Arrow type under test, ``get_eq_func(t)`` is called to obtain a
    two-argument comparator, which is then applied to pairs of Python values.
    """
    # Integer types: different values differ, None only equals None.
    for t in [
            pa.int8(),
            pa.int16(),
            pa.int32(),
            pa.int64(),
            pa.uint8(),
            pa.uint16(),
            pa.uint32(),
            pa.uint64(),
    ]:
        assert not get_eq_func(t)(0, 1)
        assert not get_eq_func(t)(None, 1)
        assert get_eq_func(t)(1, 1)
        assert get_eq_func(t)(None, None)
    # Null type: every pair is expected to compare equal, whatever the values.
    t = pa.null()
    assert get_eq_func(t)("0", "1")
    assert get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    # Strings: same rules as for integers.
    t = pa.string()
    assert not get_eq_func(t)("0", "1")
    assert not get_eq_func(t)(None, "1")
    assert get_eq_func(t)("1", "1")
    assert get_eq_func(t)(None, None)
    # Booleans: None is distinct from both True and False.
    t = pa.bool_()
    assert not get_eq_func(t)(False, True)
    assert not get_eq_func(t)(None, False)
    assert not get_eq_func(t)(None, True)
    assert get_eq_func(t)(True, True)
    assert get_eq_func(t)(False, False)
    assert get_eq_func(t)(None, None)
    # Floats: None and NaN are expected to compare equal to each other.
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert not get_eq_func(t)(0.0, 1.1)
        assert get_eq_func(t)(1.1, 1.1)
        assert get_eq_func(t)(None, float("nan"))
        # NOTE(review): the loop variable `n` is never used in the body, so
        # the same two assertions simply run four times. `n` may have been
        # meant to replace `None` here -- confirm how get_eq_func treats
        # NaN/inf before changing it.
        for n in [None, float("nan"), float("inf"), float("-inf")]:
            assert not get_eq_func(t)(None, 1.1)
            assert get_eq_func(t)(None, None)
    # Timestamps: None and pd.NaT each equal themselves and differ from a
    # concrete datetime.
    for t in [pa.timestamp("ns")]:
        for n in [None, pd.NaT]:
            assert not get_eq_func(t)(datetime(2020, 1, 1, 0),
                                      datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1, 1),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    # None and pd.NaT are also expected to equal each other.
    assert get_eq_func(pa.timestamp("ns"))(None, pd.NaT)
    # Dates: values on the same calendar day compare equal even when one is a
    # datetime with a time-of-day and the other is a date.
    for t in [pa.date32()]:
        for n in [None, pd.NaT]:
            assert get_eq_func(t)(datetime(2020, 1, 1, 0),
                                  datetime(2020, 1, 1, 1))
            assert not get_eq_func(t)(datetime(2020, 1, 1), datetime(
                2020, 1, 2).date())
            assert not get_eq_func(t)(n, datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(datetime(2020, 1, 1).date(),
                                  datetime(2020, 1, 1, 1))
            assert get_eq_func(t)(n, n)
    # Structs are passed as dicts.
    t = pa.struct([pa.field("a", pa.int32())])
    assert not get_eq_func(t)(dict(a=0), dict(a=1))
    assert not get_eq_func(t)(None, dict(a=1))
    assert get_eq_func(t)(dict(a=1), dict(a=1))
    assert get_eq_func(t)(None, None)
    # Lists are passed as Python lists.
    t = pa.list_(pa.int32())
    assert not get_eq_func(t)([0], [1])
    assert not get_eq_func(t)(None, [1])
    assert get_eq_func(t)([1], [1])
    assert get_eq_func(t)(None, None)
예제 #37
0
# TODO(kszucs): alphanum_text, surrogate_text
# Text restricted to code points 0x41..0x7E.
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

# Strategies that each produce exactly one parameter-free Arrow type.
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

# Integer types, sampled per signedness and then merged.
signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
# decimal128 requires a precision of at least 1, so precision is drawn from
# 1..38; drawing 0 would make pa.decimal128 raise while building the type.
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=0, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

# Temporal types: both date widths and every valid time32/time64 unit.
date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
예제 #38
0
    field = pa.field('a', pa.int32())
    wr = weakref.ref(field)
    assert wr() is not None
    del field
    assert wr() is None


@pytest.mark.parametrize(
    't,check_func',
    [
        (pa.date32(), types.is_date32),
        (pa.date64(), types.is_date64),
        (pa.time32('s'), types.is_time32),
        (pa.time64('ns'), types.is_time64),
        (pa.int8(), types.is_int8),
        (pa.int16(), types.is_int16),
        (pa.int32(), types.is_int32),
        (pa.int64(), types.is_int64),
        (pa.uint8(), types.is_uint8),
        (pa.uint16(), types.is_uint16),
        (pa.uint32(), types.is_uint32),
        (pa.uint64(), types.is_uint64),
        (pa.float16(), types.is_float16),
        (pa.float32(), types.is_float32),
        (pa.float64(), types.is_float64),
    ],
)
def test_exact_primitive_types(t, check_func):
    """Each exact-type predicate must accept an instance of its own type."""
    accepted = check_func(t)
    assert accepted


def test_type_id():
    """Every type from ``get_many_types()`` must expose an integer ``id``."""
    # Only the Python type of ``id`` is checked because the enum values
    # themselves are not exposed publicly.
    many_types = get_many_types()
    for data_type in many_types:
        type_id = data_type.id
        assert isinstance(type_id, int)
예제 #39
0
import datetime
import decimal
import itertools
import math
import traceback

import numpy as np
import pytz


# (NumPy integer scalar type, Arrow integer type of the same width and
# signedness) pairs.
int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


# Just the NumPy halves of the pairs above; the Arrow halves are discarded.
np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Minimal iterable wrapper that only implements ``__iter__``.

    Iterating an instance yields the items of the wrapped object. The wrapper
    itself defines neither ``__len__`` nor ``__getitem__``.
    """

    def __init__(self, lst):
        # Keep a reference to the wrapped object; it is not copied.
        self.lst = lst

    def __iter__(self):
        # The builtin is used instead of calling the dunder directly, so that
        # objects which are iterable only through the sequence protocol
        # (``__getitem__``) can be wrapped as well.
        return iter(self.lst)
예제 #40
0
     "TIME": pyarrow_time,
     "TIMESTAMP": pyarrow_timestamp,
 }
 ARROW_SCALAR_IDS_TO_BQ = {
     # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
     pyarrow.bool_().id:
     "BOOL",
     pyarrow.int8().id:
     "INT64",
     pyarrow.int16().id:
     "INT64",
     pyarrow.int32().id:
     "INT64",
     pyarrow.int64().id:
     "INT64",
     pyarrow.uint8().id:
     "INT64",
     pyarrow.uint16().id:
     "INT64",
     pyarrow.uint32().id:
     "INT64",
     pyarrow.uint64().id:
     "INT64",
     pyarrow.float16().id:
     "FLOAT64",
     pyarrow.float32().id:
     "FLOAT64",
     pyarrow.float64().id:
     "FLOAT64",
     pyarrow.time32("ms").id:
     "TIME",
예제 #41
0
    def cast_to_compatible_types(table):
        """
        Replace column types that OmniSci cannot import with supported ones.

        Dictionary columns become strings, date columns become
        ``timestamp[s]`` and unsigned integer columns become signed integers.

        Parameters
        ----------
        table : pyarrow.Table
            Source table.

        Returns
        -------
        pyarrow.Table
            Table with fully compatible types with OmniSci.

        Raises
        ------
        OverflowError
            If the final cast fails and an unsigned column was remapped.
        RuntimeError
            If the final cast fails and no unsigned column was remapped.
        """
        signed_replacement = {
            pa.uint8(): pa.int16(),
            pa.uint16(): pa.int32(),
            pa.uint32(): pa.int64(),
            pa.uint64(): pa.int64(),  # May cause overflow
        }
        source_schema = table.schema
        target_schema = source_schema
        cast_required = False
        has_unsigned = False
        prebuilt_columns = {}

        for position, field in enumerate(source_schema):
            field_type = field.type
            if pa.types.is_dictionary(field_type):
                # Currently OmniSci doesn't support Arrow table import with
                # dictionary columns, so they are turned into strings until
                # support is in place.
                # https://github.com/modin-project/modin/issues/1738
                if pa.types.is_null(field_type.value_type):
                    # Arrow cannot cast a dictionary of null type to string,
                    # so an all-null string column is built by hand instead.
                    all_null = np.full(table.num_rows, True, dtype=bool)
                    placeholder = np.empty(table.num_rows, dtype=str)
                    prebuilt_columns[position] = pa.array(
                        placeholder, pa.string(), all_null)
                else:
                    cast_required = True
                replacement_type = pa.string()
            elif pa.types.is_date(field_type):
                # OmniSci doesn't support importing Arrow's date type:
                # https://github.com/omnisci/omniscidb/issues/678
                # Arrow's date is the number of days since the UNIX-epoch, so
                # it converts to timestamp[s] without losing precision.
                replacement_type = pa.timestamp("s")
                cast_required = True
            elif pa.types.is_unsigned_integer(field_type):
                # OmniSci doesn't support unsigned types
                replacement_type = signed_replacement[field_type]
                cast_required = True
                has_unsigned = True
            else:
                # Field is already compatible; keep it untouched.
                continue

            target_schema = target_schema.set(
                position,
                pa.field(field.name, replacement_type, field.nullable,
                         field.metadata),
            )

        # Such cast may affect the data, so we have to raise a warning about it
        if has_unsigned:
            ErrorMessage.single_warning(
                "OmniSci does not support unsigned integer types, such types will be rounded up to the signed equivalent."
            )

        for position, column in prebuilt_columns.items():
            table = table.set_column(position, target_schema[position], column)

        if not cast_required:
            return table

        try:
            table = table.cast(target_schema)
        except pa.lib.ArrowInvalid as err:
            error_cls = OverflowError if has_unsigned else RuntimeError
            raise error_cls(
                "An error occurred when trying to convert unsupported by OmniSci 'dtypes' "
                + f"to the supported ones, the schema to cast was: \n{target_schema}."
            ) from err

        return table
예제 #42
0
def test_index_store_roundtrip_ts(store, dtype, timestamps):
    """An index keyed by timestamps must survive a store/load round trip."""
    partitions_by_timestamp = dict(
        zip(timestamps, [["part_1", "part_2"], ["part_3"]])
    )
    original = ExplicitSecondaryIndex(
        column="col",
        index_dct=partitions_by_timestamp,
        index_storage_key="dataset_uuid/some_index.parquet",
        dtype=dtype,
    )
    stored_key = original.store(store, "dataset_uuid")

    # Rebuild the index from nothing but the key returned by ``store``.
    unloaded = ExplicitSecondaryIndex(column="col", index_storage_key=stored_key)
    reloaded = unloaded.load(store)
    assert original == reloaded


@pytest.mark.parametrize(
    "dtype,expected",
    [
        (pa.int8(), pa.int64()),
        (pa.uint8(), pa.uint64()),
        (None, None),
    ],
)
def test_index_normalize_dtype(dtype, expected):
    """The index must report ``expected`` as its dtype when given ``dtype``."""
    storage_key = "dataset_uuid/some_index.parquet"
    index = ExplicitSecondaryIndex(
        column="col", dtype=dtype, index_storage_key=storage_key
    )
    reported = index.dtype
    assert reported == expected


def test_index_raises_nested_dtype():
    with pytest.raises(NotImplementedError) as exc:
        ExplicitSecondaryIndex(
            column="col",
            dtype=pa.list_(pa.int8()),
            index_storage_key="dataset_uuid/some_index.parquet",
        )
예제 #43
0
파일: utils.py 프로젝트: wphicks/cudf
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

# Maps each supported Arrow type to the pandas extension dtype of the same
# kind and width (integers, boolean and string only).
pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
예제 #44
0
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(
        column="col",
        index_dct=dict(zip(timestamps, [["part_1", "part_2"], ["part_3"]])),
        index_storage_key=storage_key,
        dtype=dtype,
    )
    key1 = index1.store(store, "dataset_uuid")

    index2 = ExplicitSecondaryIndex(column="col",
                                    index_storage_key=key1).load(store)
    assert index1 == index2


@pytest.mark.parametrize(
    "dtype,expected",
    [(pa.int8(), pa.int64()), (pa.uint8(), pa.uint64()), (None, None)],
)
def test_index_normalize_dtype(dtype, expected):
    """The index must report ``expected`` as its dtype when given ``dtype``."""
    index_kwargs = {
        "column": "col",
        "dtype": dtype,
        "index_storage_key": "dataset_uuid/some_index.parquet",
    }
    assert ExplicitSecondaryIndex(**index_kwargs).dtype == expected


def test_index_raises_nested_dtype():
    with pytest.raises(NotImplementedError) as exc:
        ExplicitSecondaryIndex(
            column="col",
            dtype=pa.list_(pa.int8()),
            index_storage_key="dataset_uuid/some_index.parquet",
예제 #45
0
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import numpy as np
import pytest

import pyarrow as pa


@pytest.mark.parametrize(
    'arrow_type',
    [
        pa.int8(),
        pa.int16(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint64(),
        pa.float32(),
        pa.float64(),
    ],
)
def test_sum(arrow_type):
    """Summing ``[1, 2, 3, 4]`` must give 10 for every parametrized type."""
    values = [1, 2, 3, 4]
    total = pa.array(values, type=arrow_type).sum()
    assert total == 10


@pytest.mark.parametrize(
    ('ty', 'values'),
    [('bool', [True, False, False, True, True]), ('uint8', np.arange(5)),
     ('int8', np.arange(5)), ('uint16', np.arange(5)), ('int16', np.arange(5)),
     ('uint32', np.arange(5)), ('int32', np.arange(5)),
예제 #46
0
class MisraGriesSketchTest(parameterized.TestCase):
    """Tests for ``sketches.MisraGriesSketch``."""

    @staticmethod
    def _as_counts(*pairs):
        """Return ``[{"values": v, "counts": c}, ...]`` for ``(v, c)`` pairs."""
        return [{"values": value, "counts": count} for value, count in pairs]

    @parameterized.named_parameters(
        ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
        ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
        ("string", ["a", "a", "b", "c", None], pa.string()),
        ("large_string", ["a", "a", "b", "c"], pa.large_string()),
    )
    def test_add_binary_like(self, values, binary_like_type):
        """Binary and string arrays yield per-value counts keyed by bytes."""
        sketch = _create_basic_sketch(pa.array(values, type=binary_like_type))
        estimate = sketch.Estimate()
        estimate.validate(full=True)
        self.assertEqual(
            estimate.to_pylist(),
            self._as_counts((b"a", 2.0), (b"b", 1.0), (b"c", 1.0)))

    @parameterized.named_parameters(
        ("int8", [1, 1, 2, 3, None], pa.int8()),
        ("int16", [1, 1, 2, 3], pa.int16()),
        ("int32", [1, 1, 2, 3, None], pa.int32()),
        ("int64", [1, 1, 2, 3], pa.int64()),
        ("uint8", [1, 1, 2, 3], pa.uint8()),
        ("uint16", [1, None, 1, 2, 3], pa.uint16()),
        ("uint32", [1, 1, 2, 3], pa.uint32()),
        ("uint64", [1, 1, 2, 3, None], pa.uint64()),
    )
    def test_add_integer(self, values, integer_type):
        """Integer arrays yield counts keyed by the decimal text as bytes."""
        sketch = _create_basic_sketch(pa.array(values, type=integer_type))
        estimate = sketch.Estimate()
        estimate.validate(full=True)
        self.assertEqual(
            estimate.to_pylist(),
            self._as_counts((b"1", 2.0), (b"2", 1.0), (b"3", 1.0)))

    def test_add_weighted_values(self):
        """Float weights are summed per distinct item."""
        items = pa.array(["a", "a", "b", "c"], type=pa.string())
        weights = pa.array([4, 3, 2, 1], type=pa.float32())
        sketch = _create_basic_sketch(items, weights=weights)

        estimate = sketch.Estimate()
        estimate.validate(full=True)

        self.assertEqual(
            estimate.to_pylist(),
            self._as_counts((b"a", 7.0), (b"b", 2.0), (b"c", 1.0)))

    def test_add_invalid_weights(self):
        """Integer-typed weights are rejected with a RuntimeError."""
        items = pa.array(["a", "a", "b", "c"], type=pa.string())
        weights = pa.array([4, 3, 2, 1], type=pa.int64())
        expected_message = "INVALID_ARGUMENT: Weight array must be float type."
        with self.assertRaisesRegex(RuntimeError, expected_message):
            _create_basic_sketch(items, weights=weights)

    def test_add_unsupported_type(self):
        """Adding a boolean array raises a RuntimeError."""
        sketch = sketches.MisraGriesSketch(_NUM_BUCKETS)
        bool_values = pa.array([True, False], pa.bool_())
        with self.assertRaisesRegex(RuntimeError, "UNIMPLEMENTED: bool"):
            sketch.AddValues(bool_values)

    def test_replace_invalid_utf8(self):
        """Invalid UTF-8 is counted under the placeholder, even after a
        serialization round trip and a merge."""
        first_batch = pa.array([
            b"a",
            b"\x80",  # invalid
            b"\xC1",  # invalid
        ])
        second_batch = pa.array([
            b"\xc0\x80",  # invalid
            b"a"
        ])

        # Build one sketch per batch and push each through Serialize /
        # Deserialize before merging.
        restored = []
        for batch in (first_batch, second_batch):
            sketch = sketches.MisraGriesSketch(
                _NUM_BUCKETS, invalid_utf8_placeholder=b"<BYTES>")
            sketch.AddValues(batch)
            restored.append(
                sketches.MisraGriesSketch.Deserialize(sketch.Serialize()))

        merged, other = restored
        merged.AddValues(second_batch)
        merged.Merge(other)

        actual = merged.Estimate()
        actual.validate(full=True)
        self.assertEqual(
            actual.to_pylist(),
            self._as_counts((b"<BYTES>", 4.0), (b"a", 3.0)))

    def test_no_replace_invalid_utf8(self):
        """Without a placeholder, invalid bytes are reported unchanged."""
        sketch = sketches.MisraGriesSketch(_NUM_BUCKETS)
        sketch.AddValues(pa.array([b"\x80"]))
        actual = sketch.Estimate()
        self.assertEqual(actual.to_pylist(), self._as_counts((b"\x80", 1.0)))

    def test_large_string_threshold(self):
        """Values longer than the threshold are counted under the large-string
        placeholder, even after a serialization round trip and a merge."""
        first_batch = pa.array(["a", "bbb", "c", "d", "eeff"])
        second_batch = pa.array(["a", "gghh"])

        restored = []
        for batch in (first_batch, second_batch):
            sketch = sketches.MisraGriesSketch(
                _NUM_BUCKETS,
                large_string_threshold=2,
                large_string_placeholder=b"<LARGE>")
            sketch.AddValues(batch)
            restored.append(
                sketches.MisraGriesSketch.Deserialize(sketch.Serialize()))

        merged, other = restored
        merged.AddValues(second_batch)
        merged.Merge(other)

        actual = merged.Estimate()
        actual.validate(full=True)
        self.assertEqual(
            actual.to_pylist(),
            self._as_counts((b"<LARGE>", 4.0), (b"a", 3.0), (b"c", 1.0),
                            (b"d", 1.0)))

    def test_invalid_large_string_replacing_config(self):
        """Passing only one of the two large-string options is an error."""
        expected_message = (
            "Must provide both or neither large_string_threshold and "
            "large_string_placeholder")
        partial_configs = (
            {"large_string_threshold": 1024},
            {"large_string_placeholder": b"<L>"},
        )
        for partial_config in partial_configs:
            with self.assertRaisesRegex(RuntimeError, expected_message):
                _ = sketches.MisraGriesSketch(_NUM_BUCKETS, **partial_config)

    def test_many_uniques(self):
        """Tail elements with equal counts survive an ``AddValues`` call."""
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]),
                                      num_buckets=2)
        estimate = sketch.Estimate()
        estimate.validate(full=True)
        # Since "b" and "c" have equal counts and neither token has count > 4/2,
        # any combination is possible.
        candidates = self._as_counts((b"a", 2.0), (b"b", 1.0), (b"c", 1.0))
        acceptable = list(itertools.combinations(candidates, 2))
        self.assertIn(tuple(estimate.to_pylist()), acceptable)

    def test_merge(self):
        """Merging two sketches adds up the counts of shared items."""
        left = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        right = _create_basic_sketch(pa.array(["d", "a"]))

        left.Merge(right)
        estimate = left.Estimate()
        estimate.validate(full=True)

        self.assertEqual(
            estimate.to_pylist(),
            self._as_counts((b"a", 3.0), (b"b", 1.0), (b"c", 1.0),
                            (b"d", 1.0)))

    def test_merge_equal_to_kth_weights(self):
        """Tail elements with equal counts survive a ``Compress`` call."""
        left = _create_basic_sketch(pa.array(["a"] * 5 + ["b"] * 5 +
                                             ["c"] * 4 + ["a"] * 4),
                                    num_buckets=3)
        right = _create_basic_sketch(pa.array(["d"] * 4 + ["a"] * 2),
                                     num_buckets=3)
        left.Merge(right)
        estimate = left.Estimate()
        estimate.validate(full=True)
        # Since "c" and "d" have equal counts, the last entry may be either.
        ending_in_c = self._as_counts((b"a", 11.0), (b"b", 5.0), (b"c", 4.0))
        ending_in_d = ending_in_c[:2] + self._as_counts((b"d", 4.0))
        self.assertIn(estimate.to_pylist(), [ending_in_c, ending_in_d])

    def test_merge_with_extra_items(self):
        """Merging sketches that overflowed ``num_buckets`` still yields the
        requested number of elements."""
        # Each of these sketches get more values than `num_buckets`. This will
        # result into removal of less frequent elements from the main buffer
        # and adding them to a buffer of extra elements.
        batches = (
            ["a"] * 3 + ["b"] * 2 + ["c", "d"],
            ["e"] * 3 + ["f"] * 2 + ["g", "h"],
            ["i"] * 2 + ["j", "k", "l"],
        )
        merged, second, third = [
            _create_basic_sketch(pa.array(batch), num_buckets=3)
            for batch in batches
        ]
        merged.Merge(second)
        merged.Merge(third)
        estimate = merged.Estimate()
        estimate.validate(full=True)

        # Due to large number of unique elements (relative to `num_buckets`),
        # the total estimated count error is 5.
        acceptable = [
            self._as_counts((b"a", 5.0), (b"e", 5.0),
                            (least_frequent_item, 5.0))
            for least_frequent_item in (b"b", b"f", b"i")
        ]
        self.assertIn(estimate.to_pylist(), acceptable)

    def test_picklable(self):
        """A sketch pickled with protocol 2 keeps its estimate."""
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        payload = pickle.dumps(sketch, 2)
        self.assertIsInstance(payload, bytes)
        clone = pickle.loads(payload)
        self.assertIsInstance(clone, sketches.MisraGriesSketch)

        estimate = clone.Estimate()
        estimate.validate(full=True)

        self.assertEqual(
            estimate.to_pylist(),
            self._as_counts((b"a", 2.0), (b"b", 1.0), (b"c", 1.0)))

    def test_serialization(self):
        """``Serialize`` / ``Deserialize`` round-trips the estimate."""
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))

        payload = sketch.Serialize()
        self.assertIsInstance(payload, bytes)

        clone = sketches.MisraGriesSketch.Deserialize(payload)
        self.assertIsInstance(clone, sketches.MisraGriesSketch)

        estimate = clone.Estimate()
        estimate.validate(full=True)

        self.assertEqual(
            estimate.to_pylist(),
            self._as_counts((b"a", 2.0), (b"b", 1.0), (b"c", 1.0)))

    def test_deserialize_fails_with_exception(self):
        """Deserializing something that is not a sketch raises RuntimeError."""
        expected_message = "Failed to parse MisraGries sketch"
        with self.assertRaisesRegex(RuntimeError, expected_message):
            sketches.MisraGriesSketch.Deserialize("I am no proto")
예제 #47
0
import pytest

from pyarrow.compat import unittest, u  # noqa
import pyarrow as pa

import collections
import datetime
import decimal
import itertools
import numpy as np
import six
import pytz

# (NumPy integer scalar type, Arrow integer type of the same width and
# signedness) pairs. Every NumPy type is matched with its exact Arrow
# counterpart so that the 16-bit Arrow types are exercised too.
int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int16()),
                  (np.int32, pa.int32()), (np.int64, pa.int64()),
                  (np.uint8, pa.uint8()), (np.uint16, pa.uint16()),
                  (np.uint32, pa.uint32()), (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    """Minimal iterable wrapper that only implements ``__iter__``.

    Iterating an instance yields the items of the wrapped object. The wrapper
    itself defines neither ``__len__`` nor ``__getitem__``.
    """

    def __init__(self, lst):
        # Keep a reference to the wrapped object; it is not copied.
        self.lst = lst

    def __iter__(self):
        # The builtin is used instead of calling the dunder directly, so that
        # objects which are iterable only through the sequence protocol
        # (``__getitem__``) can be wrapped as well.
        return iter(self.lst)


def check_struct_type(ty, expected):
    """
예제 #48
0
import six
import tensorflow as tf
from tfx_bsl.tfxio import tensor_adapter

from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow.python.framework import test_util  # pylint: disable=g-direct-tensorflow-import
from tensorflow_metadata.proto.v0 import schema_pb2

# Arrow value types grouped by kind. Note that the floating group lists only
# float32 and float64 (no float16).
_ALL_SUPPORTED_INT_VALUE_TYPES = [
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64(),
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64(),
]
_ALL_SUPPORTED_FLOATING_VALUE_TYPES = [pa.float32(), pa.float64()]
_ALL_SUPPORTED_STRING_VALUE_TYPES = [
    pa.binary(),
    pa.large_binary(),
    pa.string(),
    pa.large_string()
]
# Concatenation of the three groups above, in int / float / string order.
_ALL_SUPPORTED_VALUE_TYPES = (_ALL_SUPPORTED_INT_VALUE_TYPES +
                              _ALL_SUPPORTED_FLOATING_VALUE_TYPES +
                              _ALL_SUPPORTED_STRING_VALUE_TYPES)
_ARROW_TYPE_TO_TF_TYPE = {
예제 #49
0
def test_simple_type_construction():
    # A TimestampType built directly through pa.lib (not via a factory such
    # as pa.timestamp) is expected to raise TypeError when stringified.
    bare_type = pa.lib.TimestampType()

    with pytest.raises(TypeError):
        str(bare_type)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'float64'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
import numpy as np
import pyarrow as pa

try:
    from scipy.sparse import csr_matrix, coo_matrix
except ImportError:
    coo_matrix = None
    csr_matrix = None

try:
    import sparse
except ImportError:
    sparse = None

# (type-code string, Arrow type) pairs covering signed ints, unsigned ints
# and floats.  The strings look like NumPy dtype codes - confirm at the
# call sites, which are outside this excerpt.
tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64()),
]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCOOTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    data = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
예제 #51
0
import pickle
import pytest
import weakref

import numpy as np

import pyarrow as pa


@pytest.mark.parametrize(['value', 'ty', 'klass', 'deprecated'], [
    (False, None, pa.BooleanScalar, pa.BooleanValue),
    (True, None, pa.BooleanScalar, pa.BooleanValue),
    (1, None, pa.Int64Scalar, pa.Int64Value),
    (-1, None, pa.Int64Scalar, pa.Int64Value),
    (1, pa.int8(), pa.Int8Scalar, pa.Int8Value),
    (1, pa.uint8(), pa.UInt8Scalar, pa.UInt8Value),
    (1, pa.int16(), pa.Int16Scalar, pa.Int16Value),
    (1, pa.uint16(), pa.UInt16Scalar, pa.UInt16Value),
    (1, pa.int32(), pa.Int32Scalar, pa.Int32Value),
    (1, pa.uint32(), pa.UInt32Scalar, pa.UInt32Value),
    (1, pa.int64(), pa.Int64Scalar, pa.Int64Value),
    (1, pa.uint64(), pa.UInt64Scalar, pa.UInt64Value),
    (1.0, None, pa.DoubleScalar, pa.DoubleValue),
    (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue),
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
예제 #52
0
    tensor = pa.Tensor.from_numpy(data2)
    assert not tensor.is_mutable

def test_tensor_base_object():
    # Checks that to_numpy() adds exactly one reference to the tensor while
    # the returned array is held.
    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
    refs_before = sys.getrefcount(tensor)

    # Deliberately bound to a name even though it is never read: presumably
    # the array is what holds the extra reference being counted below.
    ndarray = tensor.to_numpy()  # noqa

    refs_after = sys.getrefcount(tensor)
    assert refs_after == refs_before + 1


@pytest.mark.parametrize('dtype_str,arrow_type', [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
])
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
    dtype = np.dtype(dtype_str)
    data = (100 * np.random.randn(10, 4)).astype(dtype)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == arrow_type

    repr(tensor)