Example #1
def test_cast_signed_to_unsigned():
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), pa.uint8(),
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i2'), pa.uint16(),
         np.array([0, 1, 2, 3], dtype='u2'), pa.uint16())
    ]

    for case in safe_cases:
        _check_cast_case(case)
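The helper `_check_cast_case` is defined elsewhere in pyarrow's test suite and is not shown on this page. A minimal sketch consistent with how these examples call it (a 4-tuple of input data, input type, expected data, expected type, plus a `safe` flag) might look like:

def _check_cast_case(case, safe=True):
    # Hypothetical reconstruction, not the actual pyarrow helper
    in_data, in_type, expected_data, expected_type = case
    in_arr = pa.array(in_data, type=in_type)
    casted = in_arr.cast(expected_type, safe=safe)
    expected = pa.array(expected_data, type=expected_type)
    assert casted.equals(expected)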
Example #2
def test_cast_integers_unsafe():
    # We let NumPy do the unsafe casting
    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32',
         np.array([50000], dtype='i2'), pa.int16()),
        (np.array([70000], dtype='i4'), 'int32',
         np.array([70000], dtype='u2'), pa.uint16()),
        (np.array([-1], dtype='i4'), 'int32',
         np.array([-1], dtype='u2'), pa.uint16()),
        (np.array([50000], dtype='u2'), pa.uint16(),
         np.array([50000], dtype='i2'), pa.int16())
    ]

    for case in unsafe_cases:
        _check_cast_case(case, safe=False)
Example #3
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
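A small usage sketch of the method under test (not part of the original suite):

assert pa.uint16().to_pandas_dtype() == np.uint16
assert pa.timestamp('ms').to_pandas_dtype() == np.dtype('datetime64[ns]')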
Example #4
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that casting between supported types
            # doesn't segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example #5
def test_cast_integers_safe():
    safe_cases = [
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='i4'), pa.int32()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u4'), pa.uint16()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='u1'), pa.uint8()),
        (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
         np.array([0, 1, 2, 3], dtype='f8'), pa.float64())
    ]

    for case in safe_cases:
        _check_cast_case(case)

    unsafe_cases = [
        (np.array([50000], dtype='i4'), 'int32', 'int16'),
        (np.array([70000], dtype='i4'), 'int32', 'uint16'),
        (np.array([-1], dtype='i4'), 'int32', 'uint16'),
        (np.array([50000], dtype='u2'), 'uint16', 'int16')
    ]
    for in_data, in_type, out_type in unsafe_cases:
        in_arr = pa.array(in_data, type=in_type)

        with pytest.raises(pa.ArrowInvalid):
            in_arr.cast(out_type)
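The safe/unsafe distinction can be reproduced directly; a minimal sketch:

arr = pa.array([50000], type='int32')
arr.cast('int16', safe=False)   # unsafe cast goes through, value overflows
with pytest.raises(pa.ArrowInvalid):
    arr.cast('int16')           # safe cast (the default) raises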
Example #6
File: jvm.py Project: rok/arrow
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
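Because the function only reads the `bitWidth` and `isSigned` attributes, the mapping can be exercised without starting a JVM. A sketch using a hypothetical stand-in class:

class FakeJvmInt:
    # Hypothetical stand-in for org.apache.arrow.vector.types.pojo.ArrowType$Int
    def __init__(self, bit_width, is_signed):
        self.bitWidth = bit_width
        self.isSigned = is_signed

assert _from_jvm_int_type(FakeJvmInt(16, False)) == pa.uint16()
assert _from_jvm_int_type(FakeJvmInt(64, True)) == pa.int64()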
Example #7
def dataframe_with_arrays(include_index=False):
    """
    DataFrame with NumPy array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
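A usage sketch (assuming pandas, NumPy, and OrderedDict are imported as in the surrounding module):

df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
print(table.schema)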
Example #8
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
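Unknown aliases raise ValueError; a quick sketch:

assert pa.type_for_alias('u2') == pa.uint16()
with pytest.raises(ValueError):
    pa.type_for_alias('no_such_type')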
Example #9
def test_table_from_lists_raises():
    data = [
        list(range(5)),
        [-10, -5, 0, 5, 10]
    ]

    with pytest.raises(TypeError):
        pa.Table.from_arrays(data, names=['a', 'b'])

    schema = pa.schema([
        pa.field('a', pa.uint16()),
        pa.field('b', pa.int64())
    ])
    with pytest.raises(TypeError):
        pa.Table.from_arrays(data, schema=schema)
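`Table.from_arrays` expects `pa.Array` objects rather than plain Python lists, which is why both calls raise `TypeError` in the pyarrow version this test targets. Converting the lists first works; a minimal sketch:

arrays = [pa.array(column) for column in data]
table = pa.Table.from_arrays(arrays, names=['a', 'b'])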
Example #10
def test_is_integer():
    signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for t in signed_ints + unsigned_ints:
        assert types.is_integer(t)

    for t in signed_ints:
        assert types.is_signed_integer(t)
        assert not types.is_unsigned_integer(t)

    for t in unsigned_ints:
        assert types.is_unsigned_integer(t)
        assert not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
Example #11
    def test_integer_no_nulls(self):
        data = {}
        fields = []

        numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()),
                        ('i4', A.int32()), ('i8', A.int64()),
                        ('u1', A.uint8()), ('u2', A.uint16()),
                        ('u4', A.uint32()), ('u8', A.uint64())]
        num_values = 100

        for dtype, arrow_dtype in numpy_dtypes:
            info = np.iinfo(dtype)
            values = np.random.randint(info.min,
                                       min(info.max, np.iinfo('i8').max),
                                       size=num_values)
            data[dtype] = values.astype(dtype)
            fields.append(A.Field.from_py(dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = A.Schema.from_fields(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #12
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Example #13
    def test_integer_no_nulls(self):
        data = OrderedDict()
        fields = []

        numpy_dtypes = [
            ('i1', pa.int8()), ('i2', pa.int16()),
            ('i4', pa.int32()), ('i8', pa.int64()),
            ('u1', pa.uint8()), ('u2', pa.uint16()),
            ('u4', pa.uint32()), ('u8', pa.uint64()),
            ('longlong', pa.int64()), ('ulonglong', pa.uint64())
        ]
        num_values = 100

        for dtype, arrow_dtype in numpy_dtypes:
            info = np.iinfo(dtype)
            values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                       min(info.max, np.iinfo(np.int_).max),
                                       size=num_values)
            data[dtype] = values.astype(dtype)
            fields.append(pa.field(dtype, arrow_dtype))

        df = pd.DataFrame(data)
        schema = pa.schema(fields)
        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #14
def main():
    # https://arrow.apache.org/docs/python/api/datatypes.html
    my_schema = pa.schema([
        # skip null

        ('c_bool', pa.bool_()),

        ('c_int8', pa.int8()),
        ('c_int16', pa.int16()),
        ('c_int32', pa.int32()),
        ('c_int64', pa.int64()),

        ('c_uint8', pa.uint8()),
        ('c_uint16', pa.uint16()),
        ('c_uint32', pa.uint32()),
        ('c_uint64', pa.uint64()),

        # skip ('c_float16', pa.float16()),
        ('c_float32', pa.float32()),
        ('c_float64', pa.float64()),

        ('c_time32', pa.time32('ms')),
        ('c_time64', pa.time64('ns')),
        ('c_timestamp', pa.timestamp('ms')),
        ('c_date32', pa.date32()),
        ('c_date64', pa.date64()),

        # skip binary

        ('c_string', pa.string()),

        # skip utf8
        # skip large_binary
        # skip large_string
        # skip large_utf8

        ('c_decimal128_8_3', pa.decimal128(8, 3))

        # skip list_
        # skip  large_list
        # skip struct
        # skip dictionary
        # skip field
        # skip schema
        # skip from_numpy_dtype
    ])

    c_bool = pa.array([False, True, False], type=pa.bool_())

    c_int8 = pa.array([1, 2, 3], type=pa.int8())
    c_int16 = pa.array([1, 2, 3], type=pa.int16())
    c_int32 = pa.array([1, 2, 3], type=pa.int32())
    c_int64 = pa.array([1, 2, 3], type=pa.int64())

    c_uint8 = pa.array([1, 2, 3], type=pa.uint8())
    c_uint16 = pa.array([1, 2, 3], type=pa.uint16())
    c_uint32 = pa.array([1, 2, 3], type=pa.uint32())
    c_uint64 = pa.array([1, 2, 3], type=pa.uint64())

    # c_float16 = pa.array([np.float16(1.0), np.float16(2.0), np.float16(3.0)], type=pa.float16())
    c_float32 = pa.array([1.0, 2.0, 3.0], type=pa.float32())
    c_float64 = pa.array([1.0, 2.0, 3.0], type=pa.float64())

    c_time32 = pa.array([1, 2, 3], type=pa.time32('ms'))
    c_time64 = pa.array([1, 2, 3], type=pa.time64('ns'))
    c_timestamp = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.timestamp('ms'))
    c_date32 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date32())
    c_date64 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date64())

    c_string = pa.array(
        ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
        type=pa.string()
    )

    c_decimal128_8_3 = pa.array([1, 2, 3], type=pa.decimal128(8, 3))

    batch = pa.RecordBatch.from_arrays(
        [c_bool,
         c_int8, c_int16, c_int32, c_int64,
         c_uint8, c_uint16, c_uint32, c_uint64,
         # c_float16,
         c_float32, c_float64,
         c_time32, c_time64, c_timestamp, c_date32, c_date64,
         c_string,
         c_decimal128_8_3
         ],
        schema=my_schema
    )

    table = pa.Table.from_batches([batch])
    pq.write_table(table, 'example.parquet')
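After `main()` runs, the file can be read back; a usage sketch:

import pyarrow.parquet as pq

table2 = pq.read_table('example.parquet')
print(table2.schema)  # note: Parquet may map some types back differently (e.g. date64)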
Example #15
import numpy as np
import pyarrow as pa
import tensorflow as tf
from tfx_bsl.tfxio import tensor_adapter
from tfx_bsl.tfxio import tensor_to_arrow
from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow_metadata.proto.v0 import schema_pb2

_TF_TYPE_TO_ARROW_TYPE = {
    tf.int8: pa.int8(),
    tf.int16: pa.int16(),
    tf.int32: pa.int32(),
    tf.int64: pa.int64(),
    tf.uint8: pa.uint8(),
    tf.uint16: pa.uint16(),
    tf.uint32: pa.uint32(),
    tf.uint64: pa.uint64(),
    tf.float32: pa.float32(),
    tf.float64: pa.float64(),
    tf.string: pa.large_binary(),
}

_ROW_PARTITION_DTYPES = {"INT64": np.int64, "INT32": np.int32}


def _make_2d_dense_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
            tensor = tf.constant([[b"1", b"2"], [b"3", b"4"]], dtype=tf.string)
Example #16
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
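A minimal sketch of consuming one of these strategies in a property-based test (every pyarrow integer type's str() form, such as 'uint16', is a valid alias):

from hypothesis import given

@given(integer_types)
def test_integer_alias_roundtrip(ty):
    assert pa.type_for_alias(str(ty)) == ty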
Example #17
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'float64'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
Example #18
Copyright (C) 2018 Anthony Potappel, The Netherlands. All Rights Reserved.
This work is licensed under the terms of the MIT license (for details, see attached LICENSE file).
"""

import pyarrow as pa

_ENDIANNESS = '<'

_DTYPES_CONV = {
    _ENDIANNESS + 'f2': pa.float16(),
    _ENDIANNESS + 'f4': pa.float32(),
    _ENDIANNESS + 'f8': pa.float64(),
    _ENDIANNESS + 'i2': pa.int16(),
    _ENDIANNESS + 'i4': pa.int32(),
    _ENDIANNESS + 'i8': pa.int64(),
    _ENDIANNESS + 'u2': pa.uint16(),
    _ENDIANNESS + 'u4': pa.uint32(),
    _ENDIANNESS + 'u8': pa.uint64(),
    '|i1': pa.int8(),
    '|u1': pa.uint8(),
}
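# Sanity check (a sketch, not part of the original module): the keys above are
# NumPy byte-order dtype strings; on little-endian platforms np.dtype('u2').str
# is '<u2' and np.dtype('i1').str is '|i1'.
import numpy as np
assert _DTYPES_CONV[np.dtype('u2').str] == pa.uint16()
assert _DTYPES_CONV[np.dtype('i1').str] == pa.int8()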

_DTYPES_CONV_STR = {
    "float16": pa.float16(),
    "float32": pa.float32(),
    "float64": pa.float64(),
    "int16": pa.int16(),
    "int32": pa.int32(),
    "int64": pa.int64(),
    "uint16": pa.uint16(),
    "uint32": pa.uint32(),
Example #19
import ibis.expr.schema as sch

# TODO(kszucs): the following conversions are really rudimentary
# we should have a pyarrow backend which would be responsible
# for conversions between ibis types to pyarrow types

# TODO(kszucs): support nested and parametric types
# consolidate with the logic from the parquet backend

_to_ibis_dtypes = {
    pa.int8(): dt.Int8,
    pa.int16(): dt.Int16,
    pa.int32(): dt.Int32,
    pa.int64(): dt.Int64,
    pa.uint8(): dt.UInt8,
    pa.uint16(): dt.UInt16,
    pa.uint32(): dt.UInt32,
    pa.uint64(): dt.UInt64,
    pa.float16(): dt.Float16,
    pa.float32(): dt.Float32,
    pa.float64(): dt.Float64,
    pa.string(): dt.String,
    pa.binary(): dt.Binary,
    pa.bool_(): dt.Boolean,
}


@dt.dtype.register(pa.DataType)
def from_pyarrow_primitive(arrow_type, nullable=True):
    return _to_ibis_dtypes[arrow_type](nullable=nullable)
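With the registration in place, `dt.dtype` can be called directly on an arrow type (assuming `ibis.expr.datatypes` is imported as `dt` earlier in the module, as the decorator implies); a usage sketch:

assert dt.dtype(pa.uint16()) == dt.UInt16(nullable=True)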
Example #20
}
ARROW_SCALAR_IDS_TO_BQ = {
    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
    pyarrow.bool_().id: "BOOL",
    pyarrow.int8().id: "INT64",
    pyarrow.int16().id: "INT64",
    pyarrow.int32().id: "INT64",
    pyarrow.int64().id: "INT64",
    pyarrow.uint8().id: "INT64",
    pyarrow.uint16().id: "INT64",
    pyarrow.uint32().id: "INT64",
    pyarrow.uint64().id: "INT64",
    pyarrow.float16().id: "FLOAT64",
    pyarrow.float32().id: "FLOAT64",
    pyarrow.float64().id: "FLOAT64",
    pyarrow.time32("ms").id: "TIME",
    pyarrow.time64("ns").id: "TIME",
Example #21
class _TypeConverter(object):
    _CONVERTERS: Dict[pa.DataType, Any] = {
        pa.null(): _to_pynone,
        pa.string(): _to_pystr,
        pa.bool_(): _to_pybool,
        pa.int8(): _to_pyint,
        pa.int16(): _to_pyint,
        pa.int32(): _to_pyint,
        pa.int64(): _to_pyint,
        pa.uint8(): _to_pyint,
        pa.uint16(): _to_pyint,
        pa.uint32(): _to_pyint,
        pa.uint64(): _to_pyint,
        pa.float16(): _to_pyfloat,
        pa.float32(): _to_pyfloat,
        pa.float64(): _to_pyfloat,
        pa.date32(): _to_pydate,
        pa.binary(): _to_pybytes,
    }

    def __init__(
        self,
        schema: pa.Schema,
        copy: bool = True,
        deep: bool = False,
        str_as_json: bool = True,
    ):
        self._copy = copy
        self._deep = deep
        self._str_as_json = str_as_json

        self._to_pytype = [self._build_field_converter(f) for f in schema]

    def row_to_py(self, data: List[Any]) -> List[Any]:
        if not self._copy:
            for i in range(len(data)):
                data[i] = self._to_pytype[i](data[i])
            return data
        else:
            return [self._to_pytype[i](data[i]) for i in range(len(data))]

    def _build_field_converter(self, f: pa.Field) -> Callable[[Any], Any]:
        if f.type in _TypeConverter._CONVERTERS:
            return _TypeConverter._CONVERTERS[f.type]
        elif pa.types.is_timestamp(f.type):
            return _to_pydatetime
        elif pa.types.is_decimal(f.type):
            raise NotImplementedError("decimal conversion is not supported")
        elif pa.types.is_struct(f.type):
            if not self._deep:
                return lambda x: _assert_pytype(dict, x)
            else:
                converters = {
                    x.name: self._build_field_converter(x)
                    for x in list(f.type)
                }
                return lambda x: _to_pydict(converters, x, self._str_as_json)
        elif pa.types.is_list(f.type):
            if not self._deep:
                return lambda x: _assert_pytype(list, x)
            else:
                converter = self._build_field_converter(
                    pa.field("e", f.type.value_type))
                return lambda x: _to_pylist(
                    converter, x, self._copy, self._str_as_json)
        raise NotImplementedError(
            f"{f} type is not supported")  # pragma: no cover
Example #22
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest
import sys

import numpy as np
import pyarrow as pa

tensor_type_pairs = [('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()),
                     ('i8', pa.int64()), ('u1', pa.uint8()),
                     ('u2', pa.uint16()), ('u4', pa.uint32()),
                     ('u8', pa.uint64()), ('f2', pa.float16()),
                     ('f4', pa.float32()), ('f8', pa.float64())]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseTensorCSR,
    pa.SparseTensorCOO,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    data = np.array([
        [0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 3, 0, 0, 0],
Example #23
class TestAbstractFileParserStatics:
    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type,
                                       output_pyarrow_type):
        # Json -> PyArrow direction
        LOGGER.info(
            f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'..."
        )
        assert AbstractFileParser.json_type_to_pyarrow_type(
            input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(), ), "string"),  # null type
            ((pa.bool_(), ), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(),
                 pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(
                5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(),
              pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()),
             "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(
                pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()),
              pa.dictionary(pa.int16(), pa.list_(
                  pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types,
                                               output_json_type):
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(
                f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'..."
            )
            assert AbstractFileParser.json_type_to_pyarrow_type(
                typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {
                    "a": "string",
                    "b": "number",
                    "c": "integer",
                    "d": "object",
                    "e": "array",
                    "f": "boolean",
                    "g": "null"
                },
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({
                "single_column": "object"
            }, {
                "single_column": pa.large_string()
            }),
            ({}, {}),
            ({
                "a": "NOT A REAL TYPE",
                "b": "another fake type"
            }, {
                "a": pa.large_string(),
                "b": pa.large_string()
            }),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema, pyarrow_schema):
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(
                json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
                LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {
                    "a": "string",
                    "b": "number",
                    "c": "integer",
                    "d": "string",
                    "e": "boolean",
                    "f": "string"
                },
            ),
            ({
                "single_column": pa.int32()
            }, {
                "single_column": "integer"
            }),
            ({}, {}),
            ({
                "a": "NOT A REAL TYPE",
                "b": "another fake type"
            }, {
                "a": "string",
                "b": "string"
            }),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema,
                                                   json_schema):
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(
                pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(
                    pyarrow_schema, reverse=True)
                LOGGER.debug(str(e_info))
Example #24
    "null": pa.null(),
    "str": pa.string(),
    "string": pa.string(),
    "bool": pa.bool_(),
    "boolean": pa.bool_(),
    "int8": pa.int8(),
    "byte": pa.int8(),
    "int16": pa.int16(),
    "short": pa.int16(),
    "int32": pa.int32(),
    "int": pa.int32(),
    "long": pa.int64(),
    "int64": pa.int64(),
    "uint8": pa.uint8(),
    "ubyte": pa.uint8(),
    "uint16": pa.uint16(),
    "ushort": pa.uint16(),
    "uint32": pa.uint32(),
    "uint": pa.uint32(),
    "ulong": pa.uint64(),
    "uint64": pa.uint64(),
    "float16": pa.float16(),
    "float": pa.float32(),
    "float32": pa.float32(),
    "double": pa.float64(),
    "float64": pa.float64(),
    "date": pa.date32(),
    "datetime": TRIAD_DEFAULT_TIMESTAMP,
    "binary": pa.binary(),
    "bytes": pa.binary(),
}
Example #25
# under the License.

import pytest
import sys

import numpy as np
import pyarrow as pa


tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCOOTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    data = np.array([
        [0, 1, 0, 0, 1],
Example #26
    take_indices_on_pyarrow_list,
)

# fmt:on

PANDAS_GE_0_26_0 = LooseVersion(pd.__version__) >= "0.26.0"
if PANDAS_GE_0_26_0:
    from pandas.core.indexers import check_array_indexer

_python_type_map = {
    pa.null().id: six.text_type,
    pa.bool_().id: bool,
    pa.int8().id: int,
    pa.uint8().id: int,
    pa.int16().id: int,
    pa.uint16().id: int,
    pa.int32().id: int,
    pa.uint32().id: int,
    pa.int64().id: int,
    pa.uint64().id: int,
    pa.float16().id: float,
    pa.float32().id: float,
    pa.float64().id: float,
    pa.date32().id: datetime.date,
    pa.date64().id: datetime.date,
    pa.timestamp("ms").id: datetime.datetime,
    pa.binary().id: six.binary_type,
    pa.string().id: six.text_type,
    # Use any list type here, only LIST is important
    pa.list_(pa.string()).id: list,
    pa.large_list(pa.string()).id: list,
Example #27
def test_basics(fletcher_array):
    df = pd.DataFrame(
        {
            "null": fletcher_array(pa.array([None, None], type=pa.null())),
            "bool": fletcher_array(pa.array([None, True], type=pa.bool_())),
            "int8": fletcher_array(pa.array([None, -1], type=pa.int8())),
            "uint8": fletcher_array(pa.array([None, 1], type=pa.uint8())),
            "int16": fletcher_array(pa.array([None, -1], type=pa.int16())),
            "uint16": fletcher_array(pa.array([None, 1], type=pa.uint16())),
            "int32": fletcher_array(pa.array([None, -1], type=pa.int32())),
            "uint32": fletcher_array(pa.array([None, 1], type=pa.uint32())),
            "int64": fletcher_array(pa.array([None, -1], type=pa.int64())),
            "uint64": fletcher_array(pa.array([None, 1], type=pa.uint64())),
            "float16": fletcher_array(
                pa.array([None, np.float16(-0.1)], type=pa.float16())
            ),
            "float32": fletcher_array(pa.array([None, -0.1], type=pa.float32())),
            "float64": fletcher_array(pa.array([None, -0.1], type=pa.float64())),
            "date32": fletcher_array(
                pa.array([None, datetime.date(2010, 9, 8)], type=pa.date32())
            ),
            "date64": fletcher_array(
                pa.array([None, datetime.date(2010, 9, 8)], type=pa.date64())
            ),
            # https://github.com/pandas-dev/pandas/issues/34986
            # "timestamp[s]": fletcher_array(
            #     pa.array(
            #         [None, datetime.datetime(2013, 12, 11, 10, 9, 8)],
            #         type=pa.timestamp("s"),
            #     )
            # ),
            # "timestamp[ms]": fletcher_array(
            #     pa.array(
            #         [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 1000)],
            #         type=pa.timestamp("ms"),
            #     )
            # ),
            # "timestamp[us]": fletcher_array(
            #     pa.array(
            #         [None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)],
            #         type=pa.timestamp("us"),
            #     )
            # ),
            # FIXME: assert_extension_array_equal casts to numpy object thus cannot handle nanoseconds
            # 'timestamp[ns]': fletcher_array(pa.array([None, datetime.datetime(2013, 12, 11, 10, 9, 8, 7)], type=pa.timestamp("ns"))),
            "binary": fletcher_array(pa.array([None, b"122"], type=pa.binary())),
            "string": fletcher_array(pa.array([None, "🤔"], type=pa.string())),
            "duration[s]": fletcher_array(
                pa.array([None, datetime.timedelta(seconds=9)], type=pa.duration("s"))
            ),
            "duration[ms]": fletcher_array(
                pa.array(
                    [None, datetime.timedelta(milliseconds=8)], type=pa.duration("ms")
                )
            ),
            "duration[us]": fletcher_array(
                pa.array(
                    [None, datetime.timedelta(microseconds=7)], type=pa.duration("us")
                )
            ),
            # FIXME: assert_extension_array_equal casts to numpy object thus cannot handle nanoseconds
            # 'duration[ns]': fletcher_array(pa.array([None, datetime.timedelta(microseconds=7)], type=pa.duration("ns"))),
            "list[string]": fletcher_array(
                pa.array([None, [None, "🤔"]], type=pa.list_(pa.string()))
            ),
        }
    )
    ddf = dd.from_pandas(df, npartitions=2)

    meta_nonempty = ddf._meta_nonempty
    pdt.assert_frame_equal(meta_nonempty, df)

    result = ddf.compute()
    pdt.assert_frame_equal(result, df)
Example #28
class KmvSketchTest(parameterized.TestCase):
    @parameterized.named_parameters(
        ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
        ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
        ("string", ["a", "a", "b", "c", None], pa.string()),
        ("large_string", ["a", "a", "b", "c"], pa.large_string()),
        ("int8", [1, 1, 2, 3, None], pa.int8()),
        ("int16", [1, 1, 2, 3], pa.int16()),
        ("int32", [1, 1, 2, 3, None], pa.int32()),
        ("int64", [1, 1, 2, 3], pa.int64()),
        ("uint8", [1, 1, 2, 3], pa.uint8()),
        ("uint16", [1, None, 1, 2, 3], pa.uint16()),
        ("uint32", [1, 1, 2, 3], pa.uint32()),
        ("uint64", [1, 1, 2, 3, None], pa.uint64()),
    )
    def test_add(self, values, type_):
        sketch = _create_basic_sketch(pa.array(values, type=type_))
        num_unique = sketch.Estimate()

        self.assertEqual(3, num_unique)

    def test_add_unsupported_type(self):
        values = pa.array([True, False], pa.bool_())
        sketch = KmvSketch(_NUM_BUCKETS)
        with self.assertRaisesRegex(RuntimeError, "Unimplemented: bool"):
            sketch.AddValues(values)

    def test_merge(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]))

        sketch1.Merge(sketch2)
        num_unique = sketch1.Estimate()

        self.assertEqual(4, num_unique)

    def test_merge_error(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]), num_buckets=64)
        with self.assertRaisesRegex(
                Exception,
                "Both sketches must have the same number of buckets"):
            sketch1.Merge(sketch2)

    def test_picklable(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        pickled = pickle.dumps(sketch, 2)
        self.assertIsInstance(pickled, bytes)
        unpickled = pickle.loads(pickled)
        self.assertIsInstance(unpickled, KmvSketch)

        num_unique = unpickled.Estimate()
        self.assertEqual(3, num_unique)

    def test_serialization(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))

        serialized = sketch.Serialize()
        self.assertIsInstance(serialized, bytes)

        deserialized = KmvSketch.Deserialize(serialized)
        self.assertIsInstance(deserialized, KmvSketch)

        num_unique = deserialized.Estimate()
        self.assertEqual(3, num_unique)
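`_create_basic_sketch` is a module-level helper not shown in this excerpt. A minimal sketch consistent with how these tests call it:

def _create_basic_sketch(values, num_buckets=_NUM_BUCKETS):
    # Hypothetical reconstruction based on the calls above
    sketch = KmvSketch(num_buckets)
    sketch.AddValues(values)
    return sketch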
Example #29
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize(
    'pa_type,jvm_spec',
    [
        (pa.null(), '{"name":"null"}'),
        (pa.bool_(), '{"name":"bool"}'),
        (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
        (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
        (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
        (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
        (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
        (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
        (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
        (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
        (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
        (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
        (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
        (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
        (pa.time32('ms'),
         '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
        (pa.time64('us'),
         '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
        (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
        (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
         '"timezone":null}'),
        (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
         '"timezone":null}'),
Example #30
File: dtypes.py Project: Meryam1985/cudf
from cudf._lib.scalar import DeviceScalar
from cudf.core._compat import PANDAS_GE_120

_NA_REP = "<NA>"
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
Example #31
import pyarrow as pa

PYTHON_TYPE_ARROW_TYPE_MAP = {
    float: pa.float32(),
    int: pa.int32(),
    str: pa.string()
}

STR_TYPE_ARROW_TYPE_MAP = {
    'int8': pa.int8(),
    'int16': pa.int16(),
    'int32': pa.int32(),
    'int64': pa.int64(),
    'uint8': pa.uint8(),
    'uint16': pa.uint16(),
    'uint32': pa.uint32(),
    'uint64': pa.uint64(),
    'float32': pa.float32(),
    'float64': pa.float64(),
    'double': pa.float64(),
    'half_float': pa.float16(),
    'string': pa.string(),
    'binary': pa.binary(),
    'bool': pa.bool_(),
    'float': pa.float32(),
    'int': pa.int32(),
    'str': pa.string()
}

Example #32
         ("my_bool", pa.bool_()),
         ("my_nullable_bool", pa.bool_()),
         ("my_date", pa.date32()),
         ("my_datetime", pa.timestamp("s")),
         ("my_int", pa.uint8()),
         ("my_string", pa.string()),
     ]
 ),
 pa.schema(
     [
         ("i", pa.int16()),
         ("my_bool", pa.bool_()),
         ("my_nullable_bool", pa.bool_()),
         ("my_date", pa.date32()),
         ("my_datetime", pa.timestamp("ms")),
         ("my_int", pa.uint16()),
         ("my_string", pa.string()),
     ]
 ),
 pa.schema(
     [
         ("i", pa.int32()),
         ("my_bool", pa.bool_()),
         ("my_nullable_bool", pa.bool_()),
         ("my_date", pa.date64()),
         ("my_datetime", pa.timestamp("us")),
         ("my_int", pa.uint32()),
         ("my_string", pa.string()),
     ]
 ),
 pa.schema(
Example #33
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data, to map metadata types and values use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ +
                                       ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN: pa.bool_(),
        _meta.BasicType.INTEGER: pa.int64(),
        _meta.BasicType.FLOAT: pa.float64(),
        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION,
                                               __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING: pa.utf8(),
        _meta.BasicType.DATE: pa.date32(),
        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT,
                                               __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls,
                           trac_type: _meta.TypeDescriptor) -> pa.DataType:

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(
            trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
            )

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = [(f.fieldName,
                         cls.trac_to_arrow_basic_type(f.fieldType))
                        for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas for batch in (
                delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView,
                       part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls,
                        df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
Example #34
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
Example #35
import weakref

import numpy as np

import pyarrow as pa


@pytest.mark.parametrize(['value', 'ty', 'klass', 'deprecated'], [
    (False, None, pa.BooleanScalar, pa.BooleanValue),
    (True, None, pa.BooleanScalar, pa.BooleanValue),
    (1, None, pa.Int64Scalar, pa.Int64Value),
    (-1, None, pa.Int64Scalar, pa.Int64Value),
    (1, pa.int8(), pa.Int8Scalar, pa.Int8Value),
    (1, pa.uint8(), pa.UInt8Scalar, pa.UInt8Value),
    (1, pa.int16(), pa.Int16Scalar, pa.Int16Value),
    (1, pa.uint16(), pa.UInt16Scalar, pa.UInt16Value),
    (1, pa.int32(), pa.Int32Scalar, pa.Int32Value),
    (1, pa.uint32(), pa.UInt32Scalar, pa.UInt32Value),
    (1, pa.int64(), pa.Int64Scalar, pa.Int64Value),
    (1, pa.uint64(), pa.UInt64Scalar, pa.UInt64Value),
    (1.0, None, pa.DoubleScalar, pa.DoubleValue),
    (np.float16(1.0), pa.float16(), pa.HalfFloatScalar, pa.HalfFloatValue),
    (1.0, pa.float32(), pa.FloatScalar, pa.FloatValue),
    (decimal.Decimal("1.123"), None, pa.Decimal128Scalar, pa.Decimal128Value),
    (decimal.Decimal("1.1234567890123456789012345678901234567890"), None,
     pa.Decimal256Scalar, pa.Decimal256Value),
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
Example #36
class MisraGriesSketchTest(parameterized.TestCase):
    @parameterized.named_parameters(
        ("binary", [b"a", b"a", b"b", b"c", None], pa.binary()),
        ("large_binary", [b"a", b"a", b"b", b"c"], pa.large_binary()),
        ("string", ["a", "a", "b", "c", None], pa.string()),
        ("large_string", ["a", "a", "b", "c"], pa.large_string()),
    )
    def test_add_binary_like(self, values, binary_like_type):
        expected_counts = [{
            "values": b"a",
            "counts": 2.0
        }, {
            "values": b"b",
            "counts": 1.0
        }, {
            "values": b"c",
            "counts": 1.0
        }]
        sketch = _create_basic_sketch(pa.array(values, type=binary_like_type))
        estimate = sketch.Estimate().to_pylist()
        self.assertEqual(estimate, expected_counts)

    @parameterized.named_parameters(
        ("int8", [1, 1, 2, 3, None], pa.int8()),
        ("int16", [1, 1, 2, 3], pa.int16()),
        ("int32", [1, 1, 2, 3, None], pa.int32()),
        ("int64", [1, 1, 2, 3], pa.int64()),
        ("uint8", [1, 1, 2, 3], pa.uint8()),
        ("uint16", [1, None, 1, 2, 3], pa.uint16()),
        ("uint32", [1, 1, 2, 3], pa.uint32()),
        ("uint64", [1, 1, 2, 3, None], pa.uint64()),
    )
    def test_add_integer(self, values, integer_type):
        expected_counts = [{
            "values": b"1",
            "counts": 2.0
        }, {
            "values": b"2",
            "counts": 1.0
        }, {
            "values": b"3",
            "counts": 1.0
        }]
        sketch = _create_basic_sketch(pa.array(values, type=integer_type))
        estimate = sketch.Estimate().to_pylist()
        self.assertEqual(estimate, expected_counts)

    def test_add_weighted_values(self):
        items = pa.array(["a", "a", "b", "c"], type=pa.string())
        weights = pa.array([4, 3, 2, 1], type=pa.float32())
        sketch = _create_basic_sketch(items, weights=weights)

        expected_counts = [{
            "values": b"a",
            "counts": 7.0
        }, {
            "values": b"b",
            "counts": 2.0
        }, {
            "values": b"c",
            "counts": 1.0
        }]
        estimate = sketch.Estimate().to_pylist()

        self.assertEqual(estimate, expected_counts)

    def test_add_invalid_weights(self):
        items = pa.array(["a", "a", "b", "c"], type=pa.string())
        weights = pa.array([4, 3, 2, 1], type=pa.int64())
        with self.assertRaisesRegex(
                RuntimeError,
                "Invalid argument: Weight array must be float type."):
            _create_basic_sketch(items, weights=weights)

    def test_add_unsupported_type(self):
        values = pa.array([True, False], pa.bool_())
        sketch = MisraGriesSketch(_NUM_BUCKETS)
        with self.assertRaisesRegex(RuntimeError, "Unimplemented: bool"):
            sketch.AddValues(values)

    def test_merge(self):
        sketch1 = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        sketch2 = _create_basic_sketch(pa.array(["d", "a"]))

        sketch1.Merge(sketch2)
        estimate = sketch1.Estimate().to_pylist()
        expected_counts = [{
            "values": b"a",
            "counts": 3.0
        }, {
            "values": b"b",
            "counts": 1.0
        }, {
            "values": b"c",
            "counts": 1.0
        }, {
            "values": b"d",
            "counts": 1.0
        }]

        self.assertEqual(estimate, expected_counts)

    def test_picklable(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))
        pickled = pickle.dumps(sketch, 2)
        self.assertIsInstance(pickled, bytes)
        unpickled = pickle.loads(pickled)
        self.assertIsInstance(unpickled, MisraGriesSketch)

        estimate = unpickled.Estimate().to_pylist()
        expected_counts = [{
            "values": b"a",
            "counts": 2.0
        }, {
            "values": b"b",
            "counts": 1.0
        }, {
            "values": b"c",
            "counts": 1.0
        }]

        self.assertEqual(estimate, expected_counts)

    def test_serialization(self):
        sketch = _create_basic_sketch(pa.array(["a", "b", "c", "a"]))

        serialized = sketch.Serialize()
        self.assertIsInstance(serialized, bytes)

        deserialized = MisraGriesSketch.Deserialize(serialized)
        self.assertIsInstance(deserialized, MisraGriesSketch)

        estimate = deserialized.Estimate().to_pylist()
        expected_counts = [{
            "values": b"a",
            "counts": 2.0
        }, {
            "values": b"b",
            "counts": 1.0
        }, {
            "values": b"c",
            "counts": 1.0
        }]

        self.assertEqual(estimate, expected_counts)
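
# End-to-end usage sketch of the API exercised by the tests above (hedged:
# assumes the MisraGriesSketch bindings shown here; the bucket count 128 is
# arbitrary):
sketch = MisraGriesSketch(128)
sketch.AddValues(pa.array(["a", "a", "b"], type=pa.string()))
other = MisraGriesSketch(128)
other.AddValues(pa.array(["a", "c"], type=pa.string()))
sketch.Merge(other)
# Estimate() yields rows of {"values": bytes, "counts": float}; the sketch
# state survives a Serialize()/Deserialize() round trip.
restored = MisraGriesSketch.Deserialize(sketch.Serialize())
assert restored.Estimate().to_pylist()[0] == {"values": b"a", "counts": 3.0}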
Example #37
import numpy as np
import pandas as pd
import pyarrow as pa

import cudf
from cudf._lib.scalar import Scalar

_NA_REP = "<NA>"
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
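    # note: np.bool_ is mapped to pa.int8() here rather than pa.bool_()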
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
Example #38
    assert not tensor.is_mutable

def test_tensor_base_object():
    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
    n = sys.getrefcount(tensor)
    array = tensor.to_numpy()
    assert sys.getrefcount(tensor) == n + 1


@pytest.mark.parametrize('dtype_str,arrow_type', [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
])
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
    dtype = np.dtype(dtype_str)
    data = (100 * np.random.randn(10, 4)).astype(dtype)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == arrow_type

    repr(tensor)
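
# The roundtrip test is truncated above; a sketch of the full check it
# presumably performs, using only public pa.Tensor APIs:
def tensor_roundtrip_sketch():
    data = (100 * np.random.randn(10, 4)).astype(np.float32)
    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == pa.float32()
    assert tensor.shape == (10, 4)
    np.testing.assert_array_equal(tensor.to_numpy(), data)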
Example #39
        'b': 2
    }, None, {
        'a': 3,
        'b': 4
    }, None, {
        'a': 5,
        'b': 6
    }]),
]

numerical_arrow_types = [
    pa.int8(),
    pa.int16(),
    pa.int64(),
    pa.uint8(),
    pa.uint16(),
    pa.uint64(),
    pa.float32(),
    pa.float64()
]


@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_sum_array(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum() == 10
    assert pa.compute.sum(arr) == 10

    arr = pa.array([], type=arrow_type)
    assert arr.sum() == None  # noqa: E711
    assert pa.compute.sum(arr) == None  # noqa: E711
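
# Note: pa.compute.sum() returns a pyarrow Scalar, so the asserts above rely
# on Scalar equality against Python values; calling .as_py() is the more
# explicit spelling (a sketch, not part of the original test):
total = pa.compute.sum(pa.array([1, 2, 3, 4], type=pa.int64()))
assert total.as_py() == 10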
Example #40
# without invoking the JVM.
#
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
import collections
import datetime
import decimal
import itertools
import math
import traceback
import sys

import numpy as np
import pyarrow as pa
import pytz
import six

int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int16()),
                  (np.int32, pa.int32()), (np.int64, pa.int64()),
                  (np.uint8, pa.uint8()), (np.uint16, pa.uint16()),
                  (np.uint32, pa.uint32()), (np.uint64, pa.uint64())]

np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


class MyInt:
    def __init__(self, value):
Example #42
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
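
# The pa.types predicates are exact: each is_* check matches only its own
# type, while coarser predicates also exist (a sketch, beyond the test):
assert types.is_uint16(pa.uint16())
assert not types.is_uint16(pa.int16())
assert types.is_integer(pa.uint16())
assert types.is_unsigned_integer(pa.uint16())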
Example #43
import datetime
import decimal
import itertools
import math

import numpy as np
import pyarrow as pa
import pytz


int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()
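
# int_type_pairs pairs each NumPy integer dtype with its Arrow counterpart;
# a typical use is a conversion roundtrip check (a sketch):
for np_type, arrow_type in int_type_pairs:
    arr = pa.array(np.array([0, 1, 2], dtype=np_type))
    assert arr.type == arrow_type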

Example #44

def test_parquet_metadata_lifetime(tempdir):
    # ARROW-6642 - ensure that chained access keeps parent objects alive
    table = pa.table({'a': [1, 2, 3]})
    pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
    parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
    parquet_file.metadata.row_group(0).column(0).statistics
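
# A sketch of the access pattern the parametrized cases below verify
# (hypothetical helper; `tmp` stands in for the pytest tempdir fixture):
def _column_statistics_sketch(tmp):
    table = pa.table({'a': pa.array([1, 2, 2, None, 4], type=pa.uint8())})
    pq.write_table(table, tmp / 'stats.parquet')
    stats = (pq.ParquetFile(tmp / 'stats.parquet')
             .metadata.row_group(0).column(0).statistics)
    # matches the uint8 row below: min=1, max=4, one null among 4 values
    assert (stats.min, stats.max, stats.null_count) == (1, 4, 1)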


@pytest.mark.pandas
@pytest.mark.parametrize(
    ('data', 'type', 'physical_type', 'min_value', 'max_value', 'null_count',
     'num_values', 'distinct_count'), [
         ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
         ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
         ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
         ([-1.1, 2.2, 2.3, None, 4.4
           ], pa.float32(), 'FLOAT', -1.1, 4.4, 1, 4, 0),
         ([-1.1, 2.2, 2.3, None, 4.4
           ], pa.float64(), 'DOUBLE', -1.1, 4.4, 1, 4, 0),
         (['', 'b', chr(1000), None, 'aaa'], pa.binary(), 'BYTE_ARRAY', b'',
          chr(1000).encode('utf-8'), 1, 4, 0),
         ([True, False, False, True, True
           ], pa.bool_(), 'BOOLEAN', False, True, 0, 5, 0),
         ([b'\x00', b'b', b'12', None, b'aaa'
Example #45
custom_text = st.text(
    alphabet=st.characters(min_codepoint=0x41, max_codepoint=0x7E))

null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(),
     pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(),
     pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'),
     pa.time32('ms'),
     pa.time64('us'),
     pa.time64('ns')])
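
# These strategies compose with hypothesis' @given; a sketch (not from the
# original module) drawing an arbitrary integer type:
from hypothesis import given

@given(ty=integer_types)
def test_empty_array_for_any_integer_type(ty):
    assert len(pa.array([], type=ty)) == 0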
Example #46
import collections
import datetime
import decimal
import itertools
import numpy as np
import pyarrow as pa
import pytz
import six


int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()

Example #47
    np.arange(10, dtype=np.float16),
])
def test_to_numpy_roundtrip(narr):
    arr = pa.array(narr)
    assert narr.dtype == arr.to_numpy().dtype
    np.testing.assert_array_equal(narr, arr.to_numpy())
    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'empty'), (pa.bool_(), 'bool'), (pa.int8(), 'int8'),
     (pa.int16(), 'int16'), (pa.int32(), 'int32'), (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'), (pa.uint16(), 'uint16'), (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'), (pa.float16(), 'float16'),
     (pa.float32(), 'float32'), (pa.float64(), 'float64'),
     (pa.date32(), 'date'), (pa.date64(), 'date'), (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'), (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal128(18, 3), 'decimal'), (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'), (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2**63], type=pa.uint64())
    expected = pa.array(np.array([2**63], dtype='u8'))
Example #48
        in_dict[field] = i
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


@pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32),
                                          (pa.date64(), types.is_date64),
                                          (pa.time32('s'), types.is_time32),
                                          (pa.time64('ns'), types.is_time64),
                                          (pa.int8(), types.is_int8),
                                          (pa.int16(), types.is_int16),
                                          (pa.int32(), types.is_int32),
                                          (pa.int64(), types.is_int64),
                                          (pa.uint8(), types.is_uint8),
                                          (pa.uint16(), types.is_uint16),
                                          (pa.uint32(), types.is_uint32),
                                          (pa.uint64(), types.is_uint64),
                                          (pa.float16(), types.is_float16),
                                          (pa.float32(), types.is_float32),
                                          (pa.float64(), types.is_float64)])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)


def test_bit_width():
    for ty, expected in [(pa.bool_(), 1), (pa.int8(), 8), (pa.uint32(), 32),
                         (pa.float16(), 16), (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected
    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        # variable-width types have no fixed width; pyarrow raises ValueError
        with pytest.raises(ValueError, match='fixed width'):
            ty.bit_width
Example #49
     "INT64": pyarrow.int64,
     "INTEGER": pyarrow.int64,
     "NUMERIC": pyarrow_numeric,
     "STRING": pyarrow.string,
     "TIME": pyarrow_time,
     "TIMESTAMP": pyarrow_timestamp,
 }
 ARROW_SCALAR_IDS_TO_BQ = {
     # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
     pyarrow.bool_().id: "BOOL",
     pyarrow.int8().id: "INT64",
     pyarrow.int16().id: "INT64",
     pyarrow.int32().id: "INT64",
     pyarrow.int64().id: "INT64",
     pyarrow.uint8().id: "INT64",
     pyarrow.uint16().id: "INT64",
     pyarrow.uint32().id: "INT64",
     pyarrow.uint64().id: "INT64",
     pyarrow.float16().id: "FLOAT64",
     pyarrow.float32().id: "FLOAT64",
     pyarrow.float64().id: "FLOAT64",
     pyarrow.time32("ms").id: "TIME",
     pyarrow.time64("ns").id: "TIME",
     pyarrow.timestamp("ns").id: "TIMESTAMP",
     pyarrow.date32().id: "DATE",
     pyarrow.date64().id: "DATETIME",  # because millisecond resolution
     pyarrow.binary().id: "BYTES",
     pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
     pyarrow.decimal128(38, scale=9).id: "NUMERIC",
     # The exact decimal's scale and precision are not important, as only
     # the type ID matters, and it's the same for all decimal128 instances.
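
# Usage sketch (assumes the truncated dict above is eventually closed):
# look up the BigQuery type name for an Arrow scalar type via its type id.
assert ARROW_SCALAR_IDS_TO_BQ[pyarrow.uint16().id] == "INT64"
assert ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal128(10, 2).id] == "NUMERIC"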