示例#1
0
文件: jvm.py 项目: rok/arrow
def _from_jvm_time_type(jvm_type):
    """
    Convert a JVM time type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Time

    Returns
    -------
    typ: pyarrow.DataType

    Raises
    ------
    ValueError
        If the unit reported by ``jvm_type`` is not one of SECOND,
        MILLISECOND, MICROSECOND or NANOSECOND.
    """
    # JVM unit name -> (expected bit width, pyarrow unit string)
    known_units = {
        'SECOND': (32, 's'),
        'MILLISECOND': (32, 'ms'),
        'MICROSECOND': (64, 'us'),
        'NANOSECOND': (64, 'ns'),
    }

    time_unit = jvm_type.getUnit().toString()
    if time_unit not in known_units:
        # Fail loudly instead of implicitly returning None, which would
        # only surface later as a confusing error far from the cause.
        raise ValueError(
            'Unsupported JVM time unit: {}'.format(time_unit))

    bit_width, unit = known_units[time_unit]
    assert jvm_type.bitWidth == bit_width
    if bit_width == 32:
        return pa.time32(unit)
    return pa.time64(unit)
示例#2
0
文件: test_types.py 项目: rok/arrow
def test_time64_units():
    """time64 accepts the units 'us' and 'ns' and rejects 'm', 's', 'ms'."""
    accepted = ('us', 'ns')
    rejected = ('m', 's', 'ms')

    for unit in accepted:
        assert pa.time64(unit).unit == unit

    for unit in rejected:
        expected_message = 'Invalid TimeUnit for time64: {}'.format(unit)
        with pytest.raises(ValueError, match=expected_message):
            pa.time64(unit)
def test_date_time_types():
    """
    Round-trip a table of date, time and timestamp columns, then check
    that writing a time64[ns] column raises NotImplementedError.

    The round trip is delegated to ``_check_roundtrip`` (defined outside
    this block) with ``version='2.0'``. The expected table differs from
    the input in two columns, as noted at its construction below.
    """
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    # Same values as data1, scaled by 86400000 (milliseconds per day).
    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    # Timestamp.value divided by 1000 to feed a timestamp[us] array.
    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    # What a6 is expected to look like after the round trip: the same
    # values multiplied by 1000 and typed as time32[ms].
    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    # NOTE(review): the column labels do not match the array units -- a4
    # is time32[ms] but labelled 'time32[s]', and a6 is time32[s] but
    # labelled 'time32_from64[s]'. Only the labels are affected.
    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # Expected differences after the round trip:
    # date64 as date32 (a1 stands in for a2)
    # time32[s] to time32[ms] (ex_a6 stands in for a6)
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Types the writer is expected to reject
    def _assert_unsupported(array):
        """Assert that writing a one-column table of `array` raises
        NotImplementedError."""
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
    def test_arrow_time_to_pandas(self):
        """
        Check ``to_pandas()`` for time32/time64 arrays and a record batch.

        The same three times (the last one masked as null) are stored at
        four resolutions; each array, and a batch holding all four, must
        convert to the expected ``datetime.time`` objects.
        """
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356),
                   time(0, 0, 0)]

        # The third entry is masked out below, hence None in every
        # expected array.
        expected = np.array(pytimes[:2] + [None])
        # Integer truncation below turns 1356 us into 1 ms (1000 us) ...
        expected_ms = np.array([x.replace(microsecond=1000)
                                for x in pytimes[:2]] +
                               [None])
        # ... and into 0 fractional seconds.
        expected_s = np.array([x.replace(microsecond=0)
                               for x in pytimes[:2]] +
                              [None])

        # Raw integer values fed to the time64[us] array; the coarser and
        # finer resolutions are derived from them by scaling.
        micros = np.array([_pytime_to_micros(v) for v in pytimes],
                          dtype='int64')

        null_mask = np.array([False, False, True], dtype=bool)

        a1 = pa.array(micros, mask=null_mask, type=pa.time64('us'))
        a2 = pa.array(micros * 1000, mask=null_mask,
                      type=pa.time64('ns'))

        a3 = pa.array((micros / 1000).astype('i4'), mask=null_mask,
                      type=pa.time32('ms'))
        a4 = pa.array((micros / 1000000).astype('i4'), mask=null_mask,
                      type=pa.time32('s'))

        names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
        batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)

        # Array-level conversion, one resolution at a time.
        result = a1.to_pandas()
        assert (result == expected).all()

        result = a2.to_pandas()
        assert (result == expected).all()

        result = a3.to_pandas()
        assert (result == expected_ms).all()

        result = a4.to_pandas()
        assert (result == expected_s).all()

        # Batch-level conversion must agree with the per-array results.
        df = batch.to_pandas()
        expected_df = pd.DataFrame({'time64[us]': expected,
                                    'time64[ns]': expected,
                                    'time32[ms]': expected_ms,
                                    'time32[s]': expected_s},
                                   columns=names)

        tm.assert_frame_equal(df, expected_df)
示例#5
0
def test_cast_time64_to_int():
    """Casting a time64[us] array to 'i8' yields the same integer values."""
    raw_values = np.array([0, 1, 2], dtype='int64')
    time_array = pa.array(raw_values, type=pa.time64('us'))

    as_int = time_array.cast('i8')

    assert as_int.equals(pa.array([0, 1, 2], type='i8'))
示例#6
0
def test_type_schema_pickling():
    """Data types, a field and a schema built from them survive pickling."""
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([pa.field('a', 'int8'), pa.field('b', 'string')]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
    ]

    # Every case must compare equal to its own pickle round trip.
    for val in cases:
        restored = pickle.loads(pickle.dumps(val))
        assert val == restored

    # Wrap bare types in generated fields; keep existing fields as they are.
    fields = [
        case if isinstance(case, pa.Field)
        else pa.field('_f{}'.format(position), case)
        for position, case in enumerate(cases)
    ]

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    restored_schema = pickle.loads(pickle.dumps(schema))
    assert schema == restored_schema
示例#7
0
def test_time_types():
    """Check unit, string form and unit validation of time32/time64."""
    seconds_type = pa.time32('s')
    nanos_type = pa.time64('ns')
    expected_units = [
        (seconds_type, 's'),
        (pa.time32('ms'), 'ms'),
        (pa.time64('us'), 'us'),
        (nanos_type, 'ns'),
    ]
    for time_type, unit in expected_units:
        assert time_type.unit == unit

    assert str(seconds_type) == 'time32[s]'
    assert str(nanos_type) == 'time64[ns]'

    # Each factory must reject a unit that belongs to the other bit width.
    for factory, wrong_unit in ((pa.time32, 'us'), (pa.time64, 's')):
        with pytest.raises(ValueError):
            factory(wrong_unit)
示例#8
0
def test_type_for_alias():
    """pa.type_for_alias() resolves every listed alias to its data type."""
    expected_by_alias = {
        # signed integers
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        # unsigned integers
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        # floating point
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        # dates, strings and binary
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        # times and timestamps
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
    }

    for alias, expected_type in expected_by_alias.items():
        assert pa.type_for_alias(alias) == expected_type
    def test_pytime_from_pandas(self):
        """
        Check conversion of datetime.time objects and raw integers into
        Arrow time arrays.
        """
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356)]
        first_time = pytimes[0]

        # An object array of times (plus a None) must be typed time64[us].
        time_objects = np.array(pytimes + [None], dtype=object)
        inferred = pa.array(time_objects)
        assert inferred.type == pa.time64('us')
        assert inferred[0].as_py() == first_time
        assert inferred[1].as_py() == pytimes[1]
        assert inferred[2] is pa.NA

        # The same objects as a DataFrame column give an equal array.
        frame = pd.DataFrame({'times': time_objects})
        from_frame = pa.RecordBatch.from_pandas(frame)
        assert from_frame[0].equals(inferred)

        # Integer ndarrays combined with an explicit time type.
        micros = np.array([_pytime_to_micros(v) for v in pytimes],
                          dtype='int64')

        as_us = pa.array(micros, type=pa.time64('us'))
        assert as_us[0].as_py() == first_time

        as_ns = pa.array(micros * 1000, type=pa.time64('ns'))
        assert as_ns[0].as_py() == first_time

        # Coarser units lose the sub-unit part of the 1356 microseconds.
        as_ms = pa.array((micros / 1000).astype('i4'),
                         type=pa.time32('ms'))
        assert as_ms[0].as_py() == first_time.replace(microsecond=1000)

        as_s = pa.array((micros / 1000000).astype('i4'),
                        type=pa.time32('s'))
        assert as_s[0].as_py() == first_time.replace(microsecond=0)
示例#10
0
def test_types_hashable():
    """Data types hash consistently and can be used as dictionary keys."""
    struct_fields = [pa.field('a', pa.int32()),
                     pa.field('b', pa.int8()),
                     pa.field('c', pa.string())]
    hashable_types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct(struct_fields),
    ]

    position_by_type = {}
    for position, data_type in enumerate(hashable_types):
        assert hash(data_type) == hash(data_type)
        position_by_type[data_type] = position
        assert position_by_type[data_type] == position
示例#11
0
文件: test_types.py 项目: rok/arrow
def get_many_types():
    """Return a tuple of assorted pyarrow data types."""
    # The types are built inside a function rather than at import time
    # because the pa.dictionary type holds a pyarrow array, and
    # test_array.py::test_total_bytes_allocated checks that the default
    # memory pool has zero allocated bytes.
    many_types = (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string()),
        ]),
        pa.struct([
            pa.field('a', pa.int32(), nullable=False),
            pa.field('b', pa.int8(), nullable=False),
            pa.field('c', pa.string()),
        ]),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.binary(10), nullable=False),
             pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string()),
    )
    return many_types
示例#12
0
def test_is_temporal_date_time_timestamp():
    """Check is_temporal and the date/time/timestamp type predicates."""
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for temporal in date_types + time_types + timestamp_types:
        assert types.is_temporal(temporal)

    # (members, predicate that must hold, predicates that must not hold)
    groups = [
        (date_types, types.is_date,
         (types.is_time, types.is_timestamp)),
        (time_types, types.is_time,
         (types.is_date, types.is_timestamp)),
        (timestamp_types, types.is_timestamp,
         (types.is_date, types.is_time)),
    ]
    for members, own_check, other_checks in groups:
        for member in members:
            assert own_check(member)
            for other_check in other_checks:
                assert not other_check(member)

    assert not types.is_temporal(pa.int32())
示例#13
0
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize(
    ('t', 'check_func'),
    [
        (pa.date32(), types.is_date32),
        (pa.date64(), types.is_date64),
        (pa.time32('s'), types.is_time32),
        (pa.time64('ns'), types.is_time64),
        (pa.int8(), types.is_int8),
        (pa.int16(), types.is_int16),
        (pa.int32(), types.is_int32),
        (pa.int64(), types.is_int64),
        (pa.uint8(), types.is_uint8),
        (pa.uint16(), types.is_uint16),
        (pa.uint32(), types.is_uint32),
        (pa.uint64(), types.is_uint64),
        (pa.float16(), types.is_float16),
        (pa.float32(), types.is_float32),
        (pa.float64(), types.is_float64),
    ]
)
def test_exact_primitive_types(t, check_func):
    """Each exact-type predicate must accept an instance of its own type."""
    is_expected_type = check_func(t)
    assert is_expected_type
示例#14
0
])
# decimal128 types with precision and scale each drawn from 1..38
decimal_type = st.builds(pa.decimal128,
                         precision=st.integers(min_value=1, max_value=38),
                         scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

# Temporal types: dates, times and timestamps in every listed unit
date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
timestamp_types = st.builds(pa.timestamp,
                            unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
                            tz=tzst.timezones())
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(
    null_type,
    bool_type,
    binary_type,
    string_type,
    numeric_types,
示例#15
0
 (pa.null(), '{"name":"null"}'),
 (pa.bool_(), '{"name":"bool"}'),
 (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
 (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
 (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
 (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
 (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
 (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
 (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
 (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
 (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
 (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
 (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
 (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
 (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
 (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
     '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
     '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
     '"timezone":null}'),
 (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
     ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
     '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
 (pa.date32(), '{"name":"date","unit":"DAY"}'),
 (pa.date64(), '{"name":"date","unit":"MILLISECOND"}'),
示例#16
0
def test_date_time_types():
    """
    Round-trip a table of date, time and timestamp columns under several
    writer options, then check that writing a time64[ns] column raises
    NotImplementedError.

    The round trips are delegated to ``_check_roundtrip`` (defined outside
    this block). The expected tables differ from the input as noted at
    their construction below.
    """
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    # Same values as data1, scaled by 86400000 (milliseconds per day).
    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    # Timestamp.value divided by 1000 to feed a timestamp[us] array.
    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    # What a6 is expected to look like after the round trip: the same
    # values multiplied by 1000 and typed as time32[ms].
    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    # Timestamps spaced 1000 apart, fed unscaled to a timestamp[ns] array.
    # Note that `start` is rebound here.
    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.Array.from_pandas(data7, type=t7)

    # The same instants as a7, floor-divided by 1000 for timestamp[us].
    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)

    # NOTE(review): the column labels do not match the array units -- a4
    # is time32[ms] but labelled 'time32[s]', and a6 is time32[s] but
    # labelled 'time32_from64[s]'. Only the labels are affected.
    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # Expected differences after a default round trip:
    # date64 as date32 (a1 stands in for a2)
    # time32[s] to time32[ms] (ex_a6 stands in for a6)
    # 'timestamp[ns]' to 'timestamp[us]' (a7_us stands in for a7)
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Expected differences with int96 timestamps:
    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp (a7 comes back unchanged)
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    # (the same expected table as the explicit int96 option above)
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Types the writer is expected to reject
    def _assert_unsupported(array):
        """Assert that writing a one-column table of `array` raises
        NotImplementedError."""
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    # t7 and a7 are rebound here; the timestamp[ns] array built above is
    # no longer used past this point.
    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
示例#17
0
    # Lookup table from the ``id`` of a pyarrow scalar type to a type name
    # string ("BQ" presumably stands for BigQuery -- confirm against the
    # enclosing module). Several pyarrow types share one target name: all
    # integer widths map to "INT64" and all float widths to "FLOAT64".
    ARROW_SCALAR_IDS_TO_BQ = {
        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
        pyarrow.bool_().id: "BOOL",
        pyarrow.int8().id: "INT64",
        pyarrow.int16().id: "INT64",
        pyarrow.int32().id: "INT64",
        pyarrow.int64().id: "INT64",
        pyarrow.uint8().id: "INT64",
        pyarrow.uint16().id: "INT64",
        pyarrow.uint32().id: "INT64",
        pyarrow.uint64().id: "INT64",
        pyarrow.float16().id: "FLOAT64",
        pyarrow.float32().id: "FLOAT64",
        pyarrow.float64().id: "FLOAT64",
        pyarrow.time32("ms").id: "TIME",
        pyarrow.time64("ns").id: "TIME",
        pyarrow.timestamp("ns").id: "TIMESTAMP",
        pyarrow.date32().id: "DATE",
        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
        pyarrow.binary().id: "BYTES",
        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
        # The exact decimal's scale and precision are not important, as only
        # the type ID matters, and it's the same for all decimal128 instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO_COVER

示例#18
0
    )

    # Order is important, because slices of this tuple are meaningful:
    # _string_like[:2] is (string, large_string), while
    # _string_like[::2] is (string, binary).
    _string_like = (
        pyarrow.string(),
        pyarrow.large_string(),
        pyarrow.binary(),
        pyarrow.large_binary(),
    )

    # Lookup table from pyarrow date/time/timestamp/duration types to a
    # (bool, numpy dtype) pair. Dates, times and timestamps use NumPy
    # datetime64 ("M8") dtypes; durations use timedelta64 ("m8") dtypes.
    # NOTE(review): the meaning of the bool is not visible in this block;
    # it is True only for date32 and the two time32 types -- confirm
    # against the code that reads this table.
    _pyarrow_to_numpy_dtype = {
        pyarrow.date32(): (True, np.dtype("M8[D]")),
        pyarrow.date64(): (False, np.dtype("M8[ms]")),
        pyarrow.time32("s"): (True, np.dtype("M8[s]")),
        pyarrow.time32("ms"): (True, np.dtype("M8[ms]")),
        pyarrow.time64("us"): (False, np.dtype("M8[us]")),
        pyarrow.time64("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.timestamp("s"): (False, np.dtype("M8[s]")),
        pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")),
        pyarrow.timestamp("us"): (False, np.dtype("M8[us]")),
        pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.duration("s"): (False, np.dtype("m8[s]")),
        pyarrow.duration("ms"): (False, np.dtype("m8[ms]")),
        pyarrow.duration("us"): (False, np.dtype("m8[us]")),
        pyarrow.duration("ns"): (False, np.dtype("m8[ns]")),
    }

if not ak._v2._util.numpy_at_least("1.17.0"):

    def packbits(bytearray, lsb_order=True):
        if lsb_order:
示例#19
0
def test_in_expr_todo():
    """
    Exercise gandiva IN-expression filters on binary, timestamp, time
    and date columns.

    For each kind of value a single-column table is built, an IN filter
    over that column is evaluated on the first record batch, and the
    selected row indices are compared with the expected ones.
    """
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    def _rows_in(values, candidates, value_type):
        """Return the row indices of `values` selected by an IN filter
        over `candidates`, declared with Arrow type `value_type`."""
        table = pa.Table.from_arrays([pa.array(values)], ["a"])

        builder = gandiva.TreeExprBuilder()
        node_a = builder.make_field(table.schema.field("a"))
        cond = builder.make_in_expression(node_a, candidates, value_type)
        condition = builder.make_condition(cond)

        in_filter = gandiva.make_filter(table.schema, condition)
        result = in_filter.evaluate(table.to_batches()[0],
                                    pa.default_memory_pool())
        return result.to_array()

    # binary
    selected = _rows_in([b"ga", b"an", b"nd", b"di", b"iv", b"va"],
                        [b'an', b'nd'], pa.binary())
    assert selected.equals(pa.array([1, 2], type=pa.uint32()))

    # timestamp
    datetime_1 = datetime.datetime.utcfromtimestamp(1542238951.621877)
    datetime_2 = datetime.datetime.utcfromtimestamp(1542238911.621877)
    datetime_3 = datetime.datetime.utcfromtimestamp(1542238051.621877)

    # NOTE(review): the IN type is timestamp[ms] although the values carry
    # microseconds -- confirm this is the intended type for the column.
    selected = _rows_in([datetime_1, datetime_2, datetime_3],
                        [datetime_2], pa.timestamp('ms'))
    assert list(selected) == [1]

    # time
    time_1 = datetime_1.time()
    time_2 = datetime_2.time()
    time_3 = datetime_3.time()

    # time64 only accepts 'us' and 'ns'; datetime.time objects convert to
    # time64[us], so the IN expression must use that unit as well.
    selected = _rows_in([time_1, time_2, time_3],
                        [time_2], pa.time64('us'))
    assert list(selected) == [1]

    # date
    date_1 = datetime_1.date()
    date_2 = datetime_2.date()
    date_3 = datetime_3.date()

    selected = _rows_in([date_1, date_2, date_3],
                        [date_2], pa.date32())
    assert list(selected) == [1]
def pyarrow_time():
    """Return the pyarrow time64 type with microsecond ("us") unit."""
    microsecond_time = pyarrow.time64("us")
    return microsecond_time
示例#21
0
 (pa.bool_(), '{"name":"bool"}'),
 (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
 (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
 (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
 (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
 (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
 (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
 (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
 (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
 (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
 (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
 (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
 (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
 (pa.time32('ms'),
  '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
 (pa.time64('us'),
  '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
 (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
 (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
  '"timezone":null}'),
 (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
  '"timezone":null}'),
 (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
  '"timezone":null}'),
 (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
  '"timezone":null}'),
 (pa.timestamp('ns',
               tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
  ',"timezone":"UTC"}'),
 (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
  '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
示例#22
0
 "INT64",
 pyarrow.uint16().id:
 "INT64",
 pyarrow.uint32().id:
 "INT64",
 pyarrow.uint64().id:
 "INT64",
 pyarrow.float16().id:
 "FLOAT64",
 pyarrow.float32().id:
 "FLOAT64",
 pyarrow.float64().id:
 "FLOAT64",
 pyarrow.time32("ms").id:
 "TIME",
 pyarrow.time64("ns").id:
 "TIME",
 pyarrow.timestamp("ns").id:
 "TIMESTAMP",
 pyarrow.date32().id:
 "DATE",
 pyarrow.date64().id:
 "DATETIME",  # because millisecond resolution
 pyarrow.binary().id:
 "BYTES",
 pyarrow.string().id:
 "STRING",  # also alias for pyarrow.utf8()
 pyarrow.decimal128(38, scale=9).id:
 "NUMERIC",
 # The exact decimal's scale and precision are not important, as only
 # the type ID matters, and it's the same for all decimal128 instances.
示例#23
0
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index: bool
        Append an '__index_level_0__' int64 field to the returned schema.
    parquet_compatible: bool
        Exclude types not supported by parquet

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    arrays = OrderedDict()
    fields = []

    # Each column mixes plain lists, None, an empty list and (for the
    # numeric columns) a strided NumPy array.
    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    # (value type, column data) pairs; the same Python objects back every
    # resolution of a given temporal kind.
    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        # time64[ns] is only added when parquet compatibility is not
        # requested.
        temporal_pairs.append((pa.time64('ns'), time_data))

    for value_type, data in temporal_pairs:
        # The column name is derived from the type's string form.
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        # Only the schema gains the index field; df itself is unchanged.
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
示例#24
0
def test_cast_time64_to_int():
    """Casting a time64[us] array to 'i8' yields the same integer values."""
    micros = np.array([0, 1, 2], dtype='int64')
    time_values = pa.array(micros, type=pa.time64('us'))
    expected_ints = pa.array([0, 1, 2], type='i8')

    assert time_values.cast('i8').equals(expected_ints)
示例#25
0
    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time'),
    ]
)
def test_logical_type(type, expected):
    """get_logical_type() must return the expected name for each type."""
    logical_name = get_logical_type(type)
    assert logical_name == expected


def test_array_uint64_from_py_over_range():
    """A Python int of 2**63 can be stored in a uint64 array."""
    too_big_for_int64 = 2**63
    from_python = pa.array([too_big_for_int64], type=pa.uint64())
    from_numpy = pa.array(np.array([too_big_for_int64], dtype='u8'))
    assert from_python.equals(from_numpy)


def test_array_conversions_no_sentinel_values():
    """Check that pa.array() on a NumPy int8 array leaves exactly one
    additional reference to that array."""
    arr = np.array([1, 2, 3, 4], dtype='int8')
    # Baseline reference count, taken before the conversion.
    refcount = sys.getrefcount(arr)
    # arr2 is unused but must stay bound: the assertion below expects it
    # to account for one extra reference to arr (presumably because the
    # Arrow array keeps the NumPy buffer alive -- TODO confirm).
    arr2 = pa.array(arr)  # noqa
    assert sys.getrefcount(arr) == (refcount + 1)
示例#26
0
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    """A Python int of 2 ** 63 can be stored in a uint64 array."""
    above_int64_max = 2 ** 63
    built_from_python = pa.array([above_int64_max], type=pa.uint64())
    built_from_numpy = pa.array(np.array([above_int64_max], dtype='u8'))
    assert built_from_python.equals(built_from_numpy)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
示例#27
0
# Integer and floating point types
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
# decimal128 types with precision and scale each drawn from 1..38
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

# Temporal types: dates, times, timestamps and durations
date_types = st.sampled_from([
    pa.date32(),
    pa.date64()
])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones()
)
duration_types = st.builds(
    pa.duration,
    st.sampled_from(['s', 'ms', 'us', 'ns'])
)
temporal_types = st.one_of(
    date_types,
    time_types,
    timestamp_types,
    duration_types
)

primitive_types = st.one_of(
    null_type,
    bool_type,
    binary_type,
    string_type,
    large_binary_type,
    large_string_type,
    numeric_types,
    temporal_types
)

# Metadata is a dictionary with text keys and text values
metadata = st.dictionaries(st.text(), st.text())
示例#28
0
# specific language governing permissions and limitations
# under the License.

import pickle

import pytest

import pyarrow as pa
import pyarrow.types as types

MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
示例#29
0
def test_date_time_types(tempdir):
    """Round-trip date, time and timestamp columns through Parquet.

    Part one round-trips a mixed table via ``_check_roundtrip`` and expects
    two columns to come back changed: the date64 column is read back as
    date32, and the time32[s] column is read back as time32[ms].

    Part two writes ms/us/ns timestamp columns three ways into *tempdir*
    and checks both the Parquet physical type and the values read back:

    * default options: INT64, values unchanged;
    * ``use_deprecated_int96_timestamps=True``: INT96, read back as ns;
    * ``flavor='spark'``: INT96, read back as ns.
    """
    # ---- Part one: mixed date/time table --------------------------------
    day_numbers = np.array([17259, 17260, 17261], dtype='int32')
    date32_arr = pa.array(day_numbers, type=pa.date32())
    # date64 holds milliseconds; 86400000 ms per day.
    date64_arr = pa.array(day_numbers.astype('int64') * 86400000,
                          type=pa.date64())

    origin_us = pd.Timestamp('2001-01-01').value / 1000
    ts_us_arr = pa.array(
        np.array([origin_us, origin_us + 1, origin_us + 2], dtype='int64'),
        type=pa.timestamp('us'))

    small_ints = np.arange(3, dtype='i4')
    time32_ms_arr = pa.array(small_ints, type=pa.time32('ms'))
    time64_us_arr = pa.array(small_ints.astype('int64'),
                             type=pa.time64('us'))
    time32_s_arr = pa.array(small_ints, type=pa.time32('s'))
    # The seconds column is expected back as milliseconds.
    time32_s_as_ms_arr = pa.array(small_ints * 1000, type=pa.time32('ms'))

    origin_ns = pd.Timestamp('2001-01-01').value
    ts_ns_arr = pa.array(
        np.array([origin_ns, origin_ns + 1000, origin_ns + 2000],
                 dtype='int64'),
        type=pa.timestamp('ns'))

    # NOTE: the column labelled 'time32[s]' actually carries the time32[ms]
    # array; the labels are kept exactly as they were.
    mixed_names = [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ]
    mixed_table = pa.Table.from_arrays(
        [date32_arr, date64_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_arr, ts_ns_arr],
        mixed_names)
    mixed_expected = pa.Table.from_arrays(
        [date32_arr, date32_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_as_ms_arr, ts_ns_arr],
        mixed_names)

    _check_roundtrip(mixed_table, expected=mixed_expected, version='2.0')

    # ---- Part two: timestamp storage formats ----------------------------
    ticks_ms = np.arange(4, dtype='int64')
    ticks_us = np.arange(4, dtype='int64')
    ticks_ns = np.arange(4, dtype='int64')
    ts_ms_col = pa.array(ticks_ms, type=pa.timestamp('ms'))
    ts_us_col = pa.array(ticks_us, type=pa.timestamp('us'))
    ts_ns_col = pa.array(ticks_ns, type=pa.timestamp('ns'))

    ts_names = ['ts[ms]', 'ts[us]', 'ts[ns]']
    ts_table = pa.Table.from_arrays([ts_ms_col, ts_us_col, ts_ns_col],
                                    ts_names)

    def _write_and_verify(basename, physical_type, expected_table,
                          **write_options):
        # Write ts_table, check every column's physical type, then compare
        # the table read back with expected_table.
        path = tempdir / basename
        _write_table(ts_table, path, version='2.0', **write_options)
        file_schema = pq.ParquetFile(path).schema
        for column_index in range(3):
            column = file_schema.column(column_index)
            assert column.physical_type == physical_type
        assert _read_table(path).equals(expected_table)

    # int64 for all timestamps supported by default
    unchanged = pa.Table.from_arrays([ts_ms_col, ts_us_col, ts_ns_col],
                                     ts_names)
    _write_and_verify('int64_timestamps.parquet', 'INT64', unchanged)

    # With int96 storage every column is read back in nanoseconds, so the
    # ms and us values are scaled up accordingly.
    ms_as_ns = pa.array(np.array(ticks_ms * 1000000, dtype='int64'),
                        type=pa.timestamp('ns'))
    us_as_ns = pa.array(np.array(ticks_us * 1000, dtype='int64'),
                        type=pa.timestamp('ns'))
    all_ns = pa.Table.from_arrays([ms_as_ns, us_as_ns, ts_ns_col], ts_names)

    # int96 nanosecond timestamps produced upon request
    _write_and_verify('explicit_int96_timestamps.parquet', 'INT96', all_ns,
                      use_deprecated_int96_timestamps=True)

    # int96 nanosecond timestamps implied by flavor 'spark'
    _write_and_verify('spark_int96_timestamps.parquet', 'INT96', all_ns,
                      flavor='spark')
示例#30
0
def test_date_time_types():
    """Round-trip date, time and timestamp columns through Parquet.

    The same input table is round-tripped three times:

    * default options: date64 comes back as date32, time32[s] as
      time32[ms] and timestamp[ns] as timestamp[us];
    * ``use_deprecated_int96_timestamps=True``: as above, except that the
      nanosecond timestamps are preserved;
    * ``flavor='spark'``: same expectation as the int96 case.

    Finally, writing a time64[ns] column is expected to raise
    ``NotImplementedError``.
    """
    day_numbers = np.array([17259, 17260, 17261], dtype='int32')
    date32_arr = pa.array(day_numbers, type=pa.date32())
    # date64 holds milliseconds; 86400000 ms per day.
    date64_arr = pa.array(day_numbers.astype('int64') * 86400000,
                          type=pa.date64())

    origin_us = pd.Timestamp('2000-01-01').value / 1000
    ts_us_arr = pa.array(
        np.array([origin_us, origin_us + 1, origin_us + 2], dtype='int64'),
        type=pa.timestamp('us'))

    small_ints = np.arange(3, dtype='i4')
    time32_ms_arr = pa.array(small_ints, type=pa.time32('ms'))
    time64_us_arr = pa.array(small_ints.astype('int64'),
                             type=pa.time64('us'))
    time32_s_arr = pa.array(small_ints, type=pa.time32('s'))
    # The seconds column is expected back as milliseconds.
    time32_s_as_ms_arr = pa.array(small_ints * 1000, type=pa.time32('ms'))

    origin_ns = pd.Timestamp('2001-01-01').value
    ns_values = np.array([origin_ns, origin_ns + 1000, origin_ns + 2000],
                         dtype='int64')
    ts_ns_arr = pa.array(ns_values, type=pa.timestamp('ns'))
    # Same instants expressed in microseconds.
    ts_ns_as_us_arr = pa.array(ns_values // 1000, type=pa.timestamp('us'))

    # NOTE: the column labelled 'time32[s]' actually carries the time32[ms]
    # array; the labels are kept exactly as they were.
    column_names = ['date32', 'date64', 'timestamp[us]',
                    'time32[s]', 'time64[us]',
                    'time32_from64[s]',
                    'timestamp[ns]']

    table = pa.Table.from_arrays(
        [date32_arr, date64_arr, ts_us_arr, time32_ms_arr, time64_us_arr,
         time32_s_arr, ts_ns_arr],
        column_names)

    def _expected_with(timestamp_column):
        # Expected table: date64 as date32, time32[s] as time32[ms], and
        # the given array in the 'timestamp[ns]' slot.
        return pa.Table.from_arrays(
            [date32_arr, date32_arr, ts_us_arr, time32_ms_arr,
             time64_us_arr, time32_s_as_ms_arr, timestamp_column],
            column_names)

    # 'timestamp[ns]' to 'timestamp[us]'
    _check_roundtrip(table, expected=_expected_with(ts_ns_as_us_arr),
                     version='2.0')

    # 'timestamp[ns]' is saved as INT96 timestamp
    int96_expected = _expected_with(ts_ns_arr)
    _check_roundtrip(table, expected=int96_expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=int96_expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff: time64[ns] cannot be written.
    time64_ns_arr = pa.array(small_ints.astype('int64'),
                             type=pa.time64('ns'))
    unsupported_table = pa.Table.from_arrays([time64_ns_arr],
                                             ['unsupported'])
    sink = io.BytesIO()
    with pytest.raises(NotImplementedError):
        _write_table(unsupported_table, sink, version="2.0")
示例#31
0
def main():
    """Write ``example.parquet`` with one three-row column per Arrow type.

    The covered types follow the list at
    https://arrow.apache.org/docs/python/api/datatypes.html. Skipped on
    purpose: null, float16, binary, utf8, large_binary, large_string,
    large_utf8, list_, large_list, struct, dictionary, field, schema and
    from_numpy_dtype.

    The file is written relative to the current working directory.
    """
    ints = [1, 2, 3]
    floats = [1.0, 2.0, 3.0]
    # 09:00, 10:00 and 11:00 on 2019-09-03; used for every date/timestamp
    # column.
    moments = [datetime(2019, 9, 3, hour, 0, 0) for hour in (9, 10, 11)]

    # (column name, Arrow type, Python values) -- order defines the schema.
    column_specs = [
        ('c_bool', pa.bool_(), [False, True, False]),

        ('c_int8', pa.int8(), ints),
        ('c_int16', pa.int16(), ints),
        ('c_int32', pa.int32(), ints),
        ('c_int64', pa.int64(), ints),

        ('c_uint8', pa.uint8(), ints),
        ('c_uint16', pa.uint16(), ints),
        ('c_uint32', pa.uint32(), ints),
        ('c_uint64', pa.uint64(), ints),

        ('c_float32', pa.float32(), floats),
        ('c_float64', pa.float64(), floats),

        ('c_time32', pa.time32('ms'), ints),
        ('c_time64', pa.time64('ns'), ints),
        ('c_timestamp', pa.timestamp('ms'), moments),
        ('c_date32', pa.date32(), moments),
        ('c_date64', pa.date64(), moments),

        ('c_string', pa.string(),
         ['*****@*****.**', '*****@*****.**', '*****@*****.**']),

        ('c_decimal128_8_3', pa.decimal128(8, 3), ints),
    ]

    my_schema = pa.schema(
        [(name, arrow_type) for name, arrow_type, _ in column_specs])
    arrays = [pa.array(values, type=arrow_type)
              for _, arrow_type, values in column_specs]

    batch = pa.RecordBatch.from_arrays(arrays, schema=my_schema)
    table = pa.Table.from_batches([batch])
    pq.write_table(table, 'example.parquet')
示例#32
0
文件: __init__.py 项目: tnir/pandas
        pa.uint8(), pa.uint16(),
        pa.uint32(), pa.uint64()
    ]
    # Signed integer Arrow types only. The list previously held pa.uint8()
    # and pa.uint64(), so int8/int64 were never covered and uint8/uint64
    # appeared twice in ALL_INT_PYARROW_DTYPES.
    SIGNED_INT_PYARROW_DTYPES = [
        pa.int8(), pa.int16(),
        pa.int32(), pa.int64()
    ]
    # Every integer Arrow type: unsigned first, then signed.
    ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES

    # Floating-point Arrow types (float16 is not included).
    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    # NOTE(review): pa.utf8() is presumably an alias of pa.string(), which
    # would make these two entries the same type -- confirm.
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    # time32 is paired with "s"/"ms" and time64 with "us"/"ns".
    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    # Every unit combined with every timezone (None means tz-naive):
    # 4 units x 4 timezones = 16 timestamp types.
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz) for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    # One duration type per unit.
    TIMEDELTA_PYARROW_DTYPES = [
        pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
    ]

    BOOL_PYARROW_DTYPES = [pa.bool_()]

    # TODO: Add container like pyarrow types:
    #  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
 ("int8", pa.int8()),
 ("int16", pa.int16()),
 ("int32", pa.int32()),
 ("int64", pa.int64()),
 ("uint8", pa.uint8()),
 ("uint16", pa.uint16()),
 ("uint32", pa.uint32()),
 ("uint64", pa.uint64()),
 ("float16", pa.float16()),
 ("float32", pa.float32()),
 ("float64", pa.float64()),
 ("decimal128(38,1)", pa.decimal128(38, 1)),
 ("decimal128(1,2)", pa.decimal128(1, 2)),
 ("time32(s)", pa.time32("s")),
 ("time32(ms)", pa.time32("ms")),
 ("time64(us)", pa.time64("us")),
 ("time64(ns)", pa.time64("ns")),
 ("timestamp(s)", pa.timestamp("s")),
 ("timestamp(ms)", pa.timestamp("ms")),
 ("timestamp(us)", pa.timestamp("us")),
 ("timestamp(ns)", pa.timestamp("ns")),
 ("date32", pa.date32()),
 ("date64", pa.date64()),
 ("string", pa.string()),
 ("large_string", pa.large_string()),
 ("utf8", pa.utf8()),
 ("large_utf8", pa.large_utf8()),
 ("binary", pa.binary()),
 ("binary(128)", pa.binary(128)),
 ("large_binary", pa.large_binary()),
 ("struct<num:int64>", pa.struct([("num", pa.int64())])),
示例#34
0
def pyarrow_time():
    """Return the Arrow 64-bit time type with microsecond resolution."""
    microsecond_time = pyarrow.time64("us")
    return microsecond_time