Пример #1
0
    def test_max_times(self, duckdb_cursor):
        if not can_run:
            return
        data = pa.array([2147483647000000], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        #Max Sec
        data = pa.array([2147483647], type=pa.time32('s'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table).arrow()
        assert (rel['a'] == result['a'])

        #Max MSec
        data = pa.array([2147483647000], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([2147483647], type=pa.time32('ms'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table).arrow()
        assert (rel['a'] == result['a'])

        #Max NSec
        data = pa.array([9223372036854774], type=pa.time64('us'))
        result = pa.Table.from_arrays([data], ['a'])
        data = pa.array([9223372036854774000], type=pa.time64('ns'))
        arrow_table = pa.Table.from_arrays([data], ['a'])
        rel = duckdb.from_arrow_table(arrow_table).arrow()

        print(rel['a'])
        print(result['a'])
        assert (rel['a'] == result['a'])
    def test_pytime_from_pandas(self):
        pytimes = [time(1, 2, 3, 1356), time(4, 5, 6, 1356)]

        # microseconds
        t1 = pa.time64('us')

        aobjs = np.array(pytimes + [None], dtype=object)
        parr = pa.array(aobjs)
        assert parr.type == t1
        assert parr[0].as_py() == pytimes[0]
        assert parr[1].as_py() == pytimes[1]
        assert parr[2] is pa.NA

        # DataFrame
        df = pd.DataFrame({'times': aobjs})
        batch = pa.RecordBatch.from_pandas(df)
        assert batch[0].equals(parr)

        # Test ndarray of int64 values
        arr = np.array([_pytime_to_micros(v) for v in pytimes], dtype='int64')

        a1 = pa.array(arr, type=pa.time64('us'))
        assert a1[0].as_py() == pytimes[0]

        a2 = pa.array(arr * 1000, type=pa.time64('ns'))
        assert a2[0].as_py() == pytimes[0]

        a3 = pa.array((arr / 1000).astype('i4'), type=pa.time32('ms'))
        assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)

        a4 = pa.array((arr / 1000000).astype('i4'), type=pa.time32('s'))
        assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
Пример #3
0
Файл: jvm.py Проект: rok/arrow
def _from_jvm_time_type(jvm_type):
    """
    Convert a JVM time type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Time

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    if time_unit == 'SECOND':
        assert jvm_type.bitWidth == 32
        return pa.time32('s')
    elif time_unit == 'MILLISECOND':
        assert jvm_type.bitWidth == 32
        return pa.time32('ms')
    elif time_unit == 'MICROSECOND':
        assert jvm_type.bitWidth == 64
        return pa.time64('us')
    elif time_unit == 'NANOSECOND':
        assert jvm_type.bitWidth == 64
        return pa.time64('ns')
Пример #4
0
def _from_jvm_time_type(jvm_type):
    """
    Convert a JVM time type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Time

    Returns
    -------
    typ: pyarrow.DataType
    """
    time_unit = jvm_type.getUnit().toString()
    if time_unit == 'SECOND':
        assert jvm_type.getBitWidth() == 32
        return pa.time32('s')
    elif time_unit == 'MILLISECOND':
        assert jvm_type.getBitWidth() == 32
        return pa.time32('ms')
    elif time_unit == 'MICROSECOND':
        assert jvm_type.getBitWidth() == 64
        return pa.time64('us')
    elif time_unit == 'NANOSECOND':
        assert jvm_type.getBitWidth() == 64
        return pa.time64('ns')
Пример #5
0
 def test_time32_python(self):
     """
     Python -> Rust -> Python
     """
     a = pyarrow.array([None, 1, 2], pyarrow.time32('s'))
     b = arrow_pyarrow_integration_testing.concatenate(a)
     expected = pyarrow.array([None, 1, 2] + [None, 1, 2], pyarrow.time32('s'))
     self.assertEqual(b, expected)
Пример #6
0
def test_table(n, types=None, offset=None, length=None, nullable=True):
    if types is None:
        types = [
            pyarrow.null(),
            pyarrow.bool_(),
            pyarrow.int8(),
            pyarrow.int16(),
            pyarrow.int32(),
            pyarrow.int64(),
            pyarrow.uint8(),
            pyarrow.uint16(),
            pyarrow.uint32(),
            pyarrow.uint64(),
            pyarrow.float16(),
            pyarrow.float32(),
            pyarrow.float64(),
            pyarrow.date32(),
            pyarrow.date64(),
            pyarrow.timestamp('s'),
            pyarrow.timestamp('ms'),
            pyarrow.timestamp('us'),
            pyarrow.timestamp('ns'),
            pyarrow.time32('s'),
            pyarrow.time32('ms'),
            pyarrow.time64('us'),
            pyarrow.time64('ns'),
            pyarrow.string(),
            pyarrow.binary(),
            pyarrow.binary(4),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), True),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), True),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), False),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), False),
            pyarrow.list_(pyarrow.int32()),
            pyarrow.struct([pyarrow.field('int32', pyarrow.int32())]),
            pyarrow.list_(
                pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]

    data = list()

    for t in types:
        name = str(t)
        array = TestArrayGenerator(n, t, False).array
        if offset is not None:
            array = array.slice(offset, length)
        data.append(pyarrow.column(name, array))

        if nullable:
            name = str(t) + ' (null)'
            array = TestArrayGenerator(n, t, True).array
            if offset is not None:
                array = array.slice(offset, length)
            data.append(pyarrow.column(name, array))

    return pyarrow.Table.from_arrays(data)
Пример #7
0
def test_date_time_types(tmpdir):
    buf = io.BytesIO()

    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]', 'time32[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]', 'time32[s]'])

    pq.write_table(table, buf, version="2.0")
    buf.seek(0)

    result = pq.read_table(buf)
    assert result.equals(expected)

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            pq.write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Пример #8
0
def test_time32_units():
    for valid_unit in ('s', 'ms'):
        ty = pa.time32(valid_unit)
        assert ty.unit == valid_unit

    for invalid_unit in ('m', 'us', 'ns'):
        error_msg = 'Invalid time unit for time32: {!r}'.format(invalid_unit)
        with pytest.raises(ValueError, match=error_msg):
            pa.time32(invalid_unit)
Пример #9
0
def test_time32_units():
    for valid_unit in ('s', 'ms'):
        ty = pa.time32(valid_unit)
        assert ty.unit == valid_unit

    for invalid_unit in ('m', 'us', 'ns'):
        error_msg = 'Invalid TimeUnit for time32: {}'.format(invalid_unit)
        with pytest.raises(ValueError, match=error_msg):
            pa.time32(invalid_unit)
Пример #10
0
def test_time():
    t1 = datetime.time(18, 0)
    t2 = datetime.time(21, 0)

    types = [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')]
    for ty in types:
        for t in [t1, t2]:
            s = pa.scalar(t, type=ty)
            assert s.as_py() == t
Пример #11
0
 def test_time_null(self, duckdb_cursor):
     if not can_run:
         return   
     data = (pa.array([None], type=pa.time32('s')),pa.array([None], type=pa.time32('ms')),pa.array([None], pa.time64('us')),pa.array([None], pa.time64('ns')))
     arrow_table = pa.Table.from_arrays([data[0],data[1],data[2],data[3]],['a','b','c','d'])
     rel = duckdb.from_arrow(arrow_table).arrow()
     assert (rel['a'] == arrow_table['c'])
     assert (rel['b'] == arrow_table['c'])
     assert (rel['c'] == arrow_table['c'])
     assert (rel['d'] == arrow_table['c'])
Пример #12
0
def test_time32_python():
    """
    Python -> Rust -> Python
    """
    a = pa.array([None, 1, 2], pa.time32("s"))
    b = rust.concatenate(a)
    expected = pa.array([None, 1, 2] + [None, 1, 2], pa.time32("s"))
    assert b == expected
    del a
    del b
    del expected
Пример #13
0
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.Array.from_pandas(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.Array.from_pandas(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.Array.from_pandas(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.Array.from_pandas(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.Array.from_pandas(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]'])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
    def test_arrow_time_to_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356),
                   time(0, 0, 0)]

        expected = np.array(pytimes[:2] + [None])
        expected_ms = np.array([x.replace(microsecond=1000)
                                for x in pytimes[:2]] +
                               [None])
        expected_s = np.array([x.replace(microsecond=0)
                               for x in pytimes[:2]] +
                              [None])

        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        null_mask = np.array([False, False, True], dtype=bool)

        a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
        a2 = pa.array(arr * 1000, mask=null_mask,
                      type=pa.time64('ns'))

        a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
                      type=pa.time32('ms'))
        a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
                      type=pa.time32('s'))

        names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
        batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
        arr = a1.to_pandas()
        assert (arr == expected).all()

        arr = a2.to_pandas()
        assert (arr == expected).all()

        arr = a3.to_pandas()
        assert (arr == expected_ms).all()

        arr = a4.to_pandas()
        assert (arr == expected_s).all()

        df = batch.to_pandas()
        expected_df = pd.DataFrame({'time64[us]': expected,
                                    'time64[ns]': expected,
                                    'time32[ms]': expected_ms,
                                    'time32[s]': expected_s},
                                   columns=names)

        tm.assert_frame_equal(df, expected_df)
Пример #15
0
    def test_arrow_time_to_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356),
                   time(0, 0, 0)]

        expected = np.array(pytimes[:2] + [None])
        expected_ms = np.array([x.replace(microsecond=1000)
                                for x in pytimes[:2]] +
                               [None])
        expected_s = np.array([x.replace(microsecond=0)
                               for x in pytimes[:2]] +
                              [None])

        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        null_mask = np.array([False, False, True], dtype=bool)

        a1 = pa.Array.from_pandas(arr, mask=null_mask, type=pa.time64('us'))
        a2 = pa.Array.from_pandas(arr * 1000, mask=null_mask,
                                  type=pa.time64('ns'))

        a3 = pa.Array.from_pandas((arr / 1000).astype('i4'), mask=null_mask,
                                  type=pa.time32('ms'))
        a4 = pa.Array.from_pandas((arr / 1000000).astype('i4'), mask=null_mask,
                                  type=pa.time32('s'))

        names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
        batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
        arr = a1.to_pandas()
        assert (arr == expected).all()

        arr = a2.to_pandas()
        assert (arr == expected).all()

        arr = a3.to_pandas()
        assert (arr == expected_ms).all()

        arr = a4.to_pandas()
        assert (arr == expected_s).all()

        df = batch.to_pandas()
        expected_df = pd.DataFrame({'time64[us]': expected,
                                    'time64[ns]': expected,
                                    'time32[ms]': expected_ms,
                                    'time32[s]': expected_s},
                                   columns=names)

        tm.assert_frame_equal(df, expected_df)
Пример #16
0
 def test_time32_python(self):
     """
     Python -> Rust -> Python
     """
     old_allocated = pyarrow.total_allocated_bytes()
     a = pyarrow.array([None, 1, 2], pyarrow.time32('s'))
     b = arrow_pyarrow_integration_testing.concatenate(a)
     expected = pyarrow.array([None, 1, 2] + [None, 1, 2], pyarrow.time32('s'))
     self.assertEqual(b, expected)
     del a
     del b
     del expected
     # No leak of C++ memory
     self.assertEqual(old_allocated, pyarrow.total_allocated_bytes())
def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
def get_many_types():
    # returning them from a function is required because of pa.dictionary
    # type holds a pyarrow array and test_array.py::test_toal_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
            pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
            pa.timestamp('us', tz='Europe/Paris'), pa.float16(), pa.float32(),
            pa.float64(), pa.decimal128(19, 4), pa.string(), pa.binary(),
            pa.binary(10), pa.list_(pa.int32()),
            pa.struct([
                pa.field('a', pa.int32()),
                pa.field('b', pa.int8()),
                pa.field('c', pa.string())
            ]),
            pa.struct([
                pa.field('a', pa.int32(), nullable=False),
                pa.field('b', pa.int8(), nullable=False),
                pa.field('c', pa.string())
            ]),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_DENSE),
            pa.union(
                [pa.field('a', pa.binary(10)),
                 pa.field('b', pa.string())],
                mode=pa.lib.UnionMode_SPARSE),
            pa.union([
                pa.field('a', pa.binary(10), nullable=False),
                pa.field('b', pa.string())
            ],
                     mode=pa.lib.UnionMode_SPARSE),
            pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
Пример #19
0
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Пример #20
0
def test_type_ids():
    # Having this fixed is very important because internally we rely on this id
    # to parse from python
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id
Пример #21
0
def test_statistics_convert_logical_types(tempdir):
    # ARROW-5166, ARROW-4139

    # (min, max, type)
    cases = [
        (10, 11164359321221007157, pa.uint64()), (10, 4294967295, pa.uint32()),
        ("ähnlich", "öffentlich", pa.utf8()),
        (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0,
                                                       1000), pa.time32('ms')),
        (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0,
                                                       1000), pa.time64('us')),
        (datetime.datetime(2019, 6, 24, 0, 0, 0,
                           1000), datetime.datetime(2019, 6, 25, 0, 0, 0,
                                                    1000), pa.timestamp('ms')),
        (datetime.datetime(2019, 6, 24, 0, 0, 0,
                           1000), datetime.datetime(2019, 6, 25, 0, 0, 0,
                                                    1000), pa.timestamp('us'))
    ]

    for i, (min_val, max_val, typ) in enumerate(cases):
        t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
                                 ['col'])
        path = str(tempdir / ('example{}.parquet'.format(i)))
        pq.write_table(t, path, version='2.0')
        pf = pq.ParquetFile(path)
        stats = pf.metadata.row_group(0).column(0).statistics
        assert stats.min == min_val
        assert stats.max == max_val
Пример #22
0
def test_cast_time32_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int32'),
                   type=pa.time32('s'))
    expected = pa.array([0, 1, 2], type='i4')

    result = arr.cast('i4')
    assert result.equals(expected)
Пример #23
0
def test_cast_time32_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int32'),
                   type=pa.time32('s'))
    expected = pa.array([0, 1, 2], type='i4')

    result = arr.cast('i4')
    assert result.equals(expected)
Пример #24
0
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]
    duration_types = [pa.duration('ms')]

    for case in date_types + time_types + timestamp_types + duration_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)
        assert not types.is_duration(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)
        assert not types.is_duration(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_duration(case)

    for case in duration_types:
        assert types.is_duration(case)
        assert not types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    assert not types.is_temporal(pa.int32())
Пример #25
0
    def test_load_table_columnar_arrow_all(self, con):

        c = con.cursor()
        c.execute('drop table if exists all_types;')
        create = textwrap.dedent('''\
        create table all_types (
            boolean_ BOOLEAN,
            smallint_ SMALLINT,
            int_ INT,
            bigint_ BIGINT,
            float_ FLOAT,
            double_ DOUBLE,
            varchar_ VARCHAR(40),
            text_ TEXT,
            time_ TIME,
            timestamp_ TIMESTAMP,
            date_ DATE
        );''')
        # skipping decimal for now
        c.execute(create)

        names = [
            'boolean_',
            'smallint_',
            'int_',
            'bigint_',
            'float_',
            'double_',
            'varchar_',
            'text_',
            'time_',
            'timestamp_',
            'date_',
        ]

        columns = [
            pa.array([True, False, None], type=pa.bool_()),
            pa.array([1, 0, None]).cast(pa.int16()),
            pa.array([1, 0, None]).cast(pa.int32()),
            pa.array([1, 0, None]),
            pa.array([1.0, 1.1, None]).cast(pa.float32()),
            pa.array([1.0, 1.1, None]),
            # no fixed-width string
            pa.array(['a', 'b', None]),
            pa.array(['a', 'b', None]),
            (pa.array([1, 2, None]).cast(pa.int32()).cast(pa.time32('s'))),
            pa.array([
                datetime.datetime(2016, 1, 1, 12, 12, 12),
                datetime.datetime(2017, 1, 1),
                None,
            ]),
            pa.array(
                [datetime.date(2016, 1, 1),
                 datetime.date(2017, 1, 1), None]),
        ]
        table = pa.Table.from_arrays(columns, names=names)
        con.load_table_arrow("all_types", table)
        c.execute('drop table if exists all_types;')
Пример #26
0
def test_from_time_arrow() -> None:
    times = pa.array([10, 20, 30], type=pa.time32("s"))
    times_table = pa.table([times], names=["times"])

    assert pl.from_arrow(times_table).to_series().to_list() == [
        time(0, 0, 10),
        time(0, 0, 20),
        time(0, 0, 30),
    ]
Пример #27
0
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
        ('duration[s]', pa.duration('s')),
        ('duration[ms]', pa.duration('ms')),
        ('duration[us]', pa.duration('us')),
        ('duration[ns]', pa.duration('ns')),
        ('month_day_nano_interval', pa.month_day_nano_interval()),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
def test_time_types():
    t1 = pa.time32('s')
    t2 = pa.time32('ms')
    t3 = pa.time64('us')
    t4 = pa.time64('ns')

    assert t1.unit == 's'
    assert t2.unit == 'ms'
    assert t3.unit == 'us'
    assert t4.unit == 'ns'

    assert str(t1) == 'time32[s]'
    assert str(t4) == 'time64[ns]'

    with pytest.raises(ValueError):
        pa.time32('us')

    with pytest.raises(ValueError):
        pa.time64('s')
Пример #29
0
 def _to_arrow_type(field):
     if field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.TINYINT:
         return pa.field(field.name, pa.int8(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.SMALLINT:
         return pa.field(field.name, pa.int16(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.INT:
         return pa.field(field.name, pa.int32(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.BIGINT:
         return pa.field(field.name, pa.int64(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.BOOLEAN:
         return pa.field(field.name, pa.bool_(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.FLOAT:
         return pa.field(field.name, pa.float32(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DOUBLE:
         return pa.field(field.name, pa.float64(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.VARCHAR:
         return pa.field(field.name, pa.utf8(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.VARBINARY:
         return pa.field(field.name, pa.binary(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DECIMAL:
         return pa.field(
             field.name,
             pa.decimal128(field.type.decimal_info.precision,
                           field.type.decimal_info.scale),
             field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.DATE:
         return pa.field(field.name, pa.date32(), field.type.nullable)
     elif field.type.type_name == flink_fn_execution_pb2.Schema.TypeName.TIME:
         if field.type.time_info.precision == 0:
             return pa.field(field.name, pa.time32('s'),
                             field.type.nullable)
         elif 1 <= field.type.time_type.precision <= 3:
             return pa.field(field.name, pa.time32('ms'),
                             field.type.nullable)
         elif 4 <= field.type.time_type.precision <= 6:
             return pa.field(field.name, pa.time64('us'),
                             field.type.nullable)
         else:
             return pa.field(field.name, pa.time64('ns'),
                             field.type.nullable)
     else:
         raise ValueError("field_type %s is not supported." %
                          field.type)
Пример #30
0
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]

    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
Пример #31
0
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.union([
            pa.field('a', pa.int8()),
            pa.field('b', pa.int16())
        ], pa.lib.UnionMode_SPARSE),
        pa.union([
            pa.field('a', pa.int8()),
            pa.field('b', pa.int16())
        ], pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8()))
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
    def test_pytime_from_pandas(self):
        pytimes = [time(1, 2, 3, 1356),
                   time(4, 5, 6, 1356)]

        # microseconds
        t1 = pa.time64('us')

        aobjs = np.array(pytimes + [None], dtype=object)
        parr = pa.array(aobjs)
        assert parr.type == t1
        assert parr[0].as_py() == pytimes[0]
        assert parr[1].as_py() == pytimes[1]
        assert parr[2] is pa.NA

        # DataFrame
        df = pd.DataFrame({'times': aobjs})
        batch = pa.RecordBatch.from_pandas(df)
        assert batch[0].equals(parr)

        # Test ndarray of int64 values
        arr = np.array([_pytime_to_micros(v) for v in pytimes],
                       dtype='int64')

        a1 = pa.array(arr, type=pa.time64('us'))
        assert a1[0].as_py() == pytimes[0]

        a2 = pa.array(arr * 1000, type=pa.time64('ns'))
        assert a2[0].as_py() == pytimes[0]

        a3 = pa.array((arr / 1000).astype('i4'),
                      type=pa.time32('ms'))
        assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)

        a4 = pa.array((arr / 1000000).astype('i4'),
                      type=pa.time32('s'))
        assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
Пример #33
0
    def test_string_to_arrow_bijection_for_primitive_types(self):
        supported_pyarrow_datatypes = [
            pa.time32("s"),
            pa.time64("us"),
            pa.timestamp("s"),
            pa.timestamp("ns", tz="America/New_York"),
            pa.date32(),
            pa.date64(),
            pa.duration("s"),
            pa.decimal128(10, 2),
            pa.decimal256(40, -3),
            pa.string(),
            pa.int32(),
            pa.float64(),
        ]
        for dt in supported_pyarrow_datatypes:
            self.assertEqual(dt, string_to_arrow(_arrow_to_datasets_dtype(dt)))

        unsupported_pyarrow_datatypes = [pa.list_(pa.float64())]
        for dt in unsupported_pyarrow_datatypes:
            with self.assertRaises(ValueError):
                string_to_arrow(_arrow_to_datasets_dtype(dt))

        supported_datasets_dtypes = [
            "time32[s]",
            "timestamp[ns]",
            "timestamp[ns, tz=+07:30]",
            "duration[us]",
            "decimal128(30, -4)",
            "int32",
            "float64",
        ]
        for sdt in supported_datasets_dtypes:
            self.assertEqual(sdt,
                             _arrow_to_datasets_dtype(string_to_arrow(sdt)))

        unsupported_datasets_dtypes = [
            "time32[ns]",
            "timestamp[blob]",
            "timestamp[[ns]]",
            "timestamp[ns, tz=[ns]]",
            "duration[[us]]",
            "decimal20(30, -4)",
            "int",
        ]
        for sdt in unsupported_datasets_dtypes:
            with self.assertRaises(ValueError):
                string_to_arrow(sdt)
Пример #34
0
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
Пример #35
0
def test_types_hashable():
    types = [
        pa.null(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i
Пример #36
0
def get_many_types():
    # returning them from a function is required because of pa.dictionary
    # type holds a pyarrow array and test_array.py::test_toal_bytes_allocated
    # checks that the default memory pool has zero allocated bytes
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
Пример #37
0
def test_is_temporal_date_time_timestamp():
    date_types = [pa.date32(), pa.date64()]
    time_types = [pa.time32('s'), pa.time64('ns')]
    timestamp_types = [pa.timestamp('ms')]

    for case in date_types + time_types + timestamp_types:
        assert types.is_temporal(case)

    for case in date_types:
        assert types.is_date(case)
        assert not types.is_time(case)
        assert not types.is_timestamp(case)

    for case in time_types:
        assert types.is_time(case)
        assert not types.is_date(case)
        assert not types.is_timestamp(case)

    for case in timestamp_types:
        assert types.is_timestamp(case)
        assert not types.is_date(case)
        assert not types.is_time(case)

    assert not types.is_temporal(pa.int32())
Пример #38
0
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
    assert arr.equals(expected)


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
Пример #39
0
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([
    pa.date32(),
    pa.date64()
])
time_types = st.sampled_from([
    pa.time32('s'),
    pa.time32('ms'),
    pa.time64('us'),
    pa.time64('ns')
])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones()
)
temporal_types = st.one_of(date_types, time_types, timestamp_types)

primitive_types = st.one_of(
    null_type,
    bool_type,
    binary_type,
Пример #40
0
 "INT64",
 pyarrow.uint8().id:
 "INT64",
 pyarrow.uint16().id:
 "INT64",
 pyarrow.uint32().id:
 "INT64",
 pyarrow.uint64().id:
 "INT64",
 pyarrow.float16().id:
 "FLOAT64",
 pyarrow.float32().id:
 "FLOAT64",
 pyarrow.float64().id:
 "FLOAT64",
 pyarrow.time32("ms").id:
 "TIME",
 pyarrow.time64("ns").id:
 "TIME",
 pyarrow.timestamp("ns").id:
 "TIMESTAMP",
 pyarrow.date32().id:
 "DATE",
 pyarrow.date64().id:
 "DATETIME",  # because millisecond resolution
 pyarrow.binary().id:
 "BYTES",
 pyarrow.string().id:
 "STRING",  # also alias for pyarrow.utf8()
 # The exact scale and precision don't matter, see below.
 pyarrow.decimal128(38, scale=9).id:
Пример #41
0
def dataframe_with_lists(include_index=False, parquet_compatible=False):
    """
    Dataframe with list columns of every possible primtive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    parquet_compatible: bool
        Exclude types not supported by parquet
    """
    arrays = OrderedDict()
    fields = []

    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [],
        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
                 dtype=np.int64)[::2]
    ]
    fields.append(pa.field('double', pa.list_(pa.float64())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [],
        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
    ]
    fields.append(pa.field('bytes_list', pa.list_(pa.binary())))
    arrays['bytes_list'] = [
        [b"1", b"f"],
        None,
        [b"1"],
        [b"1", b"2", b"3"],
        [],
    ]
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"],
        [],
    ]

    date_data = [
        [],
        [date(2018, 1, 1), date(2032, 12, 30)],
        [date(2000, 6, 7)],
        None,
        [date(1969, 6, 9), date(1972, 7, 3)]
    ]
    time_data = [
        [time(23, 11, 11), time(1, 2, 3), time(23, 59, 59)],
        [],
        [time(22, 5, 59)],
        None,
        [time(0, 0, 0), time(18, 0, 2), time(12, 7, 3)]
    ]

    temporal_pairs = [
        (pa.date32(), date_data),
        (pa.date64(), date_data),
        (pa.time32('s'), time_data),
        (pa.time32('ms'), time_data),
        (pa.time64('us'), time_data)
    ]
    if not parquet_compatible:
        temporal_pairs += [
            (pa.time64('ns'), time_data),
        ]

    for value_type, data in temporal_pairs:
        field_name = '{}_list'.format(value_type)
        field_type = pa.list_(value_type)
        field = pa.field(field_name, field_type)
        fields.append(field)
        arrays[field_name] = data

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
Пример #42
0
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000],
                     dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                 ['date32', 'date64', 'timestamp[us]',
                                  'time32[s]', 'time64[us]',
                                  'time32_from64[s]',
                                  'timestamp[ns]'])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
                                    ['date32', 'date64', 'timestamp[us]',
                                     'time32[s]', 'time64[us]',
                                     'time32_from64[s]',
                                     'timestamp[ns]'])

    _check_roundtrip(table, expected=expected, version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0',
                     flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Пример #43
0
    assert len(in_dict) == len(fields)
    for i, field in enumerate(fields):
        assert in_dict[field] == i


def test_fields_weakrefable():
    field = pa.field('a', pa.int32())
    wr = weakref.ref(field)
    assert wr() is not None
    del field
    assert wr() is None


@pytest.mark.parametrize('t,check_func', [(pa.date32(), types.is_date32),
                                          (pa.date64(), types.is_date64),
                                          (pa.time32('s'), types.is_time32),
                                          (pa.time64('ns'), types.is_time64),
                                          (pa.int8(), types.is_int8),
                                          (pa.int16(), types.is_int16),
                                          (pa.int32(), types.is_int32),
                                          (pa.int64(), types.is_int64),
                                          (pa.uint8(), types.is_uint8),
                                          (pa.uint16(), types.is_uint16),
                                          (pa.uint32(), types.is_uint32),
                                          (pa.uint64(), types.is_uint64),
                                          (pa.float16(), types.is_float16),
                                          (pa.float32(), types.is_float32),
                                          (pa.float64(), types.is_float64)])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
Пример #44
0
    ("string", None, pa.StringScalar, pa.StringValue),
    (b"bytes", None, pa.BinaryScalar, pa.BinaryValue),
    ("largestring", pa.large_string(), pa.LargeStringScalar,
     pa.LargeStringValue),
    (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar,
     pa.LargeBinaryValue),
    (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue),
    ([1, 2, 3], None, pa.ListScalar, pa.ListValue),
    ([1, 2, 3, 4], pa.large_list(
        pa.int8()), pa.LargeListScalar, pa.LargeListValue),
    ([1, 2, 3, 4, 5], pa.list_(
        pa.int8(), 5), pa.FixedSizeListScalar, pa.FixedSizeListValue),
    (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value),
    (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value),
    (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue),
    (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'),
     pa.Time32Scalar, pa.Time32Value),
    (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value),
    (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue),
    ({
        'a': 1,
        'b': [1, 2]
    }, None, pa.StructScalar, pa.StructValue),
    ([('a', 1),
      ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, pa.MapValue),
])
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
Пример #45
0
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
        ',"timezone":"UTC"}'),
    (pa.timestamp('ns', tz='Europe/Paris'), '{"name":"timestamp",'
        '"unit":"NANOSECOND","timezone":"Europe/Paris"}'),
Пример #46
0
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)