Пример #1
0
def test_duration():
    """Check that ``pa.duration`` keeps valid units and rejects invalid ones."""
    accepted_units = ('s', 'ms', 'us', 'ns')
    rejected_units = ('m', 'arbit', 'rary')

    # A supported unit must be readable back from the type's ``unit`` attribute.
    for time_unit in accepted_units:
        assert pa.duration(time_unit).unit == time_unit

    # Any other string has to raise a ValueError carrying the expected message.
    for bad_unit in rejected_units:
        with pytest.raises(ValueError, match='Invalid time unit'):
            pa.duration(bad_unit)
Пример #2
0
 def test_duration_null(self, duckdb_cursor):
     """Round-trip all-null duration columns of every unit through DuckDB.

     Columns 'a'..'d' hold a single null typed as ns, us, ms and s
     respectively. Every column coming back from DuckDB is compared with
     the original millisecond column ('c').
     NOTE(review): presumably DuckDB returns all durations in one unit,
     which is why 'c' is the common reference - confirm.
     """
     if not can_run:
         return
     column_names = ['a', 'b', 'c', 'd']
     null_columns = [
         pa.array([None], type=pa.duration(time_unit))
         for time_unit in ('ns', 'us', 'ms', 's')
     ]
     arrow_table = pa.Table.from_arrays(null_columns, column_names)
     rel = duckdb.from_arrow_table(arrow_table).arrow()
     reference = arrow_table['c']
     for column_name in column_names:
         assert (rel[column_name] == reference)
Пример #3
0
def test_duration_nanos_pandas():
    """Nanosecond durations convert to ``pd.Timedelta`` when pandas is present."""
    import pandas as pd

    one_hour = pd.Timedelta('1 hour')
    durations = pa.array([0, 3600000000000], type=pa.duration('ns'))
    hour_scalar = durations[1]
    converted = hour_scalar.as_py()
    assert isinstance(converted, pd.Timedelta)
    assert converted == one_hour
    assert hour_scalar.value == one_hour.value

    # A value that is not a whole number of microseconds converts as well.
    odd_nanos = 946684800000000001
    durations = pa.array([odd_nanos], type=pa.duration('ns'))
    assert durations[0].as_py() == pd.Timedelta(odd_nanos, unit='ns')
Пример #4
0
def test_duration_nanos_nopandas():
    """Nanosecond durations fall back to ``datetime.timedelta`` objects."""
    one_hour = datetime.timedelta(seconds=60 * 60)
    durations = pa.array([0, 3600000000000], pa.duration('ns'))
    hour_scalar = durations[1]
    converted = hour_scalar.as_py()
    assert isinstance(converted, datetime.timedelta)
    assert converted == one_hour
    assert hour_scalar.value == one_hour.total_seconds() * 1e9

    # A value that is not a whole number of microseconds must raise ValueError.
    durations = pa.array([946684800000000001], type=pa.duration('ns'))
    with pytest.raises(ValueError):
        durations[0].as_py()
Пример #5
0
def test_sequence_duration_nested_lists():
    """Nested lists of timedeltas convert with inferred and explicit types."""
    first = datetime.timedelta(1, 1, 1000)
    second = datetime.timedelta(1, 100)
    nested = [[first, None], [first, second]]

    # Without a type hint the element type is inferred as microseconds.
    inferred = pa.array(nested)
    assert len(inferred) == 2
    assert inferred.type == pa.list_(pa.duration('us'))
    assert inferred.to_pylist() == nested

    # With an explicit millisecond list type the values still round-trip.
    typed = pa.array(nested, type=pa.list_(pa.duration('ms')))
    assert len(typed) == 2
    assert typed.type == pa.list_(pa.duration('ms'))
    assert typed.to_pylist() == nested
Пример #6
0
 def decode(encoding, type_spec):
     """Rebuild a ``DataType`` from its serialized specification.

     ``type_spec`` is either a dict with a ``'type'`` key ('duration',
     'timestamp', 'list' or 'dict') or a plain value such as a type name.
     Nested value/index types are decoded through ``encoding.decode``.
     A dict with any other ``'type'`` raises ValueError.
     """
     if not isinstance(type_spec, dict):
         if type_spec == 'string':
             arrow_type = pa.string()
         elif type_spec == 'large_string':
             arrow_type = pa.large_string()
         # TODO: find a proper way to support all arrow types
         elif type_spec == 'timestamp[ms]':
             arrow_type = pa.timestamp('ms')
         else:
             # Everything else is handed to numpy as a dtype specification.
             return DataType(np.dtype(type_spec))
         return DataType(arrow_type)

     kind = type_spec['type']
     if kind == 'duration':
         arrow_type = pa.duration(type_spec['unit'])
     elif kind == 'timestamp':
         arrow_type = pa.timestamp(type_spec['unit'])
     elif kind == 'list':
         element_type = encoding.decode('dtype', type_spec['value_type']).arrow
         arrow_type = pa.list_(element_type)
     elif kind == 'dict':
         values = encoding.decode('dtype', type_spec["value_type"]).arrow
         indices = encoding.decode('dtype', type_spec["index_type"]).arrow
         arrow_type = pa.dictionary(indices, values, type_spec["ordered"])
     else:
         raise ValueError(f'Do not understand type {type_spec}')
     return DataType(arrow_type)
Пример #7
0
def to_arrow_type(dt: DataType) -> "pa.DataType":
    """Convert a Spark data type to the corresponding pyarrow type.

    The match is made on the exact class of ``dt`` (``type(dt) == ...``), so
    subclasses of the listed Spark types are not matched.

    :param dt: the Spark ``DataType`` instance to convert.
    :return: the equivalent pyarrow data type.
    :raises TypeError: if ``dt`` has no Arrow counterpart here, if an array or
        map contains struct/timestamp elements, if a struct contains a nested
        struct, or if a map is requested with pyarrow older than 2.0.0.
    """
    import pyarrow as pa

    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp("us", tz="UTC")
    elif type(dt) == TimestampNTZType:
        arrow_type = pa.timestamp("us", tz=None)
    elif type(dt) == DayTimeIntervalType:
        arrow_type = pa.duration("us")
    elif type(dt) == ArrayType:
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    elif type(dt) == MapType:
        # The threshold is 2.0.0, so comparing the major version is enough.
        # This avoids distutils.version.LooseVersion: distutils is deprecated
        # (PEP 632) and no longer ships with Python 3.12+.
        if int(pa.__version__.split(".")[0]) < 2:
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or type(dt.valueType) in [
            StructType,
            TimestampType,
        ]:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.map_(to_arrow_type(dt.keyType), to_arrow_type(dt.valueType))
    elif type(dt) == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError("Nested StructType not supported in conversion to Arrow")
        fields = [
            pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable)
            for field in dt
        ]
        arrow_type = pa.struct(fields)
    elif type(dt) == NullType:
        arrow_type = pa.null()
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
Пример #8
0
def get_many_types():
    """Return a tuple with one instance of many different pyarrow types."""
    # The types are built inside a function on purpose: a pa.dictionary type
    # holds a pyarrow array, and the total-bytes-allocated test in
    # test_array.py checks that the default memory pool has zero allocated
    # bytes.
    flat_types = [
        pa.null(), pa.bool_(), pa.int32(), pa.time32('s'), pa.time64('us'),
        pa.date32(), pa.timestamp('us'), pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'), pa.duration('s'),
        pa.float16(), pa.float32(), pa.float64(), pa.decimal128(19, 4),
        pa.string(), pa.binary(), pa.binary(10), pa.large_string(),
        pa.large_binary(),
    ]

    list_types = [pa.list_(pa.int32()), pa.large_list(pa.uint16())]

    struct_types = [
        pa.struct([
            pa.field('a', pa.int32()),
            pa.field('b', pa.int8()),
            pa.field('c', pa.string()),
        ]),
        pa.struct([
            pa.field('a', pa.int32(), nullable=False),
            pa.field('b', pa.int8(), nullable=False),
            pa.field('c', pa.string()),
        ]),
    ]

    union_types = [
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_DENSE),
        pa.union(
            [pa.field('a', pa.binary(10)), pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
        pa.union(
            [pa.field('a', pa.binary(10), nullable=False),
             pa.field('b', pa.string())],
            mode=pa.lib.UnionMode_SPARSE),
    ]

    dictionary_types = [pa.dictionary(pa.int32(), pa.string())]

    return tuple(
        flat_types + list_types + struct_types + union_types
        + dictionary_types
    )
Пример #9
0
def test_is_temporal_date_time_timestamp():
    """Each temporal family passes its own predicate and fails the others."""
    families = [
        (types.is_date, [pa.date32(), pa.date64()]),
        (types.is_time, [pa.time32('s'), pa.time64('ns')]),
        (types.is_timestamp, [pa.timestamp('ms')]),
        (types.is_duration, [pa.duration('ms')]),
    ]
    predicates = [predicate for predicate, _ in families]

    for own_predicate, members in families:
        for member in members:
            # Every member is temporal and belongs to exactly one family.
            assert types.is_temporal(member)
            assert own_predicate(member)
            for other_predicate in predicates:
                if other_predicate is not own_predicate:
                    assert not other_predicate(member)

    assert not types.is_temporal(pa.int32())
Пример #10
0
def test_sequence_duration_from_int_with_unit(unit):
    """A plain integer is stored unchanged as the raw value of a duration."""
    duration_type = pa.duration(unit)
    converted = pa.array([5], type=duration_type)

    assert len(converted) == 1
    assert converted.type == duration_type
    assert converted[0].value == 5
Пример #11
0
def data_timedelta(f):
    """Build a three-element duration array of unit ``f``.

    The values are 100 days, 1 day and 1 second; the mask flags the middle
    element.
    """
    deltas = [
        datetime.timedelta(days=100),
        datetime.timedelta(days=1),
        datetime.timedelta(seconds=1),
    ]
    duration_type = pa.duration(f)
    masked = np.array([False, True, False])
    return pa.array(deltas, type=duration_type, mask=masked)
Пример #12
0
def np_to_pa_dtype(dtype):
    """Translate a numpy dtype into the matching PyArrow data type.

    datetime64 and timedelta64 dtypes are mapped by their time unit; every
    other dtype is looked up in ``_np_pa_dtypes``.
    """
    arrow_units = ("s", "ms", "us", "ns")

    if dtype.kind not in ("M", "m"):
        return _np_pa_dtypes[cudf.dtype(dtype).type]

    resolution, _ = np.datetime_data(dtype)
    if dtype.kind == "M":
        # datetime64: a timestamp of the same unit when Arrow has one,
        # otherwise date64 (int64_t UNIX ms).
        if resolution in arrow_units:
            return pa.timestamp(resolution)
        return pa.date64()

    # timedelta64: a duration of the same unit, falling back to ns.
    return pa.duration(resolution if resolution in arrow_units else "ns")
Пример #13
0
def test_duration_type():
    """Round-trip a table with one duration column per unit (ARROW-6780)."""
    units = ["s", "ms", "us", "ns"]
    columns = [
        pa.array([0, 1, 2, 3], type=pa.duration(time_unit))
        for time_unit in units
    ]
    # Column names follow the pattern "d[<unit>]".
    names = [f"d[{time_unit}]" for time_unit in units]

    _check_roundtrip(pa.Table.from_arrays(columns, names))
Пример #14
0
def test_type_for_alias():
    """Every known string alias resolves to the expected pyarrow type."""
    alias_to_type = {
        # signed integers
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        # unsigned integers
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        # floating point
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        # dates, strings and binary
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        # unit-parameterised temporal types
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
        'duration[s]': pa.duration('s'),
        'duration[ms]': pa.duration('ms'),
        'duration[us]': pa.duration('us'),
        'duration[ns]': pa.duration('ns'),
        'month_day_nano_interval': pa.month_day_nano_interval(),
    }

    for alias, expected_type in alias_to_type.items():
        assert pa.type_for_alias(alias) == expected_type
Пример #15
0
def test_sequence_duration_nested_lists_numpy():
    """Nested numpy timedelta64 inputs convert to list<duration[us]>."""
    td1 = datetime.timedelta(1, 1, 1000)
    td2 = datetime.timedelta(1, 100)
    expected_type = pa.list_(pa.duration('us'))
    expected_values = [[td1, None], [td1, td2]]

    # First as Python lists of numpy scalars, then as numpy arrays.
    as_scalar_lists = [
        [np.timedelta64(td1), None],
        [np.timedelta64(td1), np.timedelta64(td2)],
    ]
    as_numpy_arrays = [
        np.array([np.timedelta64(td1), None], dtype='timedelta64[us]'),
        np.array([np.timedelta64(td1), np.timedelta64(td2)]),
    ]

    for nested in (as_scalar_lists, as_numpy_arrays):
        converted = pa.array(nested)
        assert len(converted) == 2
        assert converted.type == expected_type
        assert converted.to_pylist() == expected_values
Пример #16
0
    def test_duration_overflow(self, duckdb_cursor):
        """Converting an int64-max duration in seconds via DuckDB must raise."""
        if not can_run:
            return

        # Only seconds can overflow
        largest_int64 = 9223372036854775807
        seconds_column = pa.array([largest_int64], pa.duration('s'))
        arrow_table = pa.Table.from_arrays([seconds_column], ['a'])

        with pytest.raises(Exception):
            duckdb.from_arrow(arrow_table).arrow()
Пример #17
0
def test_from_numpy_dtype():
    """Check ``pa.from_numpy_dtype`` for supported and unsupported inputs.

    Covers numpy dtype instances, objects that numpy can turn into a dtype,
    the object dtype (NotImplementedError) and a string that is not a dtype
    at all (TypeError).
    """
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns')),
        (np.dtype('timedelta64[s]'), pa.duration('s')),
        (np.dtype('timedelta64[ms]'), pa.duration('ms')),
        (np.dtype('timedelta64[us]'), pa.duration('us')),
        (np.dtype('timedelta64[ns]'), pa.duration('ns')),
    ]

    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # np.str_ replaces the np.unicode alias, which was deprecated in
    # NumPy 1.20 and is not available in current NumPy releases.
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
Пример #18
0
def test_sequence_duration(np_scalar):
    """A sequence of timedeltas with a null infers ``duration('us')``.

    With ``np_scalar`` set, the non-null inputs are ``np.timedelta64`` values
    instead of ``datetime.timedelta`` objects.
    """
    td1 = datetime.timedelta(2, 3601, 1)
    td2 = datetime.timedelta(1, 100, 1000)
    wrap = np.timedelta64 if np_scalar else (lambda delta: delta)
    data = [wrap(td1), None, wrap(td2)]

    converted = pa.array(data)
    assert len(converted) == 3
    assert converted.type == pa.duration('us')
    assert converted.null_count == 1

    first, middle, last = (converted[i].as_py() for i in range(3))
    assert first == td1
    assert middle is None
    assert last == td2
Пример #19
0
def test_sequence_duration_with_unit(unit):
    """A timedelta converted with an explicit unit gives the listed value."""
    source = [datetime.timedelta(3, 22, 1001)]
    # Microsecond component expected back for each target unit.
    microseconds_by_unit = {'s': 0, 'ms': 1000, 'us': 1001, 'ns': 1001}

    duration_type = pa.duration(unit)
    converted = pa.array(source, type=duration_type)

    assert len(converted) == 1
    assert converted.type == duration_type
    expected = datetime.timedelta(3, 22, microseconds_by_unit[unit])
    assert converted[0].as_py() == expected
Пример #20
0
    def test_string_to_arrow_bijection_for_primitive_types(self):
        """Arrow types and their string form convert both ways consistently."""
        # Arrow type -> string -> Arrow type must give back the same type.
        round_trip_arrow_types = [
            pa.time32("s"),
            pa.time64("us"),
            pa.timestamp("s"),
            pa.timestamp("ns", tz="America/New_York"),
            pa.date32(),
            pa.date64(),
            pa.duration("s"),
            pa.decimal128(10, 2),
            pa.decimal256(40, -3),
            pa.string(),
            pa.int32(),
            pa.float64(),
        ]
        for arrow_type in round_trip_arrow_types:
            as_string = _arrow_to_datasets_dtype(arrow_type)
            self.assertEqual(arrow_type, string_to_arrow(as_string))

        # Arrow types for which the round trip is expected to fail.
        for arrow_type in [pa.list_(pa.float64())]:
            with self.assertRaises(ValueError):
                string_to_arrow(_arrow_to_datasets_dtype(arrow_type))

        # String -> Arrow type -> string must give back the same string.
        round_trip_strings = [
            "time32[s]",
            "timestamp[ns]",
            "timestamp[ns, tz=+07:30]",
            "duration[us]",
            "decimal128(30, -4)",
            "int32",
            "float64",
        ]
        for type_string in round_trip_strings:
            as_arrow = string_to_arrow(type_string)
            self.assertEqual(type_string, _arrow_to_datasets_dtype(as_arrow))

        # Strings that must be rejected outright.
        rejected_strings = [
            "time32[ns]",
            "timestamp[blob]",
            "timestamp[[ns]]",
            "timestamp[ns, tz=[ns]]",
            "duration[[us]]",
            "decimal20(30, -4)",
            "int",
        ]
        for type_string in rejected_strings:
            with self.assertRaises(ValueError):
                string_to_arrow(type_string)
Пример #21
0
        pyarrow.binary(),
        pyarrow.large_binary(),
    )

    # Lookup table from pyarrow date/time/timestamp/duration types to a
    # (flag, numpy dtype) pair: "M8[...]" (datetime64) for dates, times and
    # timestamps, "m8[...]" (timedelta64) for durations.
    # NOTE(review): the meaning of the boolean flag is not visible in this
    # excerpt. It is True only for the date32 and time32 entries - presumably
    # it marks types whose values need converting before being viewed as the
    # numpy dtype; confirm against the code that reads this table.
    _pyarrow_to_numpy_dtype = {
        pyarrow.date32(): (True, np.dtype("M8[D]")),
        pyarrow.date64(): (False, np.dtype("M8[ms]")),
        pyarrow.time32("s"): (True, np.dtype("M8[s]")),
        pyarrow.time32("ms"): (True, np.dtype("M8[ms]")),
        pyarrow.time64("us"): (False, np.dtype("M8[us]")),
        pyarrow.time64("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.timestamp("s"): (False, np.dtype("M8[s]")),
        pyarrow.timestamp("ms"): (False, np.dtype("M8[ms]")),
        pyarrow.timestamp("us"): (False, np.dtype("M8[us]")),
        pyarrow.timestamp("ns"): (False, np.dtype("M8[ns]")),
        pyarrow.duration("s"): (False, np.dtype("m8[s]")),
        pyarrow.duration("ms"): (False, np.dtype("m8[ms]")),
        pyarrow.duration("us"): (False, np.dtype("m8[us]")),
        pyarrow.duration("ns"): (False, np.dtype("m8[ns]")),
    }

if not ak._v2._util.numpy_at_least("1.17.0"):

    def packbits(bytearray, lsb_order=True):
        if lsb_order:
            if len(bytearray) % 8 == 0:
                ready_to_pack = bytearray
            else:
                ready_to_pack = numpy.empty(
                    int(numpy.ceil(len(bytearray) / 8.0)) * 8,
                    dtype=bytearray.dtype,
Пример #22
0
# Array classes accepted here: numpy arrays plus the supported Arrow classes.
supported_array_types = (np.ndarray, *supported_arrow_array_types)
string_types = [pa.string(), pa.large_string()]

# Signed names first, then unsigned, each in increasing bit width.
_type_names_int = [
    f"{prefix}int{bits}" for prefix in ("", "u") for bits in (8, 16, 32, 64)
]
_type_names = ["float64", "float32", *_type_names_int]

# Arrow type -> numpy dtype. The numeric pyarrow factory functions share
# their names with the numpy dtypes, so they are looked up by name.
map_arrow_to_numpy = dict(
    (getattr(pa, type_name)(), np.dtype(type_name))
    for type_name in _type_names
)
map_arrow_to_numpy[pa.bool_()] = np.dtype("?")

# Timestamps are registered first, durations afterwards.
for unit in ('s', 'ms', 'us', 'ns'):
    map_arrow_to_numpy[pa.timestamp(unit)] = np.dtype(f"datetime64[{unit}]")

for unit in ('s', 'ms', 'us', 'ns'):
    map_arrow_to_numpy[pa.duration(unit)] = np.dtype(f"timedelta64[{unit}]")


def full(n, value, dtype):
    """Create an array of ``n`` copies of ``value`` with the given dtype.

    The values are first materialised with ``np.full`` using the numpy side
    of ``DataType(dtype)``. They are wrapped in a pyarrow array when the data
    type reports ``is_arrow``; otherwise the numpy array is returned.
    """
    from .datatype import DataType

    data_type = DataType(dtype)
    filled = np.full(n, value, dtype=data_type.numpy)
    return pa.array(filled) if data_type.is_arrow else filled


def is_arrow_array(ar):
    """Tell whether ``ar`` is an instance of a supported Arrow array class."""
    arrow_classes = supported_arrow_array_types
    return isinstance(ar, arrow_classes)
Пример #23
0
    ("Q" if sys.platform == "win32" else "L"): UInt64,
    "f": Float32,
    "d": Float64,
    "?": Boolean,
}

if _PYARROW_AVAILABLE:
    _PY_TYPE_TO_ARROW_TYPE: dict[type, pa.lib.DataType] = {
        float: pa.float64(),
        int: pa.int64(),
        str: pa.large_utf8(),
        bool: pa.bool_(),
        date: pa.date32(),
        time: pa.time64("us"),
        datetime: pa.timestamp("us"),
        timedelta: pa.duration("us"),
    }

    _DTYPE_TO_ARROW_TYPE = {
        Int8: pa.int8(),
        Int16: pa.int16(),
        Int32: pa.int32(),
        Int64: pa.int64(),
        UInt8: pa.uint8(),
        UInt16: pa.uint16(),
        UInt32: pa.uint32(),
        UInt64: pa.uint64(),
        Float32: pa.float32(),
        Float64: pa.float64(),
        Boolean: pa.bool_(),
        Utf8: pa.large_utf8(),
Пример #24
0
def from_ibis_interval(dtype):
    """Translate an ibis interval dtype into a pyarrow duration type.

    :param dtype: object whose ``unit`` attribute is passed to
        ``pa.duration``.
    :return: the pyarrow duration type for ``dtype.unit``.
    :raises com.IbisTypeError: if ``pa.duration`` rejects the unit with a
        ValueError.
    """
    try:
        return pa.duration(dtype.unit)
    except ValueError as err:
        # Chain explicitly so the traceback reports pyarrow's error as the
        # direct cause rather than as an error raised during handling.
        raise com.IbisTypeError(
            f"Unsupported interval unit: {dtype.unit}"
        ) from err
Пример #25
0
    pa.date32(),
    pa.date64()
])
# Strategy over the four time-of-day types (32-bit: s/ms, 64-bit: us/ns).
time_types = st.sampled_from(
    [pa.time32(unit) for unit in ('s', 'ms')]
    + [pa.time64(unit) for unit in ('us', 'ns')]
)

# Strategy building timestamps from a sampled unit and a generated timezone.
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones(),
)

# Strategy over one duration type per supported unit.
duration_types = st.sampled_from(
    list(map(pa.duration, ['s', 'ms', 'us', 'ns']))
)

# Any date, time, timestamp or duration type.
temporal_types = st.one_of(
    date_types,
    time_types,
    timestamp_types,
    duration_types,
)

# Any primitive type: null, bool, binary/string variants, numeric, temporal.
primitive_types = st.one_of(
    null_type,
    bool_type,
    binary_type,
    string_type,
    large_binary_type,
    large_string_type,
    numeric_types,
    temporal_types,
)

# Strategy for text-to-text dictionaries.
metadata = st.dictionaries(st.text(), st.text())
Пример #26
0
def test_datetime_subclassing():
    """Subclasses of date, datetime and timedelta convert like the base types."""
    # --- date subclass ---
    dates = [MyDate(2007, 7, 13)]
    date_type = pa.date32()
    converted = pa.array(dates, type=date_type)
    assert len(converted) == 1
    assert converted.type == date_type
    assert converted[0].as_py() == datetime.date(2007, 7, 13)

    # --- datetime subclass: microseconds expected back for each unit ---
    stamps = [MyDatetime(2007, 7, 13, 1, 23, 34, 123456)]
    for unit, kept_microseconds in (('s', 0), ('ms', 123000), ('us', 123456)):
        timestamp_type = pa.timestamp(unit)
        converted = pa.array(stamps, type=timestamp_type)
        assert len(converted) == 1
        assert converted.type == timestamp_type
        assert converted[0].as_py() == datetime.datetime(
            2007, 7, 13, 1, 23, 34, kept_microseconds)

    # --- timedelta subclass ---
    deltas = [MyTimedelta(123, 456, 1002)]

    # Without an explicit type the unit is inferred as microseconds.
    inferred = pa.array(deltas)
    assert len(inferred) == 1
    assert inferred.type == pa.duration('us')
    assert inferred[0].as_py() == datetime.timedelta(123, 456, 1002)

    # With an explicit unit: microseconds expected back for each unit.
    for unit, kept_microseconds in (('s', 0), ('ms', 1000), ('us', 1002)):
        duration_type = pa.duration(unit)
        converted = pa.array(deltas, type=duration_type)
        assert len(converted) == 1
        assert converted.type == duration_type
        assert converted[0].as_py() == datetime.timedelta(
            123, 456, kept_microseconds)
Пример #27
0
            pa.field("c", pa.string()),
        ]
    ),
    pa.struct(
        [
            pa.field("a", pa.int32(), nullable=False),
            pa.field("b", pa.int8(), nullable=False),
            pa.field("c", pa.string()),
        ]
    ),
    pa.dictionary(pa.int8(), pa.string()),
]

_unsupported_pyarrow_types = [
    pa.decimal256(76, 38),
    pa.duration("s"),
    pa.map_(pa.string(), pa.int32()),
    pa.union(
        [pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
        mode=pa.lib.UnionMode_DENSE,
    ),
    pa.union(
        [pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
        mode=pa.lib.UnionMode_DENSE,
        type_codes=[4, 8],
    ),
    pa.union(
        [pa.field("a", pa.binary(10)), pa.field("b", pa.string())],
        mode=pa.lib.UnionMode_SPARSE,
    ),
    pa.union(
Пример #28
0
def test_basics(fletcher_array):
    """A frame with one fletcher column per Arrow type survives dask.

    Both the non-empty meta frame and the computed result must equal the
    source pandas frame.
    """

    def column(values, arrow_type):
        # Wrap the values as a fletcher extension array of the given type.
        return fletcher_array(pa.array(values, type=arrow_type))

    columns = {
        "null": column([None, None], pa.null()),
        "bool": column([None, True], pa.bool_()),
        "int8": column([None, -1], pa.int8()),
        "uint8": column([None, 1], pa.uint8()),
        "int16": column([None, -1], pa.int16()),
        "uint16": column([None, 1], pa.uint16()),
        "int32": column([None, -1], pa.int32()),
        "uint32": column([None, 1], pa.uint32()),
        "int64": column([None, -1], pa.int64()),
        "uint64": column([None, 1], pa.uint64()),
        "float16": column([None, np.float16(-0.1)], pa.float16()),
        "float32": column([None, -0.1], pa.float32()),
        "float64": column([None, -0.1], pa.float64()),
        "date32": column([None, datetime.date(2010, 9, 8)], pa.date32()),
        "date64": column([None, datetime.date(2010, 9, 8)], pa.date64()),
        # The timestamp[s], timestamp[ms] and timestamp[us] columns are
        # disabled because of
        # https://github.com/pandas-dev/pandas/issues/34986
        # FIXME: assert_extension_array_equal casts to numpy object thus
        # cannot handle nanoseconds, so timestamp[ns] is left out as well.
        "binary": column([None, b"122"], pa.binary()),
        "string": column([None, "🤔"], pa.string()),
        "duration[s]": column(
            [None, datetime.timedelta(seconds=9)], pa.duration("s")
        ),
        "duration[ms]": column(
            [None, datetime.timedelta(milliseconds=8)], pa.duration("ms")
        ),
        "duration[us]": column(
            [None, datetime.timedelta(microseconds=7)], pa.duration("us")
        ),
        # FIXME: duration[ns] is left out for the same nanosecond limitation
        # of assert_extension_array_equal.
        "list[string]": column([None, [None, "🤔"]], pa.list_(pa.string())),
    }
    df = pd.DataFrame(columns)
    ddf = dd.from_pandas(df, npartitions=2)

    pdt.assert_frame_equal(ddf._meta_nonempty, df)
    pdt.assert_frame_equal(ddf.compute(), df)
Пример #29
0
def generate_test_parquet() -> None:
    """Write a Parquet fixture containing one column per Arrow data type.

    Every column is a 5-element pyarrow array.  The columns listed in
    ``names`` are assembled into a table, a ``"geo"`` metadata entry
    describing the ``geometry`` column is attached to the schema, and the
    table is written to ``ogr/data/parquet/test.parquet`` relative to the
    directory of this file.

    The function takes no arguments and returns nothing; its only effect is
    the file it writes.  It relies on ``wkt_epsg_4326``, which is not defined
    inside this function (presumably a module-level constant -- confirm).
    """
    import pyarrow as pa
    import datetime
    import decimal
    import json
    import pandas as pd
    import pathlib
    import pyarrow.parquet as pq
    import struct

    # --- Scalar columns.  Most numeric columns have a null at index 2. ---
    boolean = pa.array([True, False, None, False, True], type=pa.bool_())
    uint8 = pa.array([None if i == 2 else 1 + i for i in range(5)],
                     type=pa.uint8())
    int8 = pa.array([None if i == 2 else -2 + i for i in range(5)],
                    type=pa.int8())
    uint16 = pa.array([None if i == 2 else 1 + i * 10000 for i in range(5)],
                      type=pa.uint16())
    int16 = pa.array(
        [None if i == 2 else -20000 + i * 10000 for i in range(5)],
        type=pa.int16())
    uint32 = pa.array(
        [None if i == 2 else 1 + i * 1000000000 for i in range(5)],
        type=pa.uint32())
    int32 = pa.array(
        [None if i == 2 else -2000000000 + i * 1000000000 for i in range(5)],
        type=pa.int32())
    uint64 = pa.array(
        [None if i == 2 else 1 + i * 100000000000 for i in range(5)],
        type=pa.uint64())
    int64 = pa.array([
        None if i == 2 else -200000000000 + i * 100000000000 for i in range(5)
    ],
                     type=pa.int64())
    float32 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float32())
    float64 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float64())
    string = pa.array(["abcd", "", None, "c", "d"], type=pa.string())
    large_string = pa.array(["abcd", "", None, "c", "d"],
                            type=pa.large_string())

    # --- Timestamp columns: the same instant (2019-01-01 14:00 plus
    # `nanosecond`) repeated 5 times, in several time zones. ---
    # `500 * 1e6` is a float equal to 5e8 nanoseconds, i.e. half a second.
    # NOTE(review): a float and a value this large are passed as
    # `nanosecond=`; confirm the pandas version in use accepts that.
    gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2))
    timestamp_ms_gmt_plus_2 = pa.array([
        pd.Timestamp(year=2019,
                     month=1,
                     day=1,
                     hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_plus_2)
    ] * 5,
                                       type=pa.timestamp('ms', tz=gmt_plus_2))
    gmt = datetime.timezone(datetime.timedelta(hours=0))
    timestamp_ms_gmt = pa.array([
        pd.Timestamp(
            year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6, tz=gmt)
    ] * 5,
                                type=pa.timestamp('ms', tz=gmt))
    # hours=-2.25 is an offset of minus 2 hours 15 minutes.
    gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25))
    timestamp_ms_gmt_minus_0215 = pa.array([
        pd.Timestamp(year=2019,
                     month=1,
                     day=1,
                     hour=14,
                     nanosecond=500 * 1e6,
                     tz=gmt_minus_0215)
    ] * 5,
                                           type=pa.timestamp(
                                               'ms', tz=gmt_minus_0215))
    timestamp_s_no_tz = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6)
    ] * 5,
                                 type=pa.timestamp('s'))

    # --- Time, date and duration columns, given as raw integer/float counts
    # in the unit of the target type.  3600 + 120 + 3 seconds is 01:02:03. ---
    time32_s = pa.array([3600 + 120 + 3, None, 3, 4, 5], type=pa.time32('s'))
    time32_ms = pa.array([(3600 + 120 + 3) * 1000 + 456, 2, 3, 4, 5],
                         type=pa.time32('ms'))
    # The `* 1e6` / `* 1e9` factors make the first element a float.
    time64_us = pa.array([(3600 + 120 + 3) * 1e6, None, 3, 4, 5],
                         type=pa.time64('us'))
    time64_ns = pa.array([(3600 + 120 + 3) * 1e9 + 456, 2, 3, 4, 5],
                         type=pa.time64('ns'))
    date32 = pa.array([1, 2, 3, 4, 5], type=pa.date32())
    # 86400 * 1000 is the number of milliseconds in one day.
    date64 = pa.array([86400 * 1000, 2, 3, 4, 5], type=pa.date64())
    # The two duration arrays are built but excluded from the table: their
    # entries in `names` below are commented out.
    duration_s = pa.array([1, 2, 3, 4, 5], type=pa.duration('s'))
    duration_ms = pa.array([1, 2, 3, 4, 5], type=pa.duration('ms'))

    # --- Binary and decimal columns. ---
    binary = pa.array([b'\x00\x01'] * 5, type=pa.binary())
    large_binary = pa.array([b'\x00\x01'] * 5, type=pa.large_binary())
    fixed_size_binary = pa.array([b'\x00\x01'] * 5, type=pa.binary(2))
    decimal128 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ],
                          type=pa.decimal128(7, 3))
    decimal256 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ],
                          type=pa.decimal256(7, 3))

    # --- Variable-length list columns.  Row i holds a list of i elements
    # (row 2 is a null list), and element 0 of every non-empty list is
    # null. ---
    list_boolean = pa.array([
        None if i == 2 else [
            None if j == 0 else True if (j % 2) == 0 else False
            for j in range(i)
        ] for i in range(5)
    ],
                            type=pa.list_(pa.bool_()))
    list_uint8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.uint8()))
    list_int8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                         type=pa.list_(pa.int8()))
    list_uint16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint16()))
    list_int16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int16()))
    list_uint32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint32()))
    list_int32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int32()))
    list_uint64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                           type=pa.list_(pa.uint64()))
    list_int64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                          type=pa.list_(pa.int64()))
    list_float32 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                            type=pa.list_(pa.float32()))
    list_float64 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ],
                            type=pa.list_(pa.float64()))
    # Element j is a run of j + 1 consecutive letters starting at chr(65 + j)
    # ("A", "BC", "CDE", ...).  No explicit type is given, so pyarrow infers
    # it from the values.
    list_string = pa.array([
        None if i == 2 else [
            "".join(["%c" % (65 + j + k) for k in range(1 + j)])
            for j in range(i)
        ] for i in range(5)
    ])

    # --- Fixed-size list columns: pa.list_(<type>, 2) requests lists of
    # exactly two elements. ---
    fixed_size_list_boolean = pa.array(
        [[True, False], [False, True], [True, False], [False, True],
         [True, False]],
        type=pa.list_(pa.bool_(), 2))
    fixed_size_list_uint8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.uint8(), 2))
    fixed_size_list_int8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                    type=pa.list_(pa.int8(), 2))
    fixed_size_list_uint16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint16(), 2))
    fixed_size_list_int16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int16(), 2))
    fixed_size_list_uint32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint32(), 2))
    fixed_size_list_int32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int32(), 2))
    fixed_size_list_uint64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint64(), 2))
    fixed_size_list_int64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int64(), 2))
    # The float variants additionally carry a null element in the first row.
    fixed_size_list_float32 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float32(), 2))
    fixed_size_list_float64 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float64(), 2))
    fixed_size_list_string = pa.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]],
        type=pa.list_(pa.string(), 2))

    # --- Struct column: the same nested record repeated 5 times; the struct
    # type is inferred from the dict. ---
    struct_field = pa.array([{
        "a": 1,
        "b": 2.5,
        "c": {
            "d": "e",
            "f": "g"
        },
        "h": [5, 6],
        "i": 3
    }] * 5)

    # Disabled alternative: a struct nested 123 levels deep.
    #struct_val = { "a": 5 }
    #for i in range(123):
    #    struct_val = { "a": struct_val }
    #struct_field = pa.array([struct_val] * 5)

    # --- Map columns with string keys: row 2 is a null map, rows 3 and 4 are
    # empty maps, and the 'y' entry of row 0 has a null value (the 'x' entry
    # for map_boolean). ---
    map_boolean = pa.array([[('x', None),
                             ('y', True)], [('z', True)], None, [], []],
                           type=pa.map_(pa.string(), pa.bool_()))
    map_uint8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.uint8()))
    map_int8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                        type=pa.map_(pa.string(), pa.int8()))
    map_uint16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint16()))
    map_int16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int16()))
    map_uint32 = pa.array([[('x', 4 * 1000 * 1000 * 1000),
                            ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint32()))
    map_int32 = pa.array([[('x', 2 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int32()))
    map_uint64 = pa.array([[('x', 4 * 1000 * 1000 * 1000 * 1000),
                            ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint64()))
    map_int64 = pa.array([[('x', -2 * 1000 * 1000 * 1000 * 1000),
                           ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int64()))
    map_float32 = pa.array([[('x', 1.5),
                             ('y', None)], [('z', 3)], None, [], []],
                           type=pa.map_(pa.string(), pa.float32()))
    map_float64 = pa.array([[('x', 1.5),
                             ('y', None)], [('z', 3)], None, [], []],
                           type=pa.map_(pa.string(), pa.float64()))
    map_string = pa.array([[('x', 'x_val'),
                            ('y', None)], [('z', 'z_val')], None, [], []],
                          type=pa.map_(pa.string(), pa.string()))

    # --- Dictionary-encoded column: indices (one of them null) into a
    # three-entry dictionary. ---
    indices = pa.array([0, 1, 2, None, 2])
    dictionary = pa.array(['foo', 'bar', 'baz'])
    # NOTE: this local shadows the builtin `dict` for the rest of the
    # function.  It cannot simply be renamed, because the "dict" column is
    # looked up by this exact name through locals() below.
    dict = pa.DictionaryArray.from_arrays(indices, dictionary)

    # Built but excluded from the table: "map_list" is commented out in
    # `names` below.
    map_list = pa.array([[('x', []), ('y', [])], [('z', [])], None, [], []],
                        type=pa.map_(pa.string(), pa.list_(pa.uint32())))

    # --- Geometry column: a 5-byte header followed by two little-endian
    # doubles (i, 2); row 1 is null.  The schema metadata below declares the
    # encoding as WKB, so the header presumably reads as "little-endian,
    # geometry type 1 (point)". ---
    geometry = pa.array([
        None if i == 1 else
        (b'\x01\x01\x00\x00\x00' + struct.pack('<dd', i, 2)) for i in range(5)
    ],
                        type=pa.binary())

    # Column order of the output table.  Each entry must be spelled exactly
    # like the local variable that holds the array, because the arrays are
    # fetched by name from locals() below.  Commented-out entries are arrays
    # that are built above but deliberately left out of the file.
    names = [
        "boolean",
        "uint8",
        "int8",
        "uint16",
        "int16",
        "uint32",
        "int32",
        "uint64",
        "int64",
        "float32",
        "float64",
        "string",
        "large_string",
        "timestamp_ms_gmt",
        "timestamp_ms_gmt_plus_2",
        "timestamp_ms_gmt_minus_0215",
        "timestamp_s_no_tz",
        "time32_s",
        "time32_ms",
        "time64_us",
        "time64_ns",
        "date32",
        "date64",
        # "duration_s",
        # "duration_ms",
        "binary",
        "large_binary",
        "fixed_size_binary",
        "decimal128",
        "decimal256",
        "list_boolean",
        "list_uint8",
        "list_int8",
        "list_uint16",
        "list_int16",
        "list_uint32",
        "list_int32",
        "list_uint64",
        "list_int64",
        "list_float32",
        "list_float64",
        "list_string",
        "fixed_size_list_boolean",
        "fixed_size_list_uint8",
        "fixed_size_list_int8",
        "fixed_size_list_uint16",
        "fixed_size_list_int16",
        "fixed_size_list_uint32",
        "fixed_size_list_int32",
        "fixed_size_list_uint64",
        "fixed_size_list_int64",
        "fixed_size_list_float32",
        "fixed_size_list_float64",
        "fixed_size_list_string",
        "struct_field",
        "map_boolean",
        "map_uint8",
        "map_int8",
        "map_uint16",
        "map_int16",
        "map_uint32",
        "map_int32",
        "map_uint64",
        "map_int64",
        "map_float32",
        "map_float64",
        "map_string",
        # "map_list",
        "dict",
        "geometry",
    ]

    # Snapshot the local namespace first: inside the list comprehension a
    # direct locals() call would see the comprehension's own scope instead.
    locals_ = locals()
    table = pa.table([locals_[x] for x in names], names=names)

    # Attach a "geo" schema metadata entry (a JSON document) describing the
    # geometry column.  The bbox agrees with the points generated above
    # (x between 0 and 4, y always 2).
    # NOTE(review): `wkt_epsg_4326` is not defined in this function;
    # presumably a module-level WKT string for EPSG:4326 -- confirm.
    my_schema = table.schema.with_metadata({
        "geo":
        json.dumps({
            "version": "0.1.0",
            "primary_column": "geometry",
            "columns": {
                "geometry": {
                    'crs': wkt_epsg_4326,
                    'bbox': [0, 2, 4, 2],
                    'encoding': 'WKB'
                }
            }
        })
    })

    # Casting to the annotated schema yields a table that carries the
    # metadata.
    table = table.cast(my_schema)
    HERE = pathlib.Path(__file__).parent
    # Written with compression='NONE' and row_group_size=3 for the 5-row
    # table.
    pq.write_table(table,
                   HERE / "ogr/data/parquet/test.parquet",
                   compression='NONE',
                   row_group_size=3)
Пример #30
0
    # Lists of pyarrow data types grouped by kind.
    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    # pa.utf8() is documented by pyarrow as an alias of pa.string().
    # Note that this list is not part of ALL_PYARROW_DTYPES below.
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    # Every combination of the four units and four time zone settings
    # (16 types), with the unit varying slowest.
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz) for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    TIMEDELTA_PYARROW_DTYPES = [
        pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
    ]

    BOOL_PYARROW_DTYPES = [pa.bool_()]

    # TODO: Add container like pyarrow types:
    #  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
    # Concatenation of the groups above plus ALL_INT_PYARROW_DTYPES, which is
    # not defined in this excerpt (presumably just above it -- confirm).
    ALL_PYARROW_DTYPES = (ALL_INT_PYARROW_DTYPES + FLOAT_PYARROW_DTYPES +
                          TIME_PYARROW_DTYPES + DATE_PYARROW_DTYPES +
                          DATETIME_PYARROW_DTYPES + TIMEDELTA_PYARROW_DTYPES +
                          BOOL_PYARROW_DTYPES)

# Compiled regex for an empty string.  Because "$" also matches just before
# a trailing newline, a lone "\n" matches as well.
EMPTY_STRING_PATTERN = re.compile(r"^$")

# Warning categories tied to testing mode (presumably the ones surfaced when
# testing mode is switched on -- the consumer is outside this excerpt).
_testing_mode_warnings = (
    DeprecationWarning,
    ResourceWarning,
)