Example #1
def test_iterator_without_size():
    expected = pa.array((0, 1, 2))
    arr1 = pa.array(iter(range(3)))
    assert arr1.equals(expected)
    # Same with explicit type
    arr1 = pa.array(iter(range(3)), type=pa.int64())
    assert arr1.equals(expected)
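
A hedged companion sketch (added; not part of the original test): without an explicit size, pa.array consumes the whole iterator and infers the type from its contents, so the same pattern works for non-integer data.

# Unsized iterator of strings: fully consumed, type inferred as string
arr = pa.array(iter(['a', 'b', 'c']))
assert arr.type == pa.string()
assert arr.to_pylist() == ['a', 'b', 'c']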
Example #2
def make_recordbatch(length):
    schema = pa.schema([pa.field('f0', pa.int16()),
                        pa.field('f1', pa.int16())])
    a0 = pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
    a1 = pa.array(np.random.randint(0, 255, size=length, dtype=np.int16))
    batch = pa.RecordBatch.from_arrays([a0, a1], schema)
    return batch
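
A minimal usage sketch for the helper above (an addition, not from the original snippet):

# Build a 10-row batch and check the shape implied by the schema
batch = make_recordbatch(10)
assert batch.num_rows == 10
assert batch.schema.names == ['f0', 'f1']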
Example #3
def test_file_reader_writer():
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(['foo', 'bar', 'baz', None]),
        pa.array([True, None, False, True])
    ]
    batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2'])

    sink = pa.BufferOutputStream()

    with pytest.warns(FutureWarning):
        stream_writer = pa.StreamWriter(sink, batch.schema)
        assert isinstance(stream_writer, pa.RecordBatchStreamWriter)

    sink2 = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        file_writer = pa.FileWriter(sink2, batch.schema)
        assert isinstance(file_writer, pa.RecordBatchFileWriter)

    file_writer.write_batch(batch)
    stream_writer.write_batch(batch)

    file_writer.close()
    stream_writer.close()

    buf = sink.get_result()
    buf2 = sink2.get_result()

    with pytest.warns(FutureWarning):
        stream_reader = pa.StreamReader(buf)
        assert isinstance(stream_reader, pa.RecordBatchStreamReader)

    with pytest.warns(FutureWarning):
        file_reader = pa.FileReader(buf2)
        assert isinstance(file_reader, pa.RecordBatchFileReader)
Example #4
def test_sequence_nesting_levels():
    data = [1, 2, None]
    arr = pa.array(data)
    assert arr.type == pa.int64()
    assert arr.to_pylist() == data

    data = [[1], [2], None]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    data = [[1], [2, 3, 4], [None]]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.int64())
    assert arr.to_pylist() == data

    data = [None, [[None, 1]], [[2, 3, 4], None], [None]]
    arr = pa.array(data)
    assert arr.type == pa.list_(pa.list_(pa.int64()))
    assert arr.to_pylist() == data

    exceptions = (pa.ArrowInvalid, pa.ArrowTypeError)

    # Mixed nesting levels are rejected
    with pytest.raises(exceptions):
        pa.array([1, 2, [1]])

    with pytest.raises(exceptions):
        pa.array([1, 2, []])

    with pytest.raises(exceptions):
        pa.array([[1], [2], [None, [1]]])
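
A hedged sketch of the flip side (added for illustration): an explicit list type pins both the nesting level and the value type instead of relying on inference.

# Inference would give list<int64>; the explicit type forces list<int8>
arr = pa.array([[1], [2, 3], None], type=pa.list_(pa.int8()))
assert arr.type == pa.list_(pa.int8())
assert arr.to_pylist() == [[1], [2, 3], None]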
Example #5
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
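
A hedged sketch (added): the same struct type also accepts dicts, and omitted keys come back as nulls.

# 'c' is omitted from the dict, so it is filled in as None
arr = pa.array([{'a': 5, 'b': 'foo'}], type=ty)
assert arr.to_pylist() == [{'a': 5, 'b': 'foo', 'c': None}]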
Example #6
def test_unsigned_integer_overflow(bits):
    ty = getattr(pa, "uint%d" % bits)()
    # XXX ideally would raise OverflowError
    with pytest.raises((ValueError, pa.ArrowException)):
        pa.array([2 ** bits], ty)
    with pytest.raises((ValueError, pa.ArrowException)):
        pa.array([-1], ty)
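
For contrast, a hedged sketch (added; assuming bits is one of the parametrized widths 8/16/32/64): the boundary values themselves are still accepted.

# Smallest and largest representable values fit without error
arr = pa.array([0, 2 ** bits - 1], ty)
assert arr.to_pylist() == [0, 2 ** bits - 1]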
Example #7
def test_table_basics():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]
    table = pa.Table.from_arrays(data, names=('a', 'b'))
    table._validate()
    assert len(table) == 5
    assert table.num_rows == 5
    assert table.num_columns == 2
    assert table.shape == (5, 2)
    assert table.to_pydict() == OrderedDict([
        ('a', [0, 1, 2, 3, 4]),
        ('b', [-10, -5, 0, 5, 10])
    ])

    columns = []
    for col in table.itercolumns():
        columns.append(col)
        for chunk in col.data.iterchunks():
            assert chunk is not None

        with pytest.raises(IndexError):
            col.data.chunk(-1)

        with pytest.raises(IndexError):
            col.data.chunk(col.data.num_chunks)

    assert table.columns == columns
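
A hedged sketch (added; not part of the original test): an equivalent table can be built from a dict of columns via Table.from_pydict.

# Same columns, constructed from a plain dict
table2 = pa.Table.from_pydict({'a': list(range(5)),
                               'b': [-10, -5, 0, 5, 10]})
assert table2.to_pydict() == table.to_pydict()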
Example #8
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that casts between supported types don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example #9
def test_struct_array_field():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x0 = a.field(0)
    y0 = a.field(1)
    x1 = a.field(-2)
    y1 = a.field(-1)
    x2 = a.field('x')
    y2 = a.field('y')

    assert isinstance(x0, pa.lib.Int16Array)
    assert isinstance(y1, pa.lib.FloatArray)
    assert x0.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))
    assert x0.equals(x1)
    assert x0.equals(x2)
    assert y0.equals(y1)
    assert y0.equals(y2)

    for invalid_index in [None, pa.int16()]:
        with pytest.raises(TypeError):
            a.field(invalid_index)

    for invalid_index in [3, -3]:
        with pytest.raises(IndexError):
            a.field(invalid_index)

    for invalid_name in ['z', '']:
        with pytest.raises(KeyError):
            a.field(invalid_name)
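
A hedged sketch (added): StructArray.flatten() returns all child arrays at once, matching the per-field accessors used above.

# Children come back in declaration order
x, y = a.flatten()
assert x.equals(a.field('x'))
assert y.equals(a.field('y'))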
Example #10
def test_infinite_iterator():
    expected = pa.array((0, 1, 2))
    arr1 = pa.array(itertools.count(0), size=3)
    assert arr1.equals(expected)
    # Same with explicit type
    arr1 = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert arr1.equals(expected)
Example #11
def test_asarray():
    arr = pa.array(range(4))

    # The iterator interface gives back an array of Int64Value's
    np_arr = np.asarray([_ for _ in arr])
    assert np_arr.tolist() == [0, 1, 2, 3]
    assert np_arr.dtype == np.dtype('O')
    assert type(np_arr[0]) == pa.lib.Int64Value

    # Calling with the arrow array gives back an array with 'int64' dtype
    np_arr = np.asarray(arr)
    assert np_arr.tolist() == [0, 1, 2, 3]
    assert np_arr.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    np_arr = np.asarray(arr, dtype='str')
    assert np_arr.tolist() == ['0', '1', '2', '3']

    # If PyArrow array has null values, numpy type will be changed as needed
    # to support nulls.
    arr = pa.array([0, 1, 2, None])
    assert arr.type == pa.int64()
    np_arr = np.asarray(arr)
    elements = np_arr.tolist()
    assert elements[:3] == [0., 1., 2.]
    assert np.isnan(elements[3])
    assert np_arr.dtype == np.dtype('float64')
Example #12
def test_recordbatch_basics():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert not batch.schema.metadata

    assert len(batch) == 5
    assert batch.num_rows == 5
    assert batch.num_columns == len(data)
    assert batch.to_pydict() == OrderedDict([
        ('c0', [0, 1, 2, 3, 4]),
        ('c1', [-10, -5, 0, 5, 10])
    ])

    with pytest.raises(IndexError):
        # bounds checking
        batch[2]

    # Schema passed explicitly
    schema = pa.schema([pa.field('c0', pa.int16()),
                        pa.field('c1', pa.int32())],
                       metadata={b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)
    assert batch.schema == schema
Example #13
def test_invalid_table_construct():
    array = np.array([0, 1], dtype=np.uint8)
    u8 = pa.uint8()
    arrays = [pa.array(array, type=u8), pa.array(array[1:], type=u8)]

    with pytest.raises(pa.lib.ArrowInvalid):
        pa.Table.from_arrays(arrays, names=["a1", "a2"])
Example #14
def test_cast_time32_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int32'),
                   type=pa.time32('s'))
    expected = pa.array([0, 1, 2], type='i4')

    result = arr.cast('i4')
    assert result.equals(expected)
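
A hedged sketch of the reverse direction (added; not in the original test): plain int32 values can be cast to time32 as well.

# Reinterpret int32 counts as seconds-since-midnight
arr2 = pa.array([0, 1, 2], type='i4').cast(pa.time32('s'))
assert arr2.type == pa.time32('s')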
Example #15
def test_cast_timestamp_unit():
    # ARROW-1680
    val = datetime.datetime.now()
    s = pd.Series([val])
    s_nyc = s.dt.tz_localize('tzlocal()').dt.tz_convert('America/New_York')

    us_with_tz = pa.timestamp('us', tz='America/New_York')

    arr = pa.Array.from_pandas(s_nyc, type=us_with_tz)

    # ARROW-1906
    assert arr.type == us_with_tz

    arr2 = pa.Array.from_pandas(s, type=pa.timestamp('us'))

    assert arr[0].as_py() == s_nyc[0]
    assert arr2[0].as_py() == s[0]

    # Disallow truncation
    arr = pa.array([123123], type='int64').cast(pa.timestamp('ms'))
    expected = pa.array([123], type='int64').cast(pa.timestamp('s'))

    target = pa.timestamp('s')
    with pytest.raises(ValueError):
        arr.cast(target)

    result = arr.cast(target, safe=False)
    assert result.equals(expected)
Example #16
def test_buffers_primitive():
    a = pa.array([1, 2, None, 4], type=pa.int16())
    buffers = a.buffers()
    assert len(buffers) == 2
    null_bitmap = buffers[0].to_pybytes()
    assert 1 <= len(null_bitmap) <= 64  # XXX this is varying
    assert bytearray(null_bitmap)[0] == 0b00001011

    # Slicing does not change the buffers, only the offset
    a_sliced = a[1:]
    buffers = a_sliced.buffers()
    assert a_sliced.offset == 1
    assert len(buffers) == 2
    null_bitmap = buffers[0].to_pybytes()
    assert 1 <= len(null_bitmap) <= 64  # XXX this is varying
    assert bytearray(null_bitmap)[0] == 0b00001011

    assert struct.unpack('hhxxh', buffers[1].to_pybytes()) == (1, 2, 4)

    a = pa.array(np.int8([4, 5, 6]))
    buffers = a.buffers()
    assert len(buffers) == 2
    # No null bitmap from Numpy int array
    assert buffers[0] is None
    assert struct.unpack('3b', buffers[1].to_pybytes()) == (4, 5, 6)

    a = pa.array([b'foo!', None, b'bar!!'])
    buffers = a.buffers()
    assert len(buffers) == 3
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 4, 4, 9)
    values = buffers[2].to_pybytes()
    assert values == b'foo!bar!!'
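
A hedged round-trip sketch (added; primitive types only): the buffers can be fed back through pa.Array.from_buffers to rebuild an equivalent array.

# Reconstruct the int16 array from its validity bitmap and data buffer
a = pa.array([1, 2, None, 4], type=pa.int16())
rebuilt = pa.Array.from_buffers(pa.int16(), len(a), a.buffers())
assert rebuilt.equals(a)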
Example #17
def test_cast_timestamp_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                   type=pa.timestamp('us'))
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')
    assert result.equals(expected)
Example #18
def test_buffers_nested():
    a = pa.array([[1, 2], None, [3, None, 4, 5]], type=pa.list_(pa.int64()))
    buffers = a.buffers()
    assert len(buffers) == 4
    # The parent buffers
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    offsets = buffers[1].to_pybytes()
    assert struct.unpack('4i', offsets) == (0, 2, 2, 6)
    # The child buffers
    null_bitmap = buffers[2].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00110111
    values = buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', values) == (1, 2, 3, 4, 5)

    a = pa.array([(42, None), None, (None, 43)],
                 type=pa.struct([pa.field('a', pa.int8()),
                                 pa.field('b', pa.int16())]))
    buffers = a.buffers()
    assert len(buffers) == 5
    # The parent buffer
    null_bitmap = buffers[0].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000101
    # The child buffers: 'a'
    null_bitmap = buffers[1].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000001
    values = buffers[2].to_pybytes()
    assert struct.unpack('bxx', values) == (42,)
    # The child buffers: 'b'
    null_bitmap = buffers[3].to_pybytes()
    assert bytearray(null_bitmap)[0] == 0b00000100
    values = buffers[4].to_pybytes()
    assert struct.unpack('4xh', values) == (43,)
Example #19
def test_to_pandas_zero_copy():
    import gc

    arr = pa.array(range(10))

    for i in range(10):
        np_arr = arr.to_pandas()
        assert sys.getrefcount(np_arr) == 2
        np_arr = None  # noqa

    assert sys.getrefcount(arr) == 2

    for i in range(10):
        arr = pa.array(range(10))
        np_arr = arr.to_pandas()
        arr = None
        gc.collect()

        # Ensure base is still valid

        # Because of py.test's assert inspection magic, if you put getrefcount
        # on the line being examined, it will be 1 higher than you expect
        base_refcount = sys.getrefcount(np_arr.base)
        assert base_refcount == 2
        np_arr.sum()
Example #20
def test_recordbatch_slice():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]
    names = ['c0', 'c1']

    batch = pa.RecordBatch.from_arrays(data, names)

    sliced = batch.slice(2)

    assert sliced.num_rows == 3

    expected = pa.RecordBatch.from_arrays(
        [x.slice(2) for x in data], names)
    assert sliced.equals(expected)

    sliced2 = batch.slice(2, 2)
    expected2 = pa.RecordBatch.from_arrays(
        [x.slice(2, 2) for x in data], names)
    assert sliced2.equals(expected2)

    # 0 offset
    assert batch.slice(0).equals(batch)

    # Slice past end of array
    assert len(batch.slice(len(batch))) == 0

    with pytest.raises(IndexError):
        batch.slice(-1)
Example #21
def test_chunked_array_str():
    data = [
        pa.array([1, 2, 3]),
        pa.array([4, 5, 6])
    ]
    data = pa.chunked_array(data)
    assert str(data) == """[
  [
    1,
    2,
    3
  ],
  [
    4,
    5,
    6
  ]
]"""
Example #22
def test_list_from_arrays():
    offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
    offsets = pa.array(offsets_arr, type='int32')
    pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h']
    values = pa.array(pyvalues, type='binary')

    result = pa.ListArray.from_arrays(offsets, values)
    expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]])

    assert result.equals(expected)

    # With nulls
    offsets = [0, None, 2, 6]

    values = ['a', 'b', 'c', 'd', 'e', 'f']

    result = pa.ListArray.from_arrays(offsets, values)
    expected = pa.array([values[:2], None, values[2:]])

    assert result.equals(expected)

    # Another edge case
    offsets2 = [0, 2, None, 6]
    result = pa.ListArray.from_arrays(offsets2, values)
    expected = pa.array([values[:2], values[2:], None])
    assert result.equals(expected)
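
A hedged sketch (added for illustration): flatten() undoes the list structure, returning the underlying values array.

# Two lists over five values; flatten() recovers the raw values
list_arr = pa.ListArray.from_arrays([0, 2, 5], ['a', 'b', 'c', 'd', 'e'])
assert list_arr.to_pylist() == [['a', 'b'], ['c', 'd', 'e']]
assert list_arr.flatten().to_pylist() == ['a', 'b', 'c', 'd', 'e']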
Example #23
def test_recordbatch_from_arrays_validate_lengths():
    # ARROW-2820
    data = [pa.array([1]), pa.array(["tokyo", "like", "happy"]),
            pa.array(["derek"])]

    with pytest.raises(ValueError):
        pa.RecordBatch.from_arrays(data, ['id', 'tags', 'name'])
Example #24
def test_array_slice():
    arr = pa.array(range(10))

    sliced = arr.slice(2)
    expected = pa.array(range(2, 10))
    assert sliced.equals(expected)

    sliced2 = arr.slice(2, 4)
    expected2 = pa.array(range(2, 6))
    assert sliced2.equals(expected2)

    # 0 offset
    assert arr.slice(0).equals(arr)

    # Slice past end of array
    assert len(arr.slice(len(arr))) == 0

    with pytest.raises(IndexError):
        arr.slice(-1)

    # Test slice notation
    assert arr[2:].equals(arr.slice(2))
    assert arr[2:5].equals(arr.slice(2, 3))
    assert arr[-5:].equals(arr.slice(len(arr) - 5))
    with pytest.raises(IndexError):
        arr[::-1]
    with pytest.raises(IndexError):
        arr[::2]

    n = len(arr)
    for start in range(-n * 2, n * 2):
        for stop in range(-n * 2, n * 2):
            assert arr[start:stop].to_pylist() == arr.to_pylist()[start:stop]
Example #25
def test_chunked_array_asarray():
    data = [
        pa.array([0]),
        pa.array([1, 2, 3])
    ]
    chunked_arr = pa.chunked_array(data)

    np_arr = np.asarray(chunked_arr)
    assert np_arr.tolist() == [0, 1, 2, 3]
    assert np_arr.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    np_arr = np.asarray(chunked_arr, dtype='str')
    assert np_arr.tolist() == ['0', '1', '2', '3']

    # Types are modified when there are nulls
    data = [
        pa.array([1, None]),
        pa.array([1, 2, 3])
    ]
    chunked_arr = pa.chunked_array(data)

    np_arr = np.asarray(chunked_arr)
    elements = np_arr.tolist()
    assert elements[0] == 1.
    assert np.isnan(elements[1])
    assert elements[2:] == [1., 2., 3.]
    assert np_arr.dtype == np.dtype('float64')
Example #26
def dataframe_to_types(df, preserve_index, columns=None):
    (all_names,
     column_names,
     index_descriptors,
     index_columns,
     columns_to_convert,
     _) = _get_columns_to_convert(df, None, preserve_index, columns)

    types = []
    # If pandas knows the type, skip conversion
    for c in columns_to_convert:
        values = c.values
        if _pandas_api.is_categorical(values):
            type_ = pa.array(c, from_pandas=True).type
        else:
            values, type_ = get_datetimetz_type(values, c.dtype, None)
            type_ = pa.lib._ndarray_to_arrow_type(values, type_)
            if type_ is None:
                type_ = pa.array(c, from_pandas=True).type
        types.append(type_)

    metadata = construct_metadata(df, column_names, index_columns,
                                  index_descriptors, preserve_index, types)

    return all_names, types, metadata
Example #27
def test_sequence_timestamp_from_int_with_unit():
    data = [1]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert str(arr_s[0]) == "Timestamp('1970-01-01 00:00:01')"

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert str(arr_ms[0]) == "Timestamp('1970-01-01 00:00:00.001000')"

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert str(arr_us[0]) == "Timestamp('1970-01-01 00:00:00.000001')"

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert str(arr_ns[0]) == "Timestamp('1970-01-01 00:00:00.000000001')"

    class CustomClass:
        pass

    # Each construction must raise on its own; inside a single
    # pytest.raises block only the first call would ever run.
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=ns)
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=pa.date32())
    with pytest.raises(pa.ArrowException):
        pa.array([1, CustomClass()], type=pa.date64())
Example #28
def test_sequence_timestamp_with_unit():
    data = [
        datetime.datetime(2007, 7, 13, 1, 23, 34, 123456),
    ]

    s = pa.timestamp('s')
    ms = pa.timestamp('ms')
    us = pa.timestamp('us')
    ns = pa.timestamp('ns')

    arr_s = pa.array(data, type=s)
    assert len(arr_s) == 1
    assert arr_s.type == s
    assert arr_s[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                 23, 34, 0)

    arr_ms = pa.array(data, type=ms)
    assert len(arr_ms) == 1
    assert arr_ms.type == ms
    assert arr_ms[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123000)

    arr_us = pa.array(data, type=us)
    assert len(arr_us) == 1
    assert arr_us.type == us
    assert arr_us[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)

    arr_ns = pa.array(data, type=ns)
    assert len(arr_ns) == 1
    assert arr_ns.type == ns
    assert arr_ns[0].as_py() == datetime.datetime(2007, 7, 13, 1,
                                                  23, 34, 123456)
Example #29
def test_decimal_array_with_none_and_nan():
    values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')]
    array = pa.array(values)
    assert array.type == pa.decimal128(4, 3)
    assert array.to_pylist() == values[:2] + [None, None]

    array = pa.array(values, type=pa.decimal128(10, 4))
    assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
Example #30
def test_array_from_pandas_type_cast(self):
    arr = np.arange(10, dtype='int64')

    target_type = pa.int8()

    result = pa.array(arr, type=target_type)
    expected = pa.array(arr.astype('int8'))
    assert result.equals(expected)
Example #32
def _to_arrow_array(cls, scalars):
    return pa.array(scalars)
Example #33
def test_in_memory_table_filter(in_memory_pa_table):
    mask = pa.array([i % 2 == 0 for i in range(len(in_memory_pa_table))])
    table = InMemoryTable(in_memory_pa_table).filter(mask)
    assert table.table == in_memory_pa_table.filter(mask)
    assert isinstance(table, InMemoryTable)
Example #34
def test_chunked_array_mismatch_types():
    with pytest.raises(pa.ArrowInvalid):
        pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])])
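
A hedged sketch (added; not from the original test): with zero chunks there is nothing to infer a type from, so an explicit type is required.

# Empty chunked array with an explicitly declared type
empty = pa.chunked_array([], type=pa.string())
assert empty.type == pa.string()
assert len(empty) == 0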
Example #35
def test_recordbatch_from_struct_array_invalid():
    with pytest.raises(TypeError):
        pa.RecordBatch.from_struct_array(pa.array(range(5)))
Example #36
def simple_ints_table():
    data = [pa.array([-10, -5, 0, 5, 10])]
    return pa.Table.from_arrays(data, names=['some_ints'])
Example #37
def _to_record_batch(df):
    data = []
    names = list(df.columns.values)
    for c in names:
        data.append(pa.array(df[c]))
    return pa.RecordBatch.from_arrays(data, names)
Example #38
def convert_column(col, ty):
    return pa.array(col, from_pandas=True, type=ty)
Example #39
    def setUp(self):
        super(NonStreamingCustomStatsGeneratorTest, self).setUp()
        # Integration tests involving Beam and AMI are challenging to write
        # because Beam PCollections are unordered while the results of adjusted MI
        # depend on the order of the data for small datasets. This test case tests
        # MI with one label which will give a value of 0 regardless of
        # the ordering of elements in the PCollection. The purpose of this test is
        # to ensure that the Mutual Information pipeline is able to handle a
        # variety of input types. Unit tests ensuring correctness of the MI value
        # itself are included in sklearn_mutual_information_test.

        # fa is categorical, fb is numeric, fc is multivalent and fd has null values
        self.tables = [
            pa.Table.from_arrays([
                pa.array([['Red']]),
                pa.array([[1.0]]),
                pa.array([[1, 3, 1]]),
                pa.array([[0.4]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Green']]),
                pa.array([[2.2]]),
                pa.array([[2, 6]]),
                pa.array([[0.4]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Blue']]),
                pa.array([[3.3]]),
                pa.array([[4, 6]]),
                pa.array([[0.3]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Green']]),
                pa.array([[1.3]]),
                pa.array([None]),
                pa.array([[0.2]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Red']]),
                pa.array([[1.2]]),
                pa.array([[1]]),
                pa.array([[0.3]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Blue']]),
                pa.array([[0.5]]),
                pa.array([[3, 2]]),
                pa.array([[0.4]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Blue']]),
                pa.array([[1.3]]),
                pa.array([[1, 4]]),
                pa.array([[1.7]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Green']]),
                pa.array([[2.3]]),
                pa.array([[0]]),
                pa.array([[np.NaN]], type=pa.list_(pa.float64())),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
            pa.Table.from_arrays([
                pa.array([['Green']]),
                pa.array([[0.3]]),
                pa.array([[3]]),
                pa.array([[4.4]]),
                pa.array([['Label']]),
            ], ['fa', 'fb', 'fc', 'fd', 'label_key']),
        ]

        self.schema = text_format.Parse(
            """
        feature {
          name: "fa"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "fb"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "fc"
          type: INT
          value_count: {
            min: 0
            max: 2
          }
        }
        feature {
          name: "fd"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "label_key"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }""", schema_pb2.Schema())
Example #40
def np2arrowArray(x):
    if len(x.shape) > 1:
        x = np.transpose(x, [2,0,1])
        return pa.array([x.tolist()])
    else:
        return pa.array([x.tolist()])
Example #41
    def recurse(obj, mask):
        if isinstance(obj, numpy.ndarray):
            return pyarrow.array(obj, mask=mask)

        elif isinstance(obj, awkward0.array.chunked.ChunkedArray
                        ):  # includes AppendableArray
            raise TypeError(
                "only top-level ChunkedArrays can be converted to Arrow (as RecordBatches)"
            )

        elif isinstance(obj, awkward0.array.indexed.IndexedArray):
            if mask is None:
                return pyarrow.DictionaryArray.from_arrays(
                    obj.index, recurse(obj.content, mask))
            else:
                return recurse(obj.content[obj.index], mask)

        elif isinstance(obj, awkward0.array.indexed.SparseArray):
            return recurse(obj.dense, mask)

        elif isinstance(obj, awkward0.array.jagged.JaggedArray):
            obj = obj.compact()
            if mask is not None:
                mask = obj.tojagged(mask).flatten()
            arrow_type = pyarrow.ListArray
            # 64bit offsets not yet completely golden in arrow
            # if hasattr(pyarrow, 'LargeListArray') and obj.starts.itemsize > 4:
            #     arrow_type = pyarrow.LargeListArray
            return arrow_type.from_arrays(obj.offsets,
                                          recurse(obj.content, mask))

        elif isinstance(obj, awkward0.array.masked.IndexedMaskedArray):
            thismask = obj.boolmask(maskedwhen=True)
            if mask is not None:
                thismask = mask | thismask
            if len(obj.content) == 0:
                content = obj.numpy.empty(len(obj.mask), dtype=obj.DEFAULTTYPE)
            else:
                content = obj.content[obj.mask]
            return recurse(content, thismask)

        elif isinstance(
                obj,
                awkward0.array.masked.MaskedArray):  # includes BitMaskedArray
            thismask = obj.boolmask(maskedwhen=True)
            if mask is not None:
                thismask = mask | thismask
            return recurse(obj.content, thismask)

        elif isinstance(obj, awkward0.array.objects.StringArray):
            if obj.encoding is None and hasattr(pyarrow.BinaryArray,
                                                'from_buffers'):
                arrow_type = pyarrow.BinaryArray
                arrow_offset_type = pyarrow.binary()
                # 64bit offsets not yet completely golden in arrow
                # if hasattr(pyarrow, 'LargeBinaryArray') and obj.starts.itemsize > 4:
                #     arrow_type = pyarrow.LargeBinaryArray
                #     arrow_offset_type = pyarrow.large_binary()
                convert = lambda length, offsets, content: arrow_type.from_buffers(
                    arrow_offset_type, length, [None, offsets, content])
            elif codecs.lookup(obj.encoding) is codecs.lookup(
                    "utf-8") or obj.encoding is None:
                arrow_type = pyarrow.StringArray
                # if hasattr(pyarrow, 'LargeStringArray') and obj.starts.itemsize > 4:
                #     arrow_type = pyarrow.LargeStringArray
                convert = lambda length, offsets, content: arrow_type.from_buffers(
                    length, offsets, content)
            else:
                raise ValueError(
                    "only encoding=None or encoding='utf-8' can be converted to Arrow"
                )

            obj = obj.compact()
            offsets = obj.offsets
            if offsets.dtype != numpy.dtype(numpy.int32):
                offsets = offsets.astype(numpy.int32)

            return convert(
                len(offsets) - 1, pyarrow.py_buffer(offsets),
                pyarrow.py_buffer(obj.content))

        elif isinstance(obj, awkward0.array.objects.ObjectArray):
            # throw away Python object interpretation, which Arrow can't handle while being multilingual
            return recurse(obj.content, mask)

        elif isinstance(obj, awkward0.array.table.Table):
            return pyarrow.StructArray.from_arrays(
                [recurse(x, mask) for x in obj.contents.values()],
                list(obj.contents))

        elif isinstance(obj, awkward0.array.union.UnionArray):
            contents = []
            for i, x in enumerate(obj.contents):
                if mask is None:
                    thismask = None
                else:
                    thistags = (obj.tags == i)
                    thismask = obj.numpy.empty(len(x), dtype=obj.MASKTYPE)
                    thismask[obj.index[thistags]] = mask[
                        thistags]  # hmm... obj.index could have repeats; the Arrow mask in that case would not be well-defined...
                contents.append(recurse(x, thismask))

            return pyarrow.UnionArray.from_dense(
                pyarrow.array(obj.tags.astype(numpy.int8)),
                pyarrow.array(obj.index.astype(numpy.int32)), contents)

        elif isinstance(obj, awkward0.array.virtual.VirtualArray):
            return recurse(obj.array, mask)

        else:
            raise TypeError("cannot convert type {0} to Arrow".format(
                type(obj)))
Example #42
def test_recordbatch_column_sets_private_name():
    # ARROW-6429
    rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0'])
    assert rb[0]._name == 'a0'
Example #43
def test_recordbatch_empty_metadata():
    data = [pa.array(range(5)), pa.array([-10, -5, 0, 5, 10])]

    batch = pa.RecordBatch.from_arrays(data, ['c0', 'c1'])
    assert batch.schema.metadata is None
Example #44
def do_get(self, context, ticket):
    assert self.expected_ticket == ticket.ticket
    data1 = [pa.array([-10, -5, 0, 5, 10], type=pa.int32())]
    table = pa.Table.from_arrays(data1, names=['a'])
    return flight.RecordBatchStream(table)
Example #45
def as_column(arbitrary):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            mask = cudautils.mask_from_devary(arbitrary)
            data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary))

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(np.empty(0,
                                     dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            arbitrary = arbitrary.cast(pa.int8())
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(np.array(arbitrary.buffers()[1]).view(dtype))
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        data = as_column(pa.array(arbitrary, from_pandas=True))

    elif np.isscalar(arbitrary):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    else:
        data = as_column(pa.array(arbitrary))

    return data
Example #46
def do_get(self, context, ticket):
    data = [pa.array([-10, -5, 0, 5, 10])]
    table = pa.Table.from_arrays(data, names=['a'])
    return flight.GeneratorStream(table.schema, self.number_batches(table))
Example #47
def test_concat_tables_with_promotion_error():
    t1 = pa.Table.from_arrays([pa.array([1, 2], type=pa.int64())], ["f"])
    t2 = pa.Table.from_arrays([pa.array([1, 2], type=pa.float32())], ["f"])

    with pytest.raises(pa.ArrowInvalid):
        pa.concat_tables([t1, t2], promote=True)
Example #48
def test_is_dictionary():
    assert types.is_dictionary(
        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])))
    assert not types.is_dictionary(pa.int32())
Example #49
def test_table_column_sets_private_name():
    # ARROW-6429
    t = pa.table([pa.array([1, 2, 3, 4])], names=['a0'])
    assert t[0]._name == 'a0'
Example #50
        def method(self, other):
            is_arithmetic = op.__name__ in ops.ARITHMETIC_BINOPS
            pandas_only = cls._pandas_only()

            is_other_array = False
            if not is_scalar(other):
                is_other_array = True
                other = np.asarray(other)

            self_is_na = self.isna()
            other_is_na = pd.isna(other)
            mask = self_is_na | other_is_na

            if pa is None or pandas_only:
                if is_arithmetic:
                    ret = np.empty(self.shape, dtype=object)
                else:
                    ret = np.zeros(self.shape, dtype=bool)
                valid = ~mask
                arr = self._arrow_array.to_pandas().to_numpy() \
                    if self._use_arrow else self._ndarray
                o = other[valid] if is_other_array else other
                ret[valid] = op(arr[valid], o)
                if is_arithmetic:
                    return ArrowStringArray(ret)
                else:
                    return pd.arrays.BooleanArray(ret, mask)

            chunks = []
            mask_chunks = []
            start = 0
            for chunk_array in self._arrow_array.chunks:
                chunk_array = np.asarray(chunk_array.to_pandas())
                end = start + len(chunk_array)
                chunk_mask = mask[start:end]
                chunk_valid = ~chunk_mask

                if is_arithmetic:
                    result = np.empty(chunk_array.shape, dtype=object)
                else:
                    result = np.zeros(chunk_array.shape, dtype=bool)

                chunk_other = other
                if is_other_array:
                    chunk_other = other[start:end]
                    chunk_other = chunk_other[chunk_valid]

                # compute only where both operands are non-null
                result[chunk_valid] = op(chunk_array[chunk_valid], chunk_other)

                if is_arithmetic:
                    chunks.append(
                        pa.array(result, type=pa.string(), from_pandas=True))
                else:
                    chunks.append(result)
                    mask_chunks.append(chunk_mask)

            if is_arithmetic:
                return ArrowStringArray(pa.chunked_array(chunks))
            else:
                return pd.arrays.BooleanArray(np.concatenate(chunks),
                                              np.concatenate(mask_chunks))
Example #51
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([1, 2])
    schema = pa.schema([pa.field('f0', pa.list_(pa.utf8()))])
    with pytest.raises(NotImplementedError):
        pa.record_batch([arr], schema=schema)
Example #52
def _to_arrow_array(cls, scalars):
    return pa.array(scalars).cast(pa.string())
Example #53
    def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)

        if is_integer(key):
            if not is_scalar(value):
                raise ValueError("Must pass scalars with scalar indexer")
            elif isna(value):
                value = None
            elif not isinstance(value, str):
                raise ValueError("Scalar must be NA or str")

            # Slice data and insert in-between
            new_data = [
                *self._data[0:key].chunks,
                pa.array([value], type=pa.string()),
                *self._data[(key + 1):].chunks,
            ]
            self._data = pa.chunked_array(new_data)
        else:
            # Convert to integer indices and iteratively assign.
            # TODO: Make a faster variant of this in Arrow upstream.
            #       This is probably extremely slow.

            # Convert all possible input key types to an array of integers
            if is_bool_dtype(key):
                # TODO(ARROW-9430): Directly support setitem(booleans)
                key_array = np.argwhere(key).flatten()
            elif isinstance(key, slice):
                key_array = np.array(range(len(self))[key])
            else:
                # TODO(ARROW-9431): Directly support setitem(integers)
                key_array = np.asanyarray(key)

            if is_scalar(value):
                value = np.broadcast_to(value, len(key_array))
            else:
                value = np.asarray(value)

            if len(key_array) != len(value):
                raise ValueError("Length of indexer and values mismatch")

            for k, v in zip(key_array, value):
                self[k] = v
Example #54
def test_in_memory_table_append_column(in_memory_pa_table):
    field_ = "new_field"
    column = pa.array([i for i in range(len(in_memory_pa_table))])
    table = InMemoryTable(in_memory_pa_table).append_column(field_, column)
    assert table.table == in_memory_pa_table.append_column(field_, column)
    assert isinstance(table, InMemoryTable)
Example #55
  def test_mi_regression_with_float_label_and_numeric_features(self):
    label_array = pa.array([
        [0.1], [0.2], [0.8], [0.7], [0.2], [0.3], [0.9],
        [0.4], [0.1], [0.0], [0.4], [0.6], [0.4], [0.8]])
    # Random floats that do not map onto the label
    terrible_feat_array = pa.array([
        [0.4], [0.1], [0.4], [0.4], [0.8], [0.7], [0.2],
        [0.1], [0.0], [0.4], [0.8], [0.2], [0.5], [0.1]])
    batch = pa.RecordBatch.from_arrays(
        [label_array, label_array, terrible_feat_array],
        ["label_key", "perfect_feature", "terrible_feature"])

    schema = text_format.Parse(
        """
        feature {
          name: "perfect_feature"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "terrible_feature"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "label_key"
          type: FLOAT
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())

    expected = text_format.Parse(
        """
        features {
          path {
            step: "perfect_feature"
          }
          custom_stats {
            name: "sklearn_adjusted_mutual_information"
            num: 1.0096965
          }
          custom_stats {
            name: "sklearn_mutual_information"
            num: 1.1622766
          }
        }
        features {
          path {
            step: "terrible_feature"
          }
          custom_stats {
            name: "sklearn_adjusted_mutual_information"
            num: 0.0211485
          }
          custom_stats {
            name: "sklearn_mutual_information"
            num: 0.0211485
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
    self._assert_mi_output_equal(batch, expected, schema,
                                 types.FeaturePath(["label_key"]))
Example #56
    def take(self,
             indices: Sequence[int],
             allow_fill: bool = False,
             fill_value: Any = None) -> "ExtensionArray":
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we have gotten rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            indices_array = indices

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))
Example #57
def test_date_time_types():
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2000-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    t7_us = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value
    data7_us = np.array([start, start + 1000, start + 2000],
                        dtype='int64') // 1000
    a7_us = pa.array(data7_us, type=t7_us)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' to 'timestamp[us]'
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.0')

    # date64 as date32
    # time32[s] to time32[ms]
    # 'timestamp[ns]' is saved as INT96 timestamp
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table,
                     expected=expected,
                     version='2.0',
                     use_deprecated_int96_timestamps=True)

    # Check that setting flavor to 'spark' uses int96 timestamps
    _check_roundtrip(table, expected=expected, version='2.0', flavor='spark')

    # Unsupported stuff
    def _assert_unsupported(array):
        table = pa.Table.from_arrays([array], ['unsupported'])
        buf = io.BytesIO()

        with pytest.raises(NotImplementedError):
            _write_table(table, buf, version="2.0")

    t7 = pa.time64('ns')
    a7 = pa.array(data4.astype('int64'), type=t7)

    _assert_unsupported(a7)
Example #58
def _from_sequence(cls, scalars, dtype=None, copy=False):
    cls._chk_pyarrow_available()
    # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value
    scalars = lib.ensure_string_array(scalars, copy=False)
    return cls(pa.array(scalars, type=pa.string(), from_pandas=True))
Example #59
def test_sum(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum() == 10
Example #60
  def test_mi_classif_with_categorical_all_unique_labels(self):
    label_array = pa.array([[0], [2], [0], [1], [2], [1], [1], [0], [2], [1],
                            [0]])
    # A categorical feature that maps directly onto the label.
    perfect_feat_array = pa.array([["Red"], ["Blue"], ["Red"], ["Green"],
                                   ["Blue"], ["Green"], ["Green"], ["Red"],
                                   ["Blue"], ["Green"], ["Red"]])
    # A categorical feature that has all values unique.
    unique_feat_array = pa.array([["Red1"], ["Red2"], ["Red3"], ["Red4"],
                                  ["Red5"], ["Red6"], ["Red7"], ["Red8"],
                                  ["Red9"], ["Red10"], ["Red11"]])
    batch = pa.RecordBatch.from_arrays(
        [label_array, perfect_feat_array, unique_feat_array],
        ["label_key", "perfect_feature", "unique_feat_array"])

    schema = text_format.Parse(
        """
        feature {
          name: "label_key"
          type: INT
          int_domain {
            is_categorical: false
          }
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "perfect_feature"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }
        feature {
          name: "unique_feat_array"
          type: BYTES
          shape {
            dim {
              size: 1
            }
          }
        }
        """, schema_pb2.Schema())
    # Note that the MI is different from above since the label is declared
    # as a continuous feature.
    expected = text_format.Parse(
        """
        features {
          path {
            step: "perfect_feature"
          }
          custom_stats {
            name: 'sklearn_adjusted_mutual_information'
            num: 1.7319986
          }
          custom_stats {
            name: 'sklearn_mutual_information'
            num: 1.7319986
          }
        }""", statistics_pb2.DatasetFeatureStatistics())
    self._assert_mi_output_equal(batch, expected, schema,
                                 types.FeaturePath(["label_key"]))