Example #1
File: test_schema.py Project: rok/arrow
def test_schema():
    fields = [
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string()),
        pa.field('baz', pa.list_(pa.int8()))
    ]
    sch = pa.schema(fields)

    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]

    assert len(sch) == 3
    assert sch[0].name == 'foo'
    assert sch[0].type == fields[0].type
    assert sch.field_by_name('foo').name == 'foo'
    assert sch.field_by_name('foo').type == fields[0].type

    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([None])
Example #2
def test_table_unsafe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])

    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        table.cast(target_schema)

    casted_table = table.cast(target_schema, safe=False)
    assert casted_table.equals(expected_table)
Example #3
def test_table_safe_casting():
    data = [
        pa.array(range(5), type=pa.int64()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int32()),
        pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    table = pa.Table.from_arrays(data, names=tuple('abcd'))

    expected_data = [
        pa.array(range(5), type=pa.int32()),
        pa.array([-10, -5, 0, 5, 10], type=pa.int16()),
        pa.array([1, 2, 3, 4, 5], type=pa.int64()),
        pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string())
    ]
    expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd'))

    target_schema = pa.schema([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int16()),
        pa.field('c', pa.int64()),
        pa.field('d', pa.string())
    ])
    casted_table = table.cast(target_schema)

    assert casted_table.equals(expected_table)
Example #4
File: test_array.py Project: rok/arrow
def test_cast_from_null():
    in_data = [None] * 3
    in_type = pa.null()
    out_types = [
        pa.null(),
        pa.uint8(),
        pa.float16(),
        pa.utf8(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int16()),
        pa.decimal128(19, 4),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.list_(pa.int8())),
                   pa.field('c', pa.string())]),
        ]
    for out_type in out_types:
        _check_cast_case((in_data, in_type, in_data, out_type))

    out_types = [
        pa.dictionary(pa.int32(), pa.string()),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
        ]
    in_arr = pa.array(in_data, type=pa.null())
    for out_type in out_types:
        with pytest.raises(NotImplementedError):
            in_arr.cast(out_type)
Example #5
File: test_orc.py Project: dremio/arrow
def test_orcfile_empty():
    from pyarrow import orc
    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0
    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert schema == expected_schema
Example #6
def test_type_schema_pickling():
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.struct([
            pa.field('a', 'int8'),
            pa.field('b', 'string')
        ]),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal(12, 2),
        pa.field('a', 'string', metadata={b'foo': b'bar'})
    ]

    for val in cases:
        roundtripped = pickle.loads(pickle.dumps(val))
        assert val == roundtripped

    fields = []
    for i, f in enumerate(cases):
        if isinstance(f, pa.Field):
            fields.append(f)
        else:
            fields.append(pa.field('_f{}'.format(i), f))

    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    roundtripped = pickle.loads(pickle.dumps(schema))
    assert schema == roundtripped
Example #7
    def test_custom_nulls(self):
        # Infer nulls with custom values
        opts = ConvertOptions(null_values=['Xxx', 'Zzz'])
        rows = b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n"
        table = self.read_bytes(rows, convert_options=opts)
        schema = pa.schema([('a', pa.null()),
                            ('b', pa.string()),
                            ('c', pa.string()),
                            ('d', pa.int64())])
        assert table.schema == schema
        assert table.to_pydict() == {
            'a': [None, None],
            'b': [u"Xxx", u"#N/A"],
            'c': [u"1", u""],
            'd': [2, None],
            }

        opts = ConvertOptions(null_values=[])
        rows = b"a,b\n#N/A,\n"
        table = self.read_bytes(rows, convert_options=opts)
        schema = pa.schema([('a', pa.string()),
                            ('b', pa.string())])
        assert table.schema == schema
        assert table.to_pydict() == {
            'a': [u"#N/A"],
            'b': [u""],
            }
Example #8
def test_sequence_utf8_to_unicode():
    # ARROW-1225
    data = [b'foo', None, b'bar']
    arr = pa.array(data, type=pa.string())
    assert arr[0].as_py() == u'foo'

    # test a non-utf8 unicode string
    val = (u'mañana').encode('utf-16-le')
    with pytest.raises(pa.ArrowInvalid):
        pa.array([val], type=pa.string())
Example #9
def test_is_binary_string():
    assert types.is_binary(pa.binary())
    assert not types.is_binary(pa.string())

    assert types.is_string(pa.string())
    assert types.is_unicode(pa.string())
    assert not types.is_string(pa.binary())

    assert types.is_fixed_size_binary(pa.binary(5))
    assert not types.is_fixed_size_binary(pa.binary())
Example #10
File: test_csv.py Project: wesm/arrow
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
Example #11
File: test_schema.py Project: rok/arrow
def test_schema_equals_propagates_check_metadata():
    # ARROW-4088
    schema1 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string())
    ])
    schema2 = pa.schema([
        pa.field('foo', pa.int32()),
        pa.field('bar', pa.string(), metadata={'a': 'alpha'}),
    ])
    assert not schema1.equals(schema2)
    assert schema1.equals(schema2, check_metadata=False)
Example #12
def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)
Example #13
def test_array_mixed_unicode_bytes():
    values = [u'qux', b'foo', bytearray(b'barz')]
    b_values = [b'qux', b'foo', b'barz']
    u_values = [u'qux', u'foo', u'barz']

    arr = pa.array(values)
    expected = pa.array(b_values, type=pa.binary())
    assert arr.type == pa.binary()
    assert arr.equals(expected)

    arr = pa.array(values, type=pa.string())
    expected = pa.array(u_values, type=pa.string())
    assert arr.type == pa.string()
    assert arr.equals(expected)
Example #14
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
Example #15
    def test_unicode(self):
        data = [u("foo"), u("bar"), None, u("arrow")]
        arr = pyarrow.from_pylist(data)
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == pyarrow.string()
        assert arr.to_pylist() == [u("foo"), u("bar"), None, u("arrow")]
Example #16
    def test_field(self):
        t = arrow.string()
        f = arrow.field('foo', t)

        assert f.name == 'foo'
        assert f.type is t
        assert repr(f) == "Field('foo', type=string)"
Example #17
File: test_types.py Project: rok/arrow
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False
Example #18
def test_struct_from_tuples():
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])

    data = [(5, 'foo', True),
            (6, 'bar', False)]
    expected = [{'a': 5, 'b': 'foo', 'c': True},
                {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data, type=ty)

    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray, type=ty)
    assert arr.to_pylist() == expected

    assert arr.equals(arr2)

    # With omitted values
    data = [(5, 'foo', None),
            None,
            (6, None, False)]
    expected = [{'a': 5, 'b': 'foo', 'c': None},
                None,
                {'a': 6, 'b': None, 'c': False}]
    arr = pa.array(data, type=ty)
    assert arr.to_pylist() == expected

    # Invalid tuple size
    for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]:
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([tup], type=ty)
Example #19
def test_is_union():
    for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]:
        assert types.is_union(pa.union([pa.field('a', pa.int32()),
                                        pa.field('b', pa.int8()),
                                        pa.field('c', pa.string())],
                                       mode=mode))
    assert not types.is_union(pa.list_(pa.int32()))
Example #20
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]

    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that supported type conversions don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
Example #21
def test_sequence_unicode():
    data = [u'foo', u'bar', None, u'mañana']
    arr = pa.array(data)
    assert len(arr) == 4
    assert arr.null_count == 1
    assert arr.type == pa.string()
    assert arr.to_pylist() == data
Example #22
    def test_unicode(self):
        data = [u'foo', u'bar', None, u'mañana']
        arr = pyarrow.from_pylist(data)
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == pyarrow.string()
        assert arr.to_pylist() == data
Example #23
def test_struct_from_dicts_inference():
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
Example #24
def test_type_list():
    value_type = pa.int32()
    list_type = pa.list_(value_type)
    assert str(list_type) == 'list<item: int32>'

    field = pa.field('my_item', pa.string())
    l2 = pa.list_(field)
    assert str(l2) == 'list<my_item: string>'
Example #25
    def test_infer_lists(self):
        data = OrderedDict([
            ('nan_ints', [[None, 1], [2, 3]]),
            ('ints', [[0, 1], [2, 3]]),
            ('strs', [[None, u'b'], [u'c', u'd']]),
            ('nested_strs', [[[None, u'b'], [u'c', u'd']], None])
        ])
        df = pd.DataFrame(data)

        expected_schema = pa.schema([
            pa.field('nan_ints', pa.list_(pa.int64())),
            pa.field('ints', pa.list_(pa.int64())),
            pa.field('strs', pa.list_(pa.string())),
            pa.field('nested_strs', pa.list_(pa.list_(pa.string())))
        ])

        self._check_pandas_roundtrip(df, expected_schema=expected_schema)
Example #26
    def test_unicode(self):
        repeats = 1000
        values = [u'foo', None, u'bar', u'mañana', np.nan]
        df = pd.DataFrame({'strings': values * repeats})
        field = pa.field('strings', pa.string())
        schema = pa.schema([field])

        self._check_pandas_roundtrip(df, expected_schema=schema)
Example #27
def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.type == pa.string()
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])

    data = pa.chunked_array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])
    assert isinstance(data.chunks, list)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks)
    assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks())
    assert len(data.chunks) == 3
Example #28
def test_struct_from_mixed_sequence():
    # It is forbidden to mix dicts and tuples when initializing a struct array
    ty = pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.string()),
                    pa.field('c', pa.bool_())])
    data = [(5, 'foo', True),
            {'a': 6, 'b': 'bar', 'c': False}]
    with pytest.raises(TypeError):
        pa.array(data, type=ty)
Example #29
File: test_schema.py Project: rok/arrow
def test_schema_from_tuples():
    fields = [
        ('foo', pa.int32()),
        ('bar', pa.string()),
        ('baz', pa.list_(pa.int8())),
    ]
    sch = pa.schema(fields)
    assert sch.names == ['foo', 'bar', 'baz']
    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
    assert len(sch) == 3
    assert repr(sch) == """\
foo: int32
bar: string
baz: list<item: int8>
  child 0, item: int8"""

    with pytest.raises(TypeError):
        pa.schema([('foo', None)])
Example #30
def test_cast_binary_to_utf8():
    binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
    utf8_arr = binary_arr.cast(pa.utf8())
    expected = pa.array(['foo', 'bar', 'baz'], type=pa.utf8())

    assert utf8_arr.equals(expected)

    non_utf8_values = [(u'mañana').encode('utf-16-le')]
    non_utf8_binary = pa.array(non_utf8_values)
    assert non_utf8_binary.type == pa.binary()
    with pytest.raises(ValueError):
        non_utf8_binary.cast(pa.string())

    non_utf8_all_null = pa.array(non_utf8_values, mask=np.array([True]),
                                 type=pa.binary())
    # No error
    casted = non_utf8_all_null.cast(pa.string())
    assert casted.null_count == 1
Example #31
"""Conversion between different types of arrays"""
import numpy as np
import pyarrow as pa
import vaex.utils

supported_arrow_array_types = (pa.Array, pa.ChunkedArray)
supported_array_types = (np.ndarray, ) + supported_arrow_array_types

string_types = [pa.string(), pa.large_string()]


def full(n, value, dtype):
    from .datatype import DataType
    dtype = DataType(dtype)
    values = np.full(n, value, dtype=dtype.numpy)
    if dtype.is_arrow:
        return pa.array(values)
    else:
        return values


def is_arrow_array(ar):
    return isinstance(ar, supported_arrow_array_types)


def is_numpy_array(ar):
    return isinstance(ar, np.ndarray)


def filter(ar, boolean_mask):
    if isinstance(ar, supported_arrow_array_types):
        ...  # body truncated in the original listing
Example #32
    tf.io.FixedLenFeature((1, ), tf.int64, default_value=[0]),
    'image/class/text':
    tf.io.FixedLenFeature((1, ), tf.string, default_value=[b'']),
    'image/format':
    tf.io.FixedLenFeature((1, ), tf.string, default_value=[b'']),
    'image/filename':
    tf.io.FixedLenFeature((1, ), tf.string, default_value=[b'']),
    'image/encoded':
    tf.io.FixedLenFeature((1, ), tf.string, default_value=[b''])
}

parquet_schema = {
    'image/height': pa.int64(),
    'image/width': pa.int64(),
    'image/channels': pa.int64(),
    'image/colorspace': pa.string(),
    'image/class/label': pa.int64(),
    'image/class/text': pa.string(),
    'image/format': pa.string(),
    'image/filename': pa.string(),
    'image/encoded': pa.binary()
}


def reformat_row(row):
    import pyarrow as pa

    out_row = {}
    for key, val in row.items():
        out_type = parquet_schema[key]
        np_val = val.numpy()
Example #33
    def test_try_incompatible_extension_type(self):
        arr = pa.array(TypedSequence(["foo", "bar"], try_type=Array2DExtensionType((1, 3), "int64")))
        self.assertEqual(arr.type, pa.string())
Example #34
def lambda_handler(event, _):
    """Lambda entry point"""
    source_path = 's3://' + event['bucket'] + '/' + event['source_key']
    target_path = 's3://' + event['bucket'] + '/' + event['target_key']

    print('Source: ' + source_path)
    print('Target: ' + target_path)

    s3_client = boto3.client('s3')
    found = False
    # Ensure file exists before we actually run the conversion
    while not found:
        try:
            s3_client.head_object(Bucket=event['bucket'],
                                  Key=event['source_key'])
        except botocore.exceptions.ClientError:
            print('Waiting for: "' + source_path + '" to exist')
            time.sleep(10)
        else:
            print('Found: "' + source_path + '"')
            found = True

    s3fs_source = s3fs.S3FileSystem()
    s3fs_target = s3fs.S3FileSystem()

    with s3fs_source.open(source_path, 'rb') as source_file, \
            s3fs_target.open(target_path, 'wb') as target_file:
        # Open a stream reader for the csv file
        csv_stream = pd.read_csv(source_file,
                                 skiprows=0,
                                 compression='gzip',
                                 dtype=object,
                                 iterator=True,
                                 chunksize=100000)

        parquet_writer = None
        for i, chunk in enumerate(csv_stream):
            print('Reading chunk: ' + str(i))

            # First chunk: infer the schema and set up the writer
            if not parquet_writer:
                # Fetch columns from the header, hardcoding every type to string
                columns = [
                    pa.field(column, pa.string()) for column in chunk.columns
                ]

                # Generate schema from columns
                parquet_schema = pa.schema(columns)

                # Open a writer to S3
                parquet_writer = pq.ParquetWriter(target_file,
                                                  parquet_schema,
                                                  compression='snappy')

            # Convert the chunk to an Arrow table
            table = pa.Table.from_pandas(chunk, preserve_index=False)

            print('Writing chunk: ' + str(i))
            parquet_writer.write_table(table)

        parquet_writer.close()
        print('Done processing "' + source_path + '"')

    return event['target_key']
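
The handler above implements a streaming CSV-to-Parquet pattern: the schema is fixed once from the first chunk, and every subsequent chunk is appended through the same ParquetWriter. A minimal local sketch of that pattern (input.csv and output.parquet are hypothetical paths):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

writer = None
for chunk in pd.read_csv('input.csv', dtype=object, chunksize=100000):
    table = pa.Table.from_pandas(chunk, preserve_index=False)
    if writer is None:
        # Schema is taken from the first chunk, as in the handler above.
        writer = pq.ParquetWriter('output.parquet', table.schema,
                                  compression='snappy')
    writer.write_table(table)
if writer is not None:
    writer.close()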
Example #35
    def test_string(self):
        data = ['foo', b'bar', None, 'arrow']
        arr = pyarrow.from_pylist(data)
        assert len(arr) == 4
        assert arr.null_count == 1
        assert arr.type == pyarrow.string()
Example #36
def test_str_length(array, expected, offset):
    array = pa.array(array, pa.string())[offset:]
    np.testing.assert_array_equal(
        str_length(NumbaStringArray.make(array)),
        np.asarray(expected[offset:], dtype=np.int32),
    )
Example #37
def test_is_datetime():
    assert is_datetime(pyarrow.timestamp("us", tz=None))
    assert not is_datetime(pyarrow.timestamp("ms", tz=None))
    assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
    assert not is_datetime(pyarrow.timestamp("ns", tz="UTC"))
    assert not is_datetime(pyarrow.string())
Example #38
import pyarrow as pa

PQ_SCHEMAS = dict()

# site_visits
fields = [
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('site_url', pa.string(), nullable=False),
    pa.field('site_rank', pa.uint32())
]
PQ_SCHEMAS['site_visits'] = pa.schema(fields)

# flash_cookies
fields = [
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.uint32(), nullable=False),
    pa.field('domain', pa.string()),
    pa.field('filename', pa.string()),
    pa.field('local_path', pa.string()),
    pa.field('key', pa.string()),
    pa.field('content', pa.string())
]
PQ_SCHEMAS['flash_cookies'] = pa.schema(fields)

# crawl_history
fields = [
    pa.field('crawl_id', pa.uint32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
Example #39
def test_sql(redshift_table, postgresql_table, mysql_table,
             databases_parameters, db_type):
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(connection=f"aws-data-wrangler-{db_type}",
                                   echo=False)
    index = True if engine.name == "redshift" else False
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(con=engine,
                                  table=table,
                                  schema=schema,
                                  index_col="index")
        assert df.shape == (3, 1)
Example #40
def _text_replace_case_sensitive(data: pa.Array, pat: str, repl: str,
                                 max_repl: int) -> pa.Array:
    """
    Replace occurrences of ``pat`` with ``repl`` in the Series/Index with some other string. For every
    row, only the first ``max_repl`` replacements will be performed. If ``max_repl = -1`` we consider that
    we have no limit for the number of replacements.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """

    # Convert to UTF-8 bytes
    pat_bytes: bytes = pat.encode()
    repl_bytes: bytes = repl.encode()

    offsets_buffer, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = np.empty(0, dtype=np.uint8)
    else:
        valid_buffer = _buffer_to_view(data.buffers()[0])

    if len(pat) > 0:
        output_t = _text_replace_case_sensitive_numba(
            len(data),
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            pat_bytes,
            repl_bytes,
            max_repl,
        )
    else:
        output_t = _text_replace_case_sensitive_empty_pattern(
            len(data),
            valid_buffer,
            data.offset,
            offsets_buffer,
            data_buffer,
            repl_bytes,
            max_repl,
        )

    output_offsets, output_buffer = output_t

    if data.null_count == 0:
        output_valid = None
    else:
        output_valid = data.buffers()[0].slice(data.offset // 8)
        if data.offset % 8 != 0:
            output_valid = shift_unaligned_bitmap(output_valid,
                                                  data.offset % 8, len(data))

    buffers = [
        output_valid,
        pa.py_buffer(output_offsets),
        pa.py_buffer(output_buffer)
    ]
    return pa.Array.from_buffers(pa.string(), len(data), buffers,
                                 data.null_count)
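
The replacement semantics documented above can be checked against a tiny pure-Python reference (a sketch for intuition only; the function itself works on Arrow buffers through numba kernels, and replace_reference is a hypothetical name):

import pyarrow as pa

def replace_reference(data, pat, repl, max_repl):
    # Per row, replace at most max_repl occurrences of pat with repl;
    # max_repl == -1 means no limit (str.replace treats -1 the same way).
    out = [None if v is None else v.replace(pat, repl, max_repl)
           for v in data.to_pylist()]
    return pa.array(out, type=pa.string())

arr = pa.array(['aaa', None, 'banana'], type=pa.string())
assert replace_reference(arr, 'a', 'X', 2).to_pylist() == ['XXa', None, 'bXnXna']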
Example #41
def test_type_string():
    t = pa.string()
    assert str(t) == 'string'
Example #42
    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.

        Returns
        -------
        None
        """
        key = check_array_indexer(self, key)

        if is_integer(key):
            key = cast(int, key)

            if not is_scalar(value):
                raise ValueError("Must pass scalars with scalar indexer")
            elif isna(value):
                value = None
            elif not isinstance(value, str):
                raise ValueError("Scalar must be NA or str")

            # Slice data and insert in-between
            new_data = [
                *self._data[0:key].chunks,
                pa.array([value], type=pa.string()),
                *self._data[(key + 1):].chunks,
            ]
            self._data = pa.chunked_array(new_data)
        else:
            # Convert to integer indices and iteratively assign.
            # TODO: Make a faster variant of this in Arrow upstream.
            #       This is probably extremely slow.

            # Convert all possible input key types to an array of integers
            if isinstance(key, slice):
                key_array = np.array(range(len(self))[key])
            elif is_bool_dtype(key):
                # TODO(ARROW-9430): Directly support setitem(booleans)
                key_array = np.argwhere(key).flatten()
            else:
                # TODO(ARROW-9431): Directly support setitem(integers)
                key_array = np.asanyarray(key)

            if is_scalar(value):
                value = np.broadcast_to(value, len(key_array))
            else:
                value = np.asarray(value)

            if len(key_array) != len(value):
                raise ValueError("Length of indexer and values mismatch")

            for k, v in zip(key_array, value):
                self[k] = v
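
The scalar branch above splices a one-element array between slices of the backing chunked array. The same trick can be shown directly on a plain pa.ChunkedArray (a standalone sketch; the variable names are illustrative):

import pyarrow as pa

data = pa.chunked_array([['a', 'b'], ['c', 'd']], type=pa.string())
key, value = 2, 'Z'

# Everything before key, the new value, everything after key.
data = pa.chunked_array([
    *data[0:key].chunks,
    pa.array([value], type=pa.string()),
    *data[(key + 1):].chunks,
])
assert data.to_pylist() == ['a', 'b', 'Z', 'd']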
Example #43
    def __init__(self, expressions, dtype=None, shape=None, fill_value=None):
        self.is_masked = any([e.is_masked for e in expressions])
        self.fill_value = fill_value
        if self.is_masked and fill_value is None:
            for expression in expressions:
                if expression.is_masked:
                    try:
                        # fast path
                        self.fill_value = expression[0:1].fill_value
                        break
                    except:  # noqa
                        # slower path (we have to evaluate everything)
                        self.fill_value = expression.values.fill_value
                        break
            else:
                raise ValueError(
                    'Concatenating expressions with masked values, but no fill value is found'
                )
        if dtype is None:
            dtypes = [e.dtype for e in expressions]

            any_strings = any([is_string_type(dtype) for dtype in dtypes])
            if any_strings:
                # TODO: how do we know it should not be large_string?
                self.dtype = pa.string()
            else:
                # np.datetime64/timedelta64 and find_common_type don't mix very well
                if all([dtype == 'datetime64' for dtype in dtypes]):
                    self.dtype = dtypes[0]
                elif all([dtype == 'timedelta64' for dtype in dtypes]):
                    self.dtype = dtypes[0]
                else:
                    # find_common_type doesn't always behave well
                    if all([dtype == dtypes[0] for dtype in dtypes]):
                        self.dtype = dtypes[0]
                    # strings are also done manually
                    if any([dtype.kind in 'SU' for dtype in dtypes]):
                        if all([dtype.kind in 'SU' for dtype in dtypes]):
                            index = np.argmax(
                                [dtype.itemsize for dtype in dtypes])
                            self.dtype = dtypes[index]
                        else:
                            index = np.argmax([
                                df.columns[self.column_name].astype(
                                    'O').astype('U').dtype.itemsize
                                for df in dfs
                            ])
                            self.dtype = dfs[index].columns[
                                self.column_name].astype('O').astype('U').dtype
                    else:
                        self.dtype = np.find_common_type(
                            [k.numpy for k in dtypes], [])
                    logger.debug("common type for %r is %r", dtypes,
                                 self.dtype)
            # make sure all expression are the same type
            self.expressions = [
                e if vaex.array_types.same_type(e.dtype, self.dtype) else
                e.astype(self.dtype) for e in expressions
            ]
        else:
            # if dtype is given, we assume every expression/column is the same dtype
            self.dtype = dtype
            self.expressions = expressions[:]
        if shape is not None:
            self.shape = (len(self), ) + shape
        else:
            self.shape = (len(self), ) + self.expressions[0].evaluate(
                0, 1, array_type='numpy', parallel=False).shape[1:]
            for i in range(1, len(self.expressions)):
                expression = self.expressions[i]
                shape_i = (len(self), ) + expressions[i].evaluate(
                    0, 1, array_type='numpy', parallel=False).shape[1:]
                if self.shape != shape_i:
                    raise ValueError(
                        "shape of of expression %s, array index 0, is %r and is incompatible with the shape of the same column of array index %d, %r"
                        % (self.expressions[0], self.shape, i, shape_i))
Example #44
    assert result2.equals(arr)


def test_cast_date64_to_int():
    arr = pa.array(np.array([0, 1, 2], dtype='int64'), type=pa.date64())
    expected = pa.array([0, 1, 2], type='i8')

    result = arr.cast('i8')

    assert result.equals(expected)


@pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize('narr', [
    np.arange(10, dtype=np.int64),
Example #45
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_uint64_from_py_over_range():
    arr = pa.array([2 ** 63], type=pa.uint64())
    expected = pa.array(np.array([2 ** 63], dtype='u8'))
Example #46
    def setUp(self):
        # Reduce the size of thread pools. Without this, test execution may
        # fail in environments with a limited amount of resources.
        filebasedsource.MAX_NUM_THREADS_FOR_SIZE_ESTIMATION = 2
        self.temp_dir = tempfile.mkdtemp()

        self.RECORDS = [{
            'name': 'Thomas',
            'favorite_number': 1,
            'favorite_color': 'blue'
        }, {
            'name': 'Henry',
            'favorite_number': 3,
            'favorite_color': 'green'
        }, {
            'name': 'Toby',
            'favorite_number': 7,
            'favorite_color': 'brown'
        }, {
            'name': 'Gordon',
            'favorite_number': 4,
            'favorite_color': 'blue'
        }, {
            'name': 'Emily',
            'favorite_number': -1,
            'favorite_color': 'Red'
        }, {
            'name': 'Percy',
            'favorite_number': 6,
            'favorite_color': 'Green'
        }, {
            'name': 'Peter',
            'favorite_number': 3,
            'favorite_color': None
        }]

        self.SCHEMA = pa.schema([('name', pa.string(), False),
                                 ('favorite_number', pa.int64(), False),
                                 ('favorite_color', pa.string())])

        self.SCHEMA96 = pa.schema([('name', pa.string(), False),
                                   ('favorite_number', pa.timestamp('ns'),
                                    False), ('favorite_color', pa.string())])

        self.RECORDS_NESTED = [{
            'items': [
                {
                    'name': 'Thomas',
                    'favorite_number': 1,
                    'favorite_color': 'blue'
                },
                {
                    'name': 'Henry',
                    'favorite_number': 3,
                    'favorite_color': 'green'
                },
            ]
        }, {
            'items': [
                {
                    'name': 'Toby',
                    'favorite_number': 7,
                    'favorite_color': 'brown'
                },
            ]
        }]

        self.SCHEMA_NESTED = pa.schema([
            ('items',
             pa.list_(
                 pa.struct([('name', pa.string(), False),
                            ('favorite_number', pa.int64(), False),
                            ('favorite_color', pa.string())])))
        ])
Example #47
        "FLOAT64",
        pyarrow.float64().id:
        "FLOAT64",
        pyarrow.time32("ms").id:
        "TIME",
        pyarrow.time64("ns").id:
        "TIME",
        pyarrow.timestamp("ns").id:
        "TIMESTAMP",
        pyarrow.date32().id:
        "DATE",
        pyarrow.date64().id:
        "DATETIME",  # because millisecond resolution
        pyarrow.binary().id:
        "BYTES",
        pyarrow.string().id:
        "STRING",  # also alias for pyarrow.utf8()
        pyarrow.decimal128(38, scale=9).id:
        "NUMERIC",
        # The exact scale and precision of the decimal are not important, as
        # only the type ID matters, and it's the same for all decimal128
        # instances.
    }

else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO_COVER


def bq_to_arrow_struct_data_type(field):
    arrow_fields = []
    for subfield in field.fields:
Example #48
    def infer_schema(self, data):
        """
        Infer a schema for a given data input. The schema can be used to test with schema validator.
        This function currently supports DataFrame, Numpy, Dictionary, List and basic python types.::

            data = pandas.DataFrame(...)
            schema = infer_schema(data)

        This function returns None if it can not infer the schema.
        """
        schema = None

        if data is None:
            schema = pa.null()
        elif isinstance(data, dict):
            schema = {'type': dict, 'fields': {}}

            for key, value in data.items():
                schema['fields'][key] = self.infer_schema(value)
        elif isinstance(data, pd.DataFrame):
            schema = {'type': pd.DataFrame, 'fields': {}}

            # sample the table to get the schema
            pa_schema = pa.Table.from_pandas(data[:_SAMPLE_SIZE],
                                             preserve_index=False).schema
            for i, name in enumerate(pa_schema.names):
                schema['fields'][name] = pa_schema.types[i]
        elif isinstance(data, pd.Series):
            schema = {
                'type': pd.Series,
                'item': pa.Array.from_pandas(data).type,
            }
        elif isinstance(data, np.ndarray):
            pa_type = pa.from_numpy_dtype(
                data.dtype) if data.dtype.num != 17 else pa.string()

            if len(data.shape) == 1:  # 1d array
                schema = {
                    'type': np.ndarray,
                    'item': pa_type,
                }
            else:
                shape = [
                    v if i != 0 else None for i, v in enumerate(data.shape)
                ]
                schema = {
                    'type': np.ndarray,
                    'item': pa_type,
                    'shape': tuple(shape),
                }
        elif isinstance(data, pa.Table):
            schema = data.schema
        elif isinstance(data, (list, tuple)) and len(data) > 0:
            # try to infer type of the data
            current_type = self.infer_schema(data[0])
            for i in range(1, min(len(data), _SAMPLE_SIZE)):
                new_type = self.infer_schema(data[i])

                if new_type != current_type:
                    current_type = None
                    break

            # multiple types are not supported yet
            if current_type:
                if isinstance(current_type, pa.DataType):
                    schema = pa.list_(current_type)
                else:
                    schema = {'type': list, 'item': current_type}
        elif type(data) in _python_mapping:
            schema = _python_mapping[type(data)]()
        else:
            return {'type': type(data)}

        return schema
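
As a quick illustration of the inference idea, pyarrow itself reports the types it infers for plain Python values (a standalone sketch, independent of the class above):

import pyarrow as pa

print(pa.array(['foo', 'bar']).type)   # string
print(pa.array([1, 2, 3]).type)        # int64
print(pa.Table.from_pydict({'a': [1.5], 'b': ['x']}).schema)
# a: double
# b: string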
Example #49
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
Example #50
    def test_try_incompatible_type(self):
        arr = pa.array(TypedSequence(["foo", "bar"], try_type=pa.int64()))
        self.assertEqual(arr.type, pa.string())
Example #51
import pyarrow as pa

PQ_SCHEMAS = dict()

# site_visits
fields = [
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('crawl_id', pa.int32(), nullable=False),
    pa.field('instance_id', pa.int32(), nullable=False),
    pa.field('site_url', pa.string(), nullable=False)
]
PQ_SCHEMAS['site_visits'] = pa.schema(fields)

# flash_cookies
fields = [
    pa.field('crawl_id', pa.int32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.int32(), nullable=False),
    pa.field('domain', pa.string()),
    pa.field('filename', pa.string()),
    pa.field('local_path', pa.string()),
    pa.field('key', pa.string()),
    pa.field('content', pa.string())
]
PQ_SCHEMAS['flash_cookies'] = pa.schema(fields)

# profile_cookies
fields = [
    pa.field('crawl_id', pa.int32(), nullable=False),
    pa.field('visit_id', pa.int64(), nullable=False),
    pa.field('instance_id', pa.int32(), nullable=False),
Example #52
def test_is_dictionary():
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())
Example #53
MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([pa.field('a', pa.int32()),
               pa.field('b', pa.int8()),
               pa.field('c', pa.string())]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
    # XXX Needs array pickling
    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
]

Example #54
def test_isnull(array, expected, offset):
    array = pa.array(array, pa.string())[offset:]
    np.testing.assert_array_equal(
        isnull(NumbaStringArray.make(array)),
        np.asarray(expected[offset:], dtype=bool),
    )
Example #55
                          ('uint32', range(0, 10)), ('int32', range(0, 10)),
                          ('uint64', range(0, 10)), ('int64', range(0, 10)),
                          ('float', [0.0, 0.1, 0.2]),
                          ('double', [0.0, 0.1, 0.2]),
                          ('string', ['a', 'b', 'c']),
                          ('binary', [b'a', b'b', b'c']),
                          (pa.binary(3), [b'abc', b'bcd', b'cde'])])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()),
     ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()),
     ([], None), ([[1, 2], [3]], pa.list_(pa.int64())),
     ([[4, 5], [6]], pa.large_list(pa.int16())),
     ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
     ([(1, 'a'), (2, 'c'), None
       ], pa.struct([pa.field('a', pa.int64()),
                     pa.field('b', pa.string())]))])


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(array, proto))
Example #56
def array_inhom_chunks():
    chunk1 = pa.array(list("abc"), pa.string())
    chunk2 = pa.array(list("12345"), pa.string())
    chunk3 = pa.array(list("Z"), pa.string())
    chunked_array = pa.chunked_array([chunk1, chunk2, chunk3])
    return fr.FletcherChunkedArray(chunked_array)
Example #57
            with self.assertRaises(OverflowError):
                _ = pa.array(TypedSequence([["x" * 1024]] * ((2 << 20) + 1)))  # ListArray with a bit more than 2GB


def _check_output(output, expected_num_chunks: int):
    stream = pa.BufferReader(output) if isinstance(output, pa.Buffer) else pa.memory_map(output)
    f = pa.ipc.open_stream(stream)
    pa_table: pa.Table = f.read_all()
    assert len(pa_table.to_batches()) == expected_num_chunks
    assert pa_table.to_pydict() == {"col_1": ["foo", "bar"], "col_2": [1, 2]}
    del pa_table


@pytest.mark.parametrize("writer_batch_size", [None, 1, 10])
@pytest.mark.parametrize(
    "fields", [None, {"col_1": pa.string(), "col_2": pa.int64()}, {"col_1": pa.string(), "col_2": pa.int32()}]
)
def test_write(fields, writer_batch_size):
    output = pa.BufferOutputStream()
    schema = pa.schema(fields) if fields else None
    with ArrowWriter(stream=output, schema=schema, writer_batch_size=writer_batch_size) as writer:
        writer.write({"col_1": "foo", "col_2": 1})
        writer.write({"col_1": "bar", "col_2": 2})
        num_examples, num_bytes = writer.finalize()
    assert num_examples == 2
    assert num_bytes > 0
    if not fields:
        fields = {"col_1": pa.string(), "col_2": pa.int64()}
    assert writer._schema == pa.schema(fields, metadata=writer._schema.metadata)
    _check_output(output.getvalue(), expected_num_chunks=num_examples if writer_batch_size == 1 else 1)
Example #58
    def __call__(self):
        return pa.string()
Example #59
    ('string', ['a', 'b', 'c']),
    ('binary', [b'a', b'b', b'c']),
    (pa.binary(3), [b'abc', b'bcd', b'cde'])
])
def test_cast_identities(ty, values):
    arr = pa.array(values, type=ty)
    assert arr.cast(ty).equals(arr)


pickle_test_parametrize = pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], None),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
    ]
)


@pickle_test_parametrize
def test_array_pickle(data, typ):
    # Allocate here so that we don't have any Arrow data allocated.
    # This is needed to ensure that allocator tests can be reliable.
    array = pa.array(data, type=typ)
    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):
Example #60
def test_chunked_array_basics():
    data = pa.chunked_array([], type=pa.string())
    assert data.to_pylist() == []

    with pytest.raises(ValueError):
        pa.chunked_array([])