def test_table_safe_casting():
    """A safe table cast to the target schema yields the expected table."""
    column_names = tuple('abcd')
    signed_values = [-10, -5, 0, 5, 10]
    text_values = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int64()),
            pa.array(signed_values, type=pa.int32()),
            pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()),
            pa.array(text_values, type=pa.string()),
        ],
        names=column_names,
    )
    wanted = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int32()),
            pa.array(signed_values, type=pa.int16()),
            pa.array([1, 2, 3, 4, 5], type=pa.int64()),
            pa.array(text_values, type=pa.string()),
        ],
        names=column_names,
    )

    target_types = [pa.int32(), pa.int16(), pa.int64(), pa.string()]
    target_schema = pa.schema([
        pa.field(name, typ) for name, typ in zip(column_names, target_types)
    ])

    assert source.cast(target_schema).equals(wanted)
def test_sequence_nesting_levels():
    """The list nesting depth is inferred; mixed depths are rejected."""
    accepted = [
        ([1, 2, None], pa.int64()),
        ([[1], [2], None], pa.list_(pa.int64())),
        ([[1], [2, 3, 4], [None]], pa.list_(pa.int64())),
        ([None, [[None, 1]], [[2, 3, 4], None], [None]],
         pa.list_(pa.list_(pa.int64()))),
    ]
    for values, inferred_type in accepted:
        converted = pa.array(values)
        assert converted.type == inferred_type
        assert converted.to_pylist() == values

    # Mixed nesting levels are rejected
    rejection_errors = (pa.ArrowInvalid, pa.ArrowTypeError)
    mixed_inputs = (
        [1, 2, [1]],
        [1, 2, []],
        [[1], [2], [None, [1]]],
    )
    for bad_values in mixed_inputs:
        with pytest.raises(rejection_errors):
            pa.array(bad_values)
def test_table_unsafe_casting():
    """A truncating float->int cast fails unless ``safe=False`` is given."""
    column_names = tuple('abcd')
    signed_values = [-10, -5, 0, 5, 10]
    text_values = ['ab', 'bc', 'cd', 'de', 'ef']

    source = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int64()),
            pa.array(signed_values, type=pa.int32()),
            pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()),
            pa.array(text_values, type=pa.string()),
        ],
        names=column_names,
    )
    wanted = pa.Table.from_arrays(
        [
            pa.array(range(5), type=pa.int32()),
            pa.array(signed_values, type=pa.int16()),
            pa.array([1, 2, 3, 4, 5], type=pa.int64()),
            pa.array(text_values, type=pa.string()),
        ],
        names=column_names,
    )

    target_types = [pa.int32(), pa.int16(), pa.int64(), pa.string()]
    target_schema = pa.schema([
        pa.field(name, typ) for name, typ in zip(column_names, target_types)
    ])

    # The default (safe) cast refuses to drop the fractional part.
    with pytest.raises(pa.ArrowInvalid,
                       match='Floating point value truncated'):
        source.cast(target_schema)

    assert source.cast(target_schema, safe=False).equals(wanted)
def test_struct_type():
    """Struct types can be built from fields, (name, type) tuples or a mapping.

    Each construction is checked both as a whole (``list(ty) == fields``)
    and field by field.
    """
    fields = [pa.field('a', pa.int64()),
              pa.field('a', pa.int32()),
              pa.field('b', pa.int32())]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields

    # The comparison has to be asserted: a bare ``a == b`` expression is
    # evaluated and thrown away, so it could never make the test fail.
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy array columns of every possible primitive type.

    Parameters
    ----------
    include_index: bool, default False
        When true, an int64 ``__index_level_0__`` field is appended to the
        returned schema (the dataframe itself is not changed).

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    numeric_types = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('f4', pa.float32()), ('f8', pa.float64()),
    ]

    columns = OrderedDict()
    schema_fields = []

    # One list-typed column per numeric dtype; the third row is null.
    for np_code, arrow_type in numeric_types:
        schema_fields.append(pa.field(np_code, pa.list_(arrow_type)))
        columns[np_code] = [
            None if length is None else np.arange(length, dtype=np_code)
            for length in (10, 5, None, 1)
        ]

    schema_fields.append(pa.field('str', pa.list_(pa.string())))
    columns['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object"),
    ]

    schema_fields.append(
        pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    columns['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(columns), pa.schema(schema_fields)
def test_struct_type():
    """Struct construction, indexing by position/name and invalid input.

    Covers building a struct from fields, from (name, type) tuples and from
    a mapping, plus the errors raised for out-of-range, duplicate, missing
    and wrongly typed keys.
    """
    fields = [
        # Duplicate field name on purpose
        pa.field('a', pa.int64()),
        pa.field('a', pa.int32()),
        pa.field('b', pa.int32())
    ]
    ty = pa.struct(fields)

    assert len(ty) == ty.num_children == 3
    assert list(ty) == fields
    assert ty[0].name == 'a'
    assert ty[2].type == pa.int32()
    with pytest.raises(IndexError):
        assert ty[3]

    assert ty['b'] == ty[2]

    # Duplicate
    with pytest.warns(UserWarning):
        with pytest.raises(KeyError):
            ty['a']

    # Not found
    with pytest.raises(KeyError):
        ty['c']

    # Neither integer nor string
    with pytest.raises(TypeError):
        ty[None]

    # The comparison has to be asserted: a bare ``a == b`` expression is
    # evaluated and thrown away, so it could never make the test fail.
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from list of tuples
    ty = pa.struct([('a', pa.int64()),
                    ('a', pa.int32()),
                    ('b', pa.int32())])
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Construct from mapping
    fields = [pa.field('a', pa.int64()),
              pa.field('b', pa.int32())]
    ty = pa.struct(OrderedDict([('a', pa.int64()),
                                ('b', pa.int32())]))
    assert list(ty) == fields
    for a, b in zip(ty, fields):
        assert a == b

    # Invalid args
    with pytest.raises(TypeError):
        pa.struct([('a', None)])
def test_fields_hashable():
    """Fields differing in name, type or nullability are distinct dict keys."""
    fields = [
        pa.field('a', pa.int32()),
        pa.field('a', pa.int64()),
        pa.field('a', pa.int64(), nullable=False),
        pa.field('b', pa.int32()),
        pa.field('b', pa.int32(), nullable=False),
    ]

    position_by_field = {
        field: position for position, field in enumerate(fields)
    }

    # No two fields collapsed into the same key ...
    assert len(position_by_field) == len(fields)
    # ... and each one maps back to its own position.
    for position, field in enumerate(fields):
        assert position_by_field[field] == position
def dataframe_with_lists(include_index=False):
    """
    Dataframe with list columns of every possible primitive type.

    Parameters
    ----------
    include_index: bool, default False
        When true, an int64 ``__index_level_0__`` field is appended to the
        returned schema (the dataframe itself is not changed).

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # (column name, list value type, rows) in column order.
    column_specs = [
        ('int64', pa.int64(), [
            list(range(10)),
            list(range(5)),
            None,
            [],
            np.array(list(range(10)) * 2, dtype=np.int64)[::2],
        ]),
        ('double', pa.float64(), [
            [float(i) for i in range(10)],
            [float(i) for i in range(5)],
            None,
            [],
            np.array([float(i) for i in range(10)] * 2)[::2],
        ]),
        ('bytes_list', pa.binary(), [
            [b"1", b"f"],
            None,
            [b"1"],
            [b"1", b"2", b"3"],
            [],
        ]),
        ('str_list', pa.string(), [
            [u"1", u"ä"],
            None,
            [u"1"],
            [u"1", u"2", u"3"],
            [],
        ]),
    ]

    columns = OrderedDict()
    schema_fields = []
    for name, value_type, rows in column_specs:
        schema_fields.append(pa.field(name, pa.list_(value_type)))
        columns[name] = rows

    if include_index:
        schema_fields.append(pa.field('__index_level_0__', pa.int64()))

    return pd.DataFrame(columns), pa.schema(schema_fields)
def test_simple_ints(self):
    """Integer columns of a CSV input are inferred as int64."""
    # Infer integer columns
    table = self.read_bytes(b"a,b,c\n1,2,3\n4,5,6\n")

    expected_schema = pa.schema([(name, pa.int64()) for name in 'abc'])
    expected_columns = {
        'a': [1, 4],
        'b': [2, 5],
        'c': [3, 6],
    }

    assert table.schema == expected_schema
    assert table.to_pydict() == expected_columns
def test_table_from_arrays_preserves_column_metadata():
    """Field metadata and nullability survive ``Table.from_arrays``."""
    # Added to test https://issues.apache.org/jira/browse/ARROW-3866
    first_values = pa.array([1, 2])
    second_values = pa.array([3, 4])
    with_metadata = pa.field('field1', pa.int64(),
                             metadata={"a": "A", "b": "B"})
    not_nullable = pa.field('field2', pa.int64(), nullable=False)

    table = pa.Table.from_arrays([
        pa.column(field, values)
        for field, values in ((with_metadata, first_values),
                              (not_nullable, second_values))
    ])

    assert b"a" in table.column(0).field.metadata
    assert table.column(1).field.nullable is False
def test_simple_ints(self):
    """Integer columns of a line-delimited JSON input are inferred as int64."""
    # Infer integer columns
    payload = b'{"a": 1,"b": 2, "c": 3}\n{"a": 4,"b": 5, "c": 6}\n'
    table = self.read_bytes(payload)

    expected_schema = pa.schema([(name, pa.int64()) for name in 'abc'])
    expected_columns = {
        'a': [1, 4],
        'b': [2, 5],
        'c': [3, 6],
    }

    assert table.schema == expected_schema
    assert table.to_pydict() == expected_columns
def test_list_array_flatten():
    """``flatten()`` removes exactly one level of list nesting per call."""
    # Two levels of nesting.
    doubly_nested = pa.array([
        None,
        [[1, None, 2], None, [3, 4]],
        [],
        [[], [5, 6], None],
        [[7, 8]],
    ])
    assert doubly_nested.type.equals(pa.list_(pa.list_(pa.int64())))

    # One level of nesting: the child lists of the array above.
    singly_nested = pa.array([
        [1, None, 2],
        None,
        [3, 4],
        [],
        [5, 6],
        None,
        [7, 8],
    ])
    assert singly_nested.type.equals(pa.list_(pa.int64()))

    # No nesting left.
    flat = pa.array([1, None, 2, 3, 4, 5, 6, 7, 8])
    assert flat.type.equals(pa.int64())

    assert doubly_nested.flatten().equals(singly_nested)
    assert singly_nested.flatten().equals(flat)
    assert doubly_nested.flatten().flatten().equals(flat)
def test_orcfile_empty():
    """Reading the empty ORC example gives zero rows and the full schema."""
    from pyarrow import orc

    def int_string_struct():
        # The (int1, string1) struct recurs in several columns.
        return pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ])

    orc_file = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = orc_file.read()
    assert table.num_rows == 0

    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(int_string_struct())),
        ])),
        ('list', pa.list_(int_string_struct())),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', int_string_struct()),
        ]))),
    ])
    assert table.schema == expected_schema
def test_type_to_pandas_dtype():
    """``DataType.to_pandas_dtype`` returns the expected numpy dtype."""
    M8_ns = np.dtype('datetime64[ns]')

    # Integer and float types share their name between pyarrow and numpy.
    same_named = [
        'int8', 'int16', 'int32', 'int64',
        'uint8', 'uint16', 'uint32', 'uint64',
        'float16', 'float32', 'float64',
    ]
    temporal_types = (pa.date32(), pa.date64(), pa.timestamp('ms'))
    object_types = (pa.binary(), pa.binary(12), pa.string(),
                    pa.list_(pa.int8()))

    cases = [(pa.null(), np.float64), (pa.bool_(), np.bool_)]
    cases.extend(
        (getattr(pa, name)(), getattr(np, name)) for name in same_named)
    cases.extend((typ, M8_ns) for typ in temporal_types)
    cases.extend((typ, np.object_) for typ in object_types)

    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_iterator_without_size():
    """An iterator can be converted without passing ``size``."""
    expected = pa.array((0, 1, 2))

    inferred = pa.array(iter(range(3)))
    assert inferred.equals(expected)

    # Same with explicit type
    typed = pa.array(iter(range(3)), type=pa.int64())
    assert typed.equals(expected)
def test_list_of_int(self):
    """A list of int lists (with one null) becomes a list<int64> array."""
    nested_values = [[1, 2, 3], [], None, [1, 2]]

    converted = pyarrow.from_pylist(nested_values)

    assert len(converted) == 4
    assert converted.null_count == 1
    assert converted.type == pyarrow.list_(pyarrow.int64())
    assert converted.to_pylist() == nested_values
def test_sequence_integer_inferred(seq):
    """Integers mixed with None are inferred as int64 with nulls."""
    values = [1, None, 3, None]

    converted = pa.array(seq(values))

    assert len(converted) == 4
    assert converted.null_count == 2
    assert converted.type == pa.int64()
    assert converted.to_pylist() == values
def test_integer(self):
    """Integers mixed with None convert to an int64 array with nulls."""
    values = [1, None, 3, None]

    converted = pyarrow.from_pylist(values)

    assert len(converted) == 4
    assert converted.null_count == 2
    assert converted.type == pyarrow.int64()
    assert converted.to_pylist() == values
def test_empty_cast():
    """Casting empty arrays between common types must not crash."""
    common_types = [
        pa.null(),
        pa.bool_(),
        pa.int8(), pa.int16(), pa.int32(), pa.int64(),
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64(),
        pa.float16(), pa.float32(), pa.float64(),
        pa.date32(), pa.date64(),
        pa.binary(), pa.binary(length=4),
        pa.string(),
    ]

    for source_type, target_type in itertools.product(common_types,
                                                      repeat=2):
        try:
            # ARROW-4766: Ensure that supported types conversion don't
            # segfault on empty arrays of common types
            pa.array([], type=source_type).cast(target_type)
        except pa.lib.ArrowNotImplementedError:
            # Unsupported conversions are fine; only crashes matter here.
            pass
def test_nested_arrays(seq):
    """A sequence of int64 ndarrays (and None) becomes a list<int64> array."""
    rows = [
        np.array([], dtype=np.int64),
        np.array([1, 2], dtype=np.int64),
        None,
    ]

    converted = pa.array(seq(rows))

    assert len(converted) == 3
    assert converted.null_count == 1
    assert converted.type == pa.list_(pa.int64())
    assert converted.to_pylist() == [[], [1, 2], None]
def test_asarray():
    """``np.asarray`` on an Arrow array and on its iterated elements."""
    int_array = pa.array(range(4))
    first_four = [0, 1, 2, 3]

    # The iterator interface gives back an array of Int64Value's
    from_elements = np.asarray(list(int_array))
    assert from_elements.tolist() == first_four
    assert from_elements.dtype == np.dtype('O')
    assert type(from_elements[0]) == pa.lib.Int64Value

    # Calling with the arrow array gives back an array with 'int64' dtype
    direct = np.asarray(int_array)
    assert direct.tolist() == first_four
    assert direct.dtype == np.dtype('int64')

    # An optional type can be specified when calling np.asarray
    as_text = np.asarray(int_array, dtype='str')
    assert as_text.tolist() == ['0', '1', '2', '3']

    # If PyArrow array has null values, numpy type will be changed as needed
    # to support nulls.
    with_null = pa.array([0, 1, 2, None])
    assert with_null.type == pa.int64()
    promoted = np.asarray(with_null)
    items = promoted.tolist()
    assert items[:3] == [0., 1., 2.]
    assert np.isnan(items[3])
    assert promoted.dtype == np.dtype('float64')
def test_buffers_nested():
    """``Array.buffers()`` lists parent buffers first, then child buffers."""

    def first_byte(buf):
        # First byte of a buffer, as an int.
        return bytearray(buf.to_pybytes())[0]

    list_array = pa.array([[1, 2], None, [3, None, 4, 5]],
                          type=pa.list_(pa.int64()))
    list_buffers = list_array.buffers()
    assert len(list_buffers) == 4
    # The parent buffers
    assert first_byte(list_buffers[0]) == 0b00000101
    assert struct.unpack('4i', list_buffers[1].to_pybytes()) == (0, 2, 2, 6)
    # The child buffers
    assert first_byte(list_buffers[2]) == 0b00110111
    child_values = list_buffers[3].to_pybytes()
    assert struct.unpack('qqq8xqq', child_values) == (1, 2, 3, 4, 5)

    struct_type = pa.struct([pa.field('a', pa.int8()),
                             pa.field('b', pa.int16())])
    struct_array = pa.array([(42, None), None, (None, 43)],
                            type=struct_type)
    struct_buffers = struct_array.buffers()
    assert len(struct_buffers) == 5
    # The parent buffer
    assert first_byte(struct_buffers[0]) == 0b00000101
    # The child buffers: 'a'
    assert first_byte(struct_buffers[1]) == 0b00000001
    assert struct.unpack('bxx', struct_buffers[2].to_pybytes()) == (42,)
    # The child buffers: 'b'
    assert first_byte(struct_buffers[3]) == 0b00000100
    assert struct.unpack('4xh', struct_buffers[4].to_pybytes()) == (43,)
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ: pyarrow.DataType
        The signed or unsigned integer type of matching bit width, or
        None when the bit width is not one of 8, 16, 32 or 64.
    """
    if jvm_type.isSigned:
        candidates = (
            (8, pa.int8),
            (16, pa.int16),
            (32, pa.int32),
            (64, pa.int64),
        )
    else:
        candidates = (
            (8, pa.uint8),
            (16, pa.uint16),
            (32, pa.uint32),
            (64, pa.uint64),
        )

    # Compare with == (not a dict lookup) so the JVM-side value is matched
    # the same way the explicit if/elif chain matched it.
    for bit_width, make_type in candidates:
        if jvm_type.bitWidth == bit_width:
            return make_type()
    return None
def test_custom_nulls(self):
    """Custom ``null_values`` replace the default null spellings."""
    # Infer nulls with custom values
    custom_opts = ConvertOptions(null_values=['Xxx', 'Zzz'])
    table = self.read_bytes(b"a,b,c,d\nZzz,Xxx,1,2\nXxx,#N/A,,Zzz\n",
                            convert_options=custom_opts)
    expected_schema = pa.schema([
        ('a', pa.null()),
        ('b', pa.string()),
        ('c', pa.string()),
        ('d', pa.int64()),
    ])
    assert table.schema == expected_schema
    assert table.to_pydict() == {
        'a': [None, None],
        'b': [u"Xxx", u"#N/A"],
        'c': [u"1", u""],
        'd': [2, None],
    }

    # With no null spellings at all, every cell is kept as a string.
    no_null_opts = ConvertOptions(null_values=[])
    table = self.read_bytes(b"a,b\n#N/A,\n", convert_options=no_null_opts)
    expected_schema = pa.schema([
        ('a', pa.string()),
        ('b', pa.string()),
    ])
    assert table.schema == expected_schema
    assert table.to_pydict() == {
        'a': [u"#N/A"],
        'b': [u""],
    }
def test_infinite_iterator():
    """``size`` bounds how many items are taken from an endless iterator."""
    expected = pa.array((0, 1, 2))

    inferred = pa.array(itertools.count(0), size=3)
    assert inferred.equals(expected)

    # Same with explicit type
    typed = pa.array(itertools.count(0), type=pa.int64(), size=3)
    assert typed.equals(expected)
def test_struct_from_dicts_inference():
    """Struct types are inferred when converting sequences of dicts.

    Covers complete records, records with omitted keys / null rows (from
    both a list and an object ndarray), nested structs, the empty dict,
    and the rejection of structs mixed with scalars.
    """
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': u'foo', 'c': True},
            {'a': 6, 'b': u'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': u'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': u'bar', 'c': None}]
    arr = pa.array(data)
    # Convert the same records from an object ndarray too. The ndarray was
    # previously built but never passed to pa.array, so the equality check
    # below compared two conversions of the very same list.
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    arr2 = pa.array(data_as_ndarray)

    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
def test_int_object_nulls(self):
    """An object column of ints and None round-trips with an int64 schema."""
    raw_values = np.array([None, 1, np.int64(3)] * 5, dtype=object)

    frame = pd.DataFrame({'ints': raw_values})
    expected_frame = pd.DataFrame({'ints': pd.to_numeric(raw_values)})
    expected_schema = pa.schema([pa.field('ints', pa.int64())])

    self._check_pandas_roundtrip(frame, expected=expected_frame,
                                 expected_schema=expected_schema)
def test_empty_table():
    """``Schema.empty_table`` returns a zero-row table with that schema."""
    schema = pa.schema([pa.field('oneField', pa.int64())])

    empty = schema.empty_table()

    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == schema
def test_infer_lists(self):
    """List value types are inferred for list-valued dataframe columns."""
    columns = OrderedDict()
    columns['nan_ints'] = [[None, 1], [2, 3]]
    columns['ints'] = [[0, 1], [2, 3]]
    columns['strs'] = [[None, u'b'], [u'c', u'd']]
    columns['nested_strs'] = [[[None, u'b'], [u'c', u'd']], None]
    frame = pd.DataFrame(columns)

    int_list = pa.list_(pa.int64())
    str_list = pa.list_(pa.string())
    expected_schema = pa.schema([
        pa.field('nan_ints', int_list),
        pa.field('ints', int_list),
        pa.field('strs', str_list),
        pa.field('nested_strs', pa.list_(str_list)),
    ])

    self._check_pandas_roundtrip(frame, expected_schema=expected_schema)
def test_chunked_array_equals():
    """Chunked array equality depends on type and values, not chunking."""

    def as_chunked(arrs):
        # Accept either a ready ChunkedArray or a list of arrays.
        if isinstance(arrs, pa.ChunkedArray):
            return arrs
        return pa.chunked_array(arrs)

    def eq(xarrs, yarrs):
        x = as_chunked(xarrs)
        y = as_chunked(yarrs)
        assert x.equals(y)
        assert y.equals(x)
        assert x == y
        assert x != str(y)

    def ne(xarrs, yarrs):
        x = as_chunked(xarrs)
        y = as_chunked(yarrs)
        assert not x.equals(y)
        assert not y.equals(x)
        assert x != y

    # Empty chunked arrays: same type is equal, different type is not.
    eq(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int32()))
    ne(pa.chunked_array([], type=pa.int32()),
       pa.chunked_array([], type=pa.int64()))

    a = pa.array([0, 2], type=pa.int32())
    b = pa.array([0, 2], type=pa.int64())
    c = pa.array([0, 3], type=pa.int32())
    d = pa.array([0, 2, 0, 3], type=pa.int32())

    eq([a], [a])
    ne([a], [b])
    eq([a, c], [a, c])
    eq([a, c], [d])
    ne([c, a], [a, c])

    assert not pa.chunked_array([], type=pa.int32()).equals(None)
assert all_op(arrow, skipna) == pandas.all(skipna=skipna) # Split in the middle and check whether this still works if len(data) > 2: arrow = pa.chunked_array( [data[: len(data) // 2], data[len(data) // 2 :]], type=pa.bool_() ) assert all_op(arrow, skipna) == pandas.all(skipna=skipna) @pytest.mark.parametrize( ("array", "fill_null_value", "expected"), [ (pa.array([2, 1], type=pa.int16()), -1, np.array([2, 1], dtype=np.int16)), (pa.array([2, None], type=pa.int32()), -1, np.array([2, -1], dtype=np.int32)), (pa.array([2, None], type=pa.int64()), -1.5, np.array([2, -1], dtype=np.int64)), (pa.array([1, None], type=pa.uint8()), 257, np.array([1, 1], dtype=np.uint8)), (pa.array([None, None], type=pa.int8()), 5, np.array([5, 5], dtype=np.int8)), (pa.array([], type=pa.int8()), 5, np.array([], dtype=np.int8)), ], ) def test_integer_array_to_numpy(array, fill_null_value, expected): actual = integer_array_to_numpy(array, fill_null_value) assert actual.dtype == expected.dtype np.testing.assert_array_equal(actual, expected) @pytest.mark.parametrize( ("array", "indices"), [ (
def test_in_chunk_offsets(data: List[List[int]]):
    """The chunk start offsets themselves form a valid in-chunk selection."""
    chunked = pa.chunked_array(data, type=pa.int64())

    # Simple case: Passing in the actual chunk offsets should yield a valid
    # selection
    chunk_starts = list(_calculate_chunk_offsets(chunked))
    located = _in_chunk_offsets(chunked, chunk_starts)

    check_valid_in_offsets(chunked, located)
def test_iterate_over_decimal_chunk():
    """Iterate over a chunk of two FIXED columns with random precision/scale.

    The Arrow storage type is picked from the precision (int8/16/32/64 up
    to 19 digits, decimal128 above that) and values are generated to match.
    """
    # random.seed() accepts only None, int, float, str, bytes or bytearray;
    # passing a datetime raises TypeError on Python 3.11+. Calling it with
    # no argument still gives a fresh seed on every run.
    random.seed()
    precision = random.randint(1, 38)
    scale = random.randint(0, precision)

    if precision <= 2:
        datatype = pyarrow.int8()
    elif precision <= 4:
        datatype = pyarrow.int16()
    elif precision <= 9:
        datatype = pyarrow.int32()
    elif precision <= 19:
        datatype = pyarrow.int64()
    else:
        datatype = pyarrow.decimal128(precision, scale)

    def decimal_generator(_precision, _scale):
        """Return one random value fitting the storage type for _precision."""

        def decimal128_generator(precision, scale):
            data = [str(random.randint(0, 9)) for _ in range(precision)]
            if scale:
                data.insert(-scale, '.')
            return decimal.Decimal("".join(data))

        # The integer generators keep at most `precision` digits; one extra
        # character is kept for the sign of negative values.
        def int64_generator(precision):
            data = random.randint(-9223372036854775808, 9223372036854775807)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int32_generator(precision):
            # Upper bound is the int32 maximum (was mistyped as 2147483637).
            data = random.randint(-2147483648, 2147483647)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int16_generator(precision):
            data = random.randint(-32768, 32767)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        def int8_generator(precision):
            data = random.randint(-128, 127)
            return int(str(data)[:precision if data >= 0 else precision + 1])

        if _precision <= 2:
            return int8_generator(_precision)
        elif _precision <= 4:
            return int16_generator(_precision)
        elif _precision <= 9:
            return int32_generator(_precision)
        elif _precision <= 19:
            return int64_generator(_precision)
        else:
            return decimal128_generator(_precision, _scale)

    def expected_data_transform_decimal(_precision, _scale):
        """Build the function mapping a generated value to the expected one."""

        def expected_data_transform_decimal_impl(data, precision=_precision,
                                                 scale=_scale):
            # Integer-backed values are unscaled, so apply the scale here.
            if precision <= 19:
                return decimal.Decimal(data).scaleb(-scale)
            else:
                return data

        return expected_data_transform_decimal_impl

    column_meta = {
        "logicalType": "FIXED",
        "precision": str(precision),
        "scale": str(scale)
    }
    iterate_over_test_chunk([datatype, datatype],
                            [column_meta, column_meta],
                            lambda: decimal_generator(precision, scale),
                            expected_data_transform_decimal(precision, scale))
def test_limited_iterator_types():
    """An iterator with explicit type and size matches the plain tuple."""
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=3)
    from_tuple = pa.array((0, 1, 2))

    assert from_iterator.equals(from_tuple)
def __init__(self):
    """Initialise the extension type with int64 as its storage type."""
    storage_type = pa.int64()
    pa.PyExtensionType.__init__(self, storage_type)
def test_column_type_int64_same():
    """Check table identity for a nullable int64 column."""
    column = pyarrow.array([1, 2, -1, None, 3, None, 1], pyarrow.int64())
    table = pyarrow.table({"A": column})

    assert_arrow_table_identity(table)
b = pa.array([0, 2], type=pa.int64()) c = pa.array([0, 3], type=pa.int32()) d = pa.array([0, 2, 0, 3], type=pa.int32()) eq([a], [a]) ne([a], [b]) eq([a, c], [a, c]) eq([a, c], [d]) ne([c, a], [a, c]) assert not pa.chunked_array([], type=pa.int32()).equals(None) @pytest.mark.parametrize( ('data', 'typ'), [([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None ], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))]) def test_chunked_array_pickle(data, typ): arrays = [] while data: arrays.append(pa.array(data[:2], type=typ)) data = data[2:] array = pa.chunked_array(arrays, type=typ) array.validate() result = pickle.loads(pickle.dumps(array)) result.validate()
def __init__(self, freq):
    """Create the 'pandas.period' extension type for the given frequency."""
    storage_type = pa.int64()
    # attributes need to be set first before calling
    # super init (as that calls serialize)
    self._freq = freq
    pa.ExtensionType.__init__(self, storage_type, 'pandas.period')
def test_generic_ext_type():
    """A PeriodType exposes its extension name and int64 storage type."""
    daily = PeriodType('D')

    observed = (daily.extension_name, daily.storage_type)

    assert observed[0] == "pandas.period"
    assert observed[1] == pa.int64()
# fmt:on PANDAS_GE_0_26_0 = LooseVersion(pd.__version__) >= "0.26.0" if PANDAS_GE_0_26_0: from pandas.core.indexers import check_array_indexer _python_type_map = { pa.null().id: six.text_type, pa.bool_().id: bool, pa.int8().id: int, pa.uint8().id: int, pa.int16().id: int, pa.uint16().id: int, pa.int32().id: int, pa.uint32().id: int, pa.int64().id: int, pa.uint64().id: int, pa.float16().id: float, pa.float32().id: float, pa.float64().id: float, pa.date32().id: datetime.date, pa.date64().id: datetime.date, pa.timestamp("ms").id: datetime.datetime, pa.binary().id: six.binary_type, pa.string().id: six.text_type, # Use any list type here, only LIST is important pa.list_(pa.string()).id: list, pa.large_list(pa.string()).id: list, } _string_type_map = {"date64[ms]": pa.date64(), "string": pa.string()}
def __arrow_ext_serialize__(self): metadata = {"subtype": str(self.subtype), "closed": self.closed} return json.dumps(metadata).encode() @classmethod def __arrow_ext_deserialize__(cls, storage_type, serialized): metadata = json.loads(serialized.decode()) subtype = pyarrow.type_for_alias(metadata["subtype"]) closed = metadata["closed"] return ArrowIntervalType(subtype, closed) def __eq__(self, other): if isinstance(other, pyarrow.BaseExtensionType): return (type(self) == type(other) and self.subtype == other.subtype and self.closed == other.closed) else: return NotImplemented def __hash__(self): return hash((str(self), str(self.subtype), self.closed)) def to_pandas_dtype(self): import pandas as pd return pd.IntervalDtype(self.subtype.to_pandas_dtype()) # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type)
from pyarrow.compat import unittest # noqa from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa import collections import datetime import decimal import itertools import math import traceback import numpy as np import pytz int_type_pairs = [(np.int8, pa.int8()), (np.int16, pa.int16()), (np.int32, pa.int32()), (np.int64, pa.int64()), (np.uint8, pa.uint8()), (np.uint16, pa.uint16()), (np.uint32, pa.uint32()), (np.uint64, pa.uint64())] np_int_types, _ = zip(*int_type_pairs) class StrangeIterable: def __init__(self, lst): self.lst = lst def __iter__(self): return self.lst.__iter__() class MyInt:
def test_get_pyarrow_translated_schema(self):
    """The translated schema matches the expected pyarrow schema."""

    def column(type_name, name, mode, **extra):
        # Build one input column description, keys in type/name/mode order.
        spec = {"type": type_name, "name": name, "mode": mode}
        spec.update(extra)
        return spec

    string_input_schema = [
        column("STRING", "string1", "REQUIRED"),
        column("NUMERIC", "numeric1", "NULLABLE"),
        column("INTEGER", "integer1", "REQUIRED"),
        column("FLOAT", "float1", "NULLABLE"),
        column("BOOLEAN", "boolean1", "REQUIRED"),
        column("TIMESTAMP", "timestamp1", "REQUIRED"),
        column("DATE", "date1", "REQUIRED"),
        column("TIME", "time1", "REQUIRED"),
        column("DATETIME", "datetime1", "REQUIRED"),
        column("RECORD", "record1", "REPEATED", fields=[
            column("BOOLEAN", "boolean1", "REQUIRED"),
            column("TIMESTAMP", "timestamp1", "REQUIRED"),
        ]),
    ]

    # NOTE: no nullable= argument is passed for any expected field (the
    # per-field nullable settings were commented out), so each field uses
    # the pa.field default.
    record_struct = pa.struct([
        pa.field(name='boolean1', type=pa.bool_()),
        pa.field(name='timestamp1', type=pa.timestamp('us')),
    ])
    expected_columns = [
        ('string1', pa.string()),
        ('numeric1', pa.int64()),
        ('integer1', pa.int64()),
        ('float1', pa.float64()),
        ('boolean1', pa.bool_()),
        ('timestamp1', pa.timestamp('us')),
        ('date1', pa.date32()),
        ('time1', pa.time64('us')),
        ('datetime1', pa.timestamp('us')),
        ('record1', pa.list_(record_struct)),
    ]
    expected_pa_schema = pa.schema([
        pa.field(name=field_name, type=field_type)
        for field_name, field_type in expected_columns
    ])

    pyarrow_schema = get_pyarrow_translated_schema(string_input_schema)
    self.assertEqual(pyarrow_schema, expected_pa_schema)
def get_pyarrow_schema():
    """Return the pyarrow schema as an ordered list of typed fields."""
    int_list = pa.list_(pa.int64())
    # (field name, arrow type) in schema order.
    layout = [
        ('id', pa.int64()),
        ('ts_submit', pa.int64()),
        ('submission_site', pa.int32()),
        ('runtime', pa.int64()),
        ('resource_type', pa.string()),
        ('resource_amount_requested', pa.float64()),
        ('parents', int_list),
        ('children', int_list),
        ('user_id', pa.int32()),
        ('group_id', pa.int32()),
        ('nfrs', pa.string()),
        ('workflow_id', pa.int64()),
        ('wait_time', pa.int64()),
        ('params', pa.string()),
        ('memory_requested', pa.float64()),
        ('network_io_time', pa.int64()),
        ('disk_io_time', pa.int64()),
        ('disk_space_requested', pa.float64()),
        ('energy_consumption', pa.int64()),
        ('resource_used', pa.int64()),
    ]
    return pa.schema([pa.field(name, typ) for name, typ in layout])
for i, field in enumerate(fields): in_dict[field] = i assert len(in_dict) == len(fields) for i, field in enumerate(fields): assert in_dict[field] == i @pytest.mark.parametrize('t,check_func', [ (pa.date32(), types.is_date32), (pa.date64(), types.is_date64), (pa.time32('s'), types.is_time32), (pa.time64('ns'), types.is_time64), (pa.int8(), types.is_int8), (pa.int16(), types.is_int16), (pa.int32(), types.is_int32), (pa.int64(), types.is_int64), (pa.uint8(), types.is_uint8), (pa.uint16(), types.is_uint16), (pa.uint32(), types.is_uint32), (pa.uint64(), types.is_uint64), (pa.float16(), types.is_float16), (pa.float32(), types.is_float32), (pa.float64(), types.is_float64) ]) def test_exact_primitive_types(t, check_func): assert check_func(t) def test_bit_width(): for ty, expected in [(pa.bool_(), 1), (pa.int8(), 8),
def test_concat_tables_with_promotion_error():
    """Concatenating int64 and float32 columns fails even with promotion."""
    int_table = pa.Table.from_arrays(
        [pa.array([1, 2], type=pa.int64())], ["f"])
    float_table = pa.Table.from_arrays(
        [pa.array([1, 2], type=pa.float32())], ["f"])

    with pytest.raises(pa.ArrowInvalid):
        pa.concat_tables([int_table, float_table], promote=True)
def test_casting_to_extension_type_raises():
    """Casting an int64 array to an extension type is not implemented."""
    plain_ints = pa.array([1, 2, 3, 4], pa.int64())
    target = IntegerType()

    with pytest.raises(pa.ArrowNotImplementedError):
        plain_ints.cast(target)
def test_list_with_non_list(seq):
    """A scalar item is rejected for both list and large_list types."""
    # List types don't accept non-sequences
    for make_list_type in (pa.list_, pa.large_list):
        with pytest.raises(TypeError):
            pa.array(seq([[], [1, 2], 3]),
                     type=make_list_type(pa.int64()))
def clean_data_common(self, processed_data, raw_data):
    """Fix the type and default value of each extracted field

    This routine is common to all services. It ensures that all the missing
    fields, as defined by the schema, are added to the records extracted.
    Furthermore, each field is set to the specified type.

    Each entry of processed_data is updated in place: hostname, namespace
    and timestamp are copied from raw_data (its first element when it is a
    list) and sqvers is set from self.version. The same processed_data
    object is returned.
    """
    def_vals = self._get_default_vals()
    # Python type expected for each Arrow field type.
    ptype_map = {
        pa.string(): str,
        pa.int32(): int,
        pa.int64(): int,
        pa.float32(): float,
        pa.float64(): float,
        pa.date64(): float,
        pa.list_(pa.string()): list,
        pa.list_(pa.int64()): list,
        pa.bool_(): bool,
    }

    # Default record: every schema field mapped to its type's default value.
    schema_rec = {field.name: def_vals[field.type] for field in self.schema}

    read_from = raw_data[0] if isinstance(raw_data, list) else raw_data

    for entry in processed_data:
        for key in ("hostname", "namespace", "timestamp"):
            entry[key] = read_from[key]
        entry["sqvers"] = self.version

        for fld, default in schema_rec.items():
            if fld not in entry:
                # Missing fields get the default, except "active" -> True.
                entry[fld] = True if fld == "active" else default
                continue

            fld_type = self.schema.field(fld).type
            current = entry[fld]
            wanted_type = ptype_map[fld_type]

            if not isinstance(current, wanted_type):
                # Coerce to the expected type; use the default on failure.
                try:
                    entry[fld] = wanted_type(current)
                except (ValueError, TypeError):
                    entry[fld] = default
            elif isinstance(current, list):
                # Right container type: now fix up the individual elements.
                elem_type = ptype_map[fld_type.value_type]
                for i, ele in enumerate(current):
                    if isinstance(ele, elem_type):
                        continue
                    try:
                        if elem_type == int:
                            current[i] = int(ele)
                        elif elem_type == str:
                            current[i] = str(ele)
                        else:
                            raise ValueError
                    except (ValueError, TypeError):
                        current[i] = default

    return processed_data
def test_sequence_custom_integers(seq):
    """Convert ``MyInt``-wrapped values to an ``int64`` array and compare
    the resulting Python list with the plain integers."""
    expected = [0, 42, 2**33 + 1, -2**63]
    wrapped = [MyInt(value) for value in expected]
    result = pa.array(seq(wrapped), type=pa.int64())
    assert result.to_pylist() == expected
def test_iterate_over_timestamp_tz_chunk():
    """Feed random TIMESTAMP_TZ values through ``iterate_over_test_chunk``.

    A random scale in [0, 9] is drawn. For scale > 3 each generated value
    is a struct of (epoch, timezone, fraction); otherwise it is a struct of
    (epoch, timezone) whose epoch already has the scaled fraction digits
    appended. The expected-value transform turns such a struct into a
    timezone-aware ``datetime``.
    """
    # random.seed() only accepts None, int, float, str, bytes or bytearray
    # on Python 3.11+, so seed with the float timestamp rather than the
    # datetime object itself.
    random.seed(datetime.datetime.now().timestamp())
    scale = random.randint(0, 9)
    column_meta = [
        {"byteLength": "16" if scale > 3 else "8", "logicalType": "TIMESTAMP_TZ", "scale": str(scale)},
        {"byteLength": "16" if scale > 3 else "8", "logicalType": "TIMESTAMP_TZ", "scale": str(scale)}
    ]

    type1 = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                            pyarrow.field('timezone', pyarrow.int32()),
                            pyarrow.field('fraction', pyarrow.int32())])
    type2 = pyarrow.struct([pyarrow.field('epoch', pyarrow.int64()),
                            pyarrow.field('timezone', pyarrow.int32())])
    data_type = type1 if scale > 3 else type2

    def timestamp_tz_generator(scale):
        """Return one random input struct (as a dict) for the given scale."""
        epoch = random.randint(-621355968, 2534023007)
        frac = random.randint(0, 10**scale - 1) * (10**(9 - scale)) if scale > 3 else random.randint(0, 10**scale - 1)
        timezone = random.randint(1, 2879)
        if scale > 3:
            return {'epoch': epoch, 'timezone': timezone, 'fraction': frac}
        else:
            epoch = str(epoch)
            frac = str(frac)
            ZEROFILL = '000000000'
            # Left-pad the fraction to exactly `scale` digits
            frac = ZEROFILL[:scale - len(frac)] + frac
            return {'epoch': int(epoch + frac) if scale else int(epoch), 'timezone': timezone}

    def expected_data_transform_tz(_scale):
        """Return a function mapping an input struct to the expected datetime."""
        def expected_data_transform_tz_impl(data, scale=_scale):
            timezone = data['timezone']
            # The stored timezone value is offset by 1440 before conversion
            tzinfo = _generate_tzinfo_from_tzoffset(timezone - 1440)
            epoch = data['epoch']
            if scale > 3:
                frac = data['fraction']
                if epoch < 0:
                    epoch += 1
                    frac = 10**9 - frac
                frac = str(int(frac / 10**(9 - scale)))
                ZERO_FILL = '000000000'
                frac = ZERO_FILL[:scale - len(frac)] + frac
                epoch = int(str(epoch) + frac)
            microsec = str(epoch)
            # Insert the decimal point, keeping at most 6 fractional digits
            if scale > 6:
                microsec = microsec[:-scale] + "." + microsec[-scale:-scale + 6]
            else:
                microsec = microsec[:-scale] + "." + microsec[-scale:] if scale else microsec

            if platform.system() == 'Windows':
                t = datetime.datetime.utcfromtimestamp(0) + datetime.timedelta(seconds=(float(microsec)))
                if pytz.utc != tzinfo:
                    t += tzinfo.utcoffset(t)
                return t.replace(tzinfo=tzinfo)
            else:
                return datetime.datetime.fromtimestamp(float(microsec), tz=tzinfo)

        return expected_data_transform_tz_impl

    iterate_over_test_chunk([data_type, data_type], column_meta,
                            lambda: timestamp_tz_generator(scale), expected_data_transform_tz(scale))
def test_struct_from_dicts_inference():
    """Check struct type inference when arrays are built from dicts.

    Covers fully populated dicts, dicts with omitted keys and ``None``
    rows (from both a list and an object ndarray), nested structs, a
    single empty dict, and rejection of mixed struct/scalar input.
    """
    expected_type = pa.struct([pa.field('a', pa.int64()),
                               pa.field('b', pa.string()),
                               pa.field('c', pa.bool_())])
    data = [{'a': 5, 'b': 'foo', 'c': True},
            {'a': 6, 'b': 'bar', 'c': False}]
    arr = pa.array(data)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == data

    # With omitted values
    data = [{'a': 5, 'c': True},
            None,
            {},
            {'a': None, 'b': 'bar'}]
    expected = [{'a': 5, 'b': None, 'c': True},
                None,
                {'a': None, 'b': None, 'c': None},
                {'a': None, 'b': 'bar', 'c': None}]
    arr = pa.array(data)
    data_as_ndarray = np.empty(len(data), dtype=object)
    data_as_ndarray[:] = data
    # Convert from the object ndarray so that the equality check below
    # compares the list-based and ndarray-based conversions.
    arr2 = pa.array(data_as_ndarray)
    check_struct_type(arr.type, expected_type)
    assert arr.to_pylist() == expected
    assert arr.equals(arr2)

    # Nested
    # NOTE: expected_type is built here but only the round-trip through
    # to_pylist() is asserted for the nested case.
    expected_type = pa.struct([
        pa.field('a', pa.struct([pa.field('aa', pa.list_(pa.int64())),
                                 pa.field('ab', pa.bool_())])),
        pa.field('b', pa.string())])
    data = [{'a': {'aa': [5, 6], 'ab': True}, 'b': 'foo'},
            {'a': {'aa': None, 'ab': False}, 'b': None},
            {'a': None, 'b': 'bar'}]
    arr = pa.array(data)
    assert arr.to_pylist() == data

    # Edge cases
    arr = pa.array([{}])
    assert arr.type == pa.struct([])
    assert arr.to_pylist() == [{}]

    # Mixing structs and scalars is rejected
    with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)):
        pa.array([1, {'a': 2}])
def test_empty_table():
    """``Schema.empty_table`` returns a zero-row ``pa.Table`` with that schema."""
    one_field_schema = pa.schema([pa.field('oneField', pa.int64())])
    empty = one_field_schema.empty_table()

    assert isinstance(empty, pa.Table)
    assert empty.num_rows == 0
    assert empty.schema == one_field_schema
# NOTE(review): the next two statements are the tail of a test whose
# beginning (including the definitions of `original_index`, `new_index`
# and `e`) lies outside this excerpt.
original_index.update(new_index)
assert (
    str(e.value)
    == "Trying to update an index with the wrong column. Got `another_col` but expected `col`"
)


@pytest.mark.parametrize(
    "dtype",
    [
        pa.binary(),
        pa.bool_(),
        pa.date32(),
        pa.float32(),
        pa.float64(),
        pa.int64(),
        pa.int8(),
        pa.string(),
        pa.timestamp("ns"),
    ],
)
def test_index_empty(store, dtype):
    """Store an index with an empty ``index_dct`` and load it back.

    Runs once per parametrized Arrow ``dtype``.
    """
    storage_key = "dataset_uuid/some_index.parquet"
    index1 = ExplicitSecondaryIndex(column="col", index_dct={}, dtype=dtype, index_storage_key=storage_key)
    key1 = index1.store(store, "dataset_uuid")
    # Reload using only the column name and the key returned by store()
    index2 = ExplicitSecondaryIndex(column="col", index_storage_key=key1).load(store)
def test_safe_cast_nan_to_int_raises():
    """A safe cast of float data containing NaN to ``int64`` raises
    ``ArrowInvalid`` with a truncation message."""
    floats_with_nan = pa.array([np.nan, 1.])
    expect_truncation_error = pytest.raises(
        pa.ArrowInvalid, match='Floating point value truncated')
    with expect_truncation_error:
        floats_with_nan.cast(pa.int64(), safe=True)
def test_column_selection(tempdir):
    """Exercise ``ORCFile.read(columns=...)`` on a table with nested types.

    Columns are selected by name, by dotted path into structs, and by
    field index; unknown names and out-of-range indices must raise.
    """
    from pyarrow import orc

    # create a table with nested types
    nested_struct = pa.struct([
        pa.field('middle', pa.struct([pa.field('inner', pa.int64())])),
        pa.field('inner2', pa.int64()),
    ])
    struct_item = pa.struct([
        pa.field('inner1', pa.int64()),
        pa.field('inner2', pa.int64()),
    ])
    schema = pa.schema([
        pa.field('basic', pa.int32()),
        pa.field('list', pa.list_(pa.field('item', pa.int32()))),
        pa.field('struct', nested_struct),
        pa.field('list-struct', pa.list_(pa.field('item', struct_item))),
        pa.field('basic2', pa.int64()),
    ])
    column_values = [
        [0],
        [[1, 2]],
        [{"middle": {"inner": 3}, "inner2": 4}],
        [[{"inner1": 5, "inner2": 6}, {"inner1": 7, "inner2": 8}]],
        [9],
    ]
    table = pa.table(column_values, schema=schema)

    orc_path = str(tempdir / 'test.orc')
    orc.write_table(table, orc_path)
    reader = orc.ORCFile(orc_path)

    # default selecting all columns
    assert reader.read().equals(table)

    # selecting with columns names
    for names in (["basic", "basic2"], ["list", "struct", "basic2"]):
        assert reader.read(columns=names).equals(table.select(names))

    # using dotted paths
    only_inner = reader.read(columns=["struct.middle.inner"])
    assert only_inner.equals(pa.table({"struct": [{"middle": {"inner": 3}}]}))

    only_inner2 = reader.read(columns=["struct.inner2"])
    assert only_inner2.equals(pa.table({"struct": [{"inner2": 4}]}))

    list_and_struct = reader.read(
        columns=["list", "struct.middle.inner", "struct.inner2"])
    assert list_and_struct.equals(table.select(["list", "struct"]))

    list_struct_inner1 = reader.read(columns=["list-struct.inner1"])
    assert list_struct_inner1.equals(
        pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]}))

    # selecting with (Arrow-based) field indices
    by_index = reader.read(columns=[0, 4])
    assert by_index.equals(table.select(["basic", "basic2"]))
    by_index = reader.read(columns=[1, 2, 3])
    assert by_index.equals(table.select(["list", "struct", "list-struct"]))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        reader.read(columns=["wrong"])

    with pytest.raises(ValueError):
        reader.read(columns=[5])
from redvox.common import file_statistics as fs
from redvox.common.parallel_utils import maybe_parallel_map
from redvox.common.station import Station
from redvox.common.errors import RedVoxExceptions

# Arrow struct type with string "id" and "uuid" fields and a float64
# "start_time" field.
id_py_stct = pa.struct([
    ("id", pa.string()),
    ("uuid", pa.string()),
    ("start_time", pa.float64()),
])
# Arrow struct type with the metadata fields listed below (API versions,
# device make/model, OS, app, privacy flag, packet duration, description).
meta_py_stct = pa.struct([
    ("api", pa.float64()),
    ("sub_api", pa.float64()),
    ("make", pa.string()),
    ("model", pa.string()),
    ("os", pa.int64()),
    ("os_version", pa.string()),
    ("app", pa.string()),
    ("app_version", pa.string()),
    ("is_private", pa.bool_()),
    ("packet_duration_s", pa.float64()),
    ("station_description", pa.string()),
])

PERCENT_FREE_MEM_USE = .8  # Percentage of total free memory to use when creating stations (1. is 100%)


class ApiReader:
    """
    Reads data from api 900 or api 1000 format, converting all data read into RedvoxPacketM for ease of comparison and use.
def testIsListLike(self):
    """``arrow_util.is_list_like`` accepts list and large_list types and
    rejects binary, int64 and large_string."""
    list_like_types = [pa.list_(pa.int64()), pa.large_list(pa.int64())]
    for arrow_type in list_like_types:
        self.assertTrue(arrow_util.is_list_like(arrow_type))

    non_list_types = [pa.binary(), pa.int64(), pa.large_string()]
    for arrow_type in non_list_types:
        self.assertFalse(arrow_util.is_list_like(arrow_type))
def test_limited_iterator_size_underflow():
    """A ``size`` larger than the iterator's length still produces an
    array equal to one built from the same three values."""
    from_iterator = pa.array(iter(range(3)), type=pa.int64(), size=10)
    from_tuple = pa.array((0, 1, 2))
    assert from_iterator.equals(from_tuple)
# NOTE(review): the statements up to the decorator are the tail of a test
# whose beginning (including `a`, `eq` and `ne`) lies outside this excerpt.
b = pa.array([0, 2], type=pa.int64())
c = pa.array([0, 3], type=pa.int32())
d = pa.array([0, 2, 0, 3], type=pa.int32())
eq([a], [a])
ne([a], [b])
eq([a, c], [a, c])
eq([a, c], [d])
ne([c, a], [a, c])


@pytest.mark.parametrize(
    ('data', 'typ'),
    [
        ([True, False, True, True], pa.bool_()),
        ([1, 2, 4, 6], pa.int64()),
        ([1.0, 2.5, None], pa.float64()),
        (['a', None, 'b'], pa.string()),
        ([], pa.list_(pa.uint8())),
        ([[1, 2], [3]], pa.list_(pa.int64())),
        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
        ([(1, 'a'), (2, 'c'), None],
            pa.struct([pa.field('a', pa.int64()),
                       pa.field('b', pa.string())]))
    ]
)
def test_chunked_array_pickle(data, typ):
    # Split `data` into arrays of at most two elements each and combine
    # them into one chunked array of type `typ`.
    arrays = []
    while data:
        arrays.append(pa.array(data[:2], type=typ))
        data = data[2:]
    # NOTE(review): the excerpt ends here; any pickling assertions are
    # presumably in the part of the function not visible in this chunk.
    array = pa.chunked_array(arrays, type=typ)