def test_table_pickle():
    data = [
        pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()),
        pa.chunked_array([["some", "strings", None, ""]], type=pa.string()),
    ]
    schema = pa.schema([pa.field('ints', pa.uint32()),
                        pa.field('strs', pa.string())],
                       metadata={b'foo': b'bar'})
    table = pa.Table.from_arrays(data, schema=schema)

    result = pickle.loads(pickle.dumps(table))
    result._validate()
    assert result.equals(table)
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def _from_jvm_int_type(jvm_type):
    """
    Convert a JVM int type to its Python equivalent.

    Parameters
    ----------
    jvm_type : org.apache.arrow.vector.types.pojo.ArrowType$Int

    Returns
    -------
    typ : pyarrow.DataType
    """
    if jvm_type.isSigned:
        if jvm_type.bitWidth == 8:
            return pa.int8()
        elif jvm_type.bitWidth == 16:
            return pa.int16()
        elif jvm_type.bitWidth == 32:
            return pa.int32()
        elif jvm_type.bitWidth == 64:
            return pa.int64()
    else:
        if jvm_type.bitWidth == 8:
            return pa.uint8()
        elif jvm_type.bitWidth == 16:
            return pa.uint16()
        elif jvm_type.bitWidth == 32:
            return pa.uint32()
        elif jvm_type.bitWidth == 64:
            return pa.uint64()
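# Hedged usage sketch for _from_jvm_int_type. JvmIntStub below is a
# hypothetical stand-in for the real JPype-wrapped ArrowType$Int object; it
# only mimics the `bitWidth` and `isSigned` attributes the function reads.
class JvmIntStub:
    def __init__(self, bit_width, is_signed):
        self.bitWidth = bit_width
        self.isSigned = is_signed


assert _from_jvm_int_type(JvmIntStub(32, True)) == pa.int32()
assert _from_jvm_int_type(JvmIntStub(64, False)) == pa.uint64()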
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that casts between supported types don't
            # segfault on empty arrays
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]
    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    # np.str_ replaces the np.unicode alias removed in NumPy 1.24
    assert pa.from_numpy_dtype(np.str_) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def test_bit_width():
    for ty, expected in [(pa.bool_(), 1),
                         (pa.int8(), 8),
                         (pa.uint32(), 32),
                         (pa.float16(), 16),
                         (pa.decimal128(19, 4), 128),
                         (pa.binary(42), 42 * 8)]:
        assert ty.bit_width == expected

    for ty in [pa.binary(), pa.string(), pa.list_(pa.int16())]:
        with pytest.raises(ValueError, match="fixed width"):
            ty.bit_width
def test_table(n, types=None, offset=None, length=None, nullable=True):
    if types is None:
        types = [
            pyarrow.null(),
            pyarrow.bool_(),
            pyarrow.int8(),
            pyarrow.int16(),
            pyarrow.int32(),
            pyarrow.int64(),
            pyarrow.uint8(),
            pyarrow.uint16(),
            pyarrow.uint32(),
            pyarrow.uint64(),
            pyarrow.float16(),
            pyarrow.float32(),
            pyarrow.float64(),
            pyarrow.date32(),
            pyarrow.date64(),
            pyarrow.timestamp('s'),
            pyarrow.timestamp('ms'),
            pyarrow.timestamp('us'),
            pyarrow.timestamp('ns'),
            pyarrow.time32('s'),
            pyarrow.time32('ms'),
            pyarrow.time64('us'),
            pyarrow.time64('ns'),
            pyarrow.string(),
            pyarrow.binary(),
            pyarrow.binary(4),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), True),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), True),
            pyarrow.dictionary(pyarrow.int32(), pyarrow.string(), False),
            pyarrow.dictionary(pyarrow.int64(), pyarrow.int64(), False),
            pyarrow.list_(pyarrow.int32()),
            pyarrow.struct([pyarrow.field('int32', pyarrow.int32())]),
            pyarrow.list_(
                pyarrow.struct([pyarrow.field('int32', pyarrow.int32())])),
            pyarrow.struct(
                [pyarrow.field('int32', pyarrow.list_(pyarrow.int32()))]),
        ]
    data = list()
    for t in types:
        name = str(t)
        array = TestArrayGenerator(n, t, False).array
        if offset is not None:
            array = array.slice(offset, length)
        data.append(pyarrow.column(name, array))
        if nullable:
            name = str(t) + ' (null)'
            array = TestArrayGenerator(n, t, True).array
            if offset is not None:
                array = array.slice(offset, length)
            data.append(pyarrow.column(name, array))
    return pyarrow.Table.from_arrays(data)
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
        ('duration[s]', pa.duration('s')),
        ('duration[ms]', pa.duration('ms')),
        ('duration[us]', pa.duration('us')),
        ('duration[ns]', pa.duration('ns')),
        ('month_day_nano_interval', pa.month_day_nano_interval()),
    ]
    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
def test_update_arrow_update_float_schema_with_uint32(self, util):
    array = [random.randint(0, 2000000) for i in range(100)]
    data = pd.DataFrame({"a": np.array(array, dtype=np.uint32)})
    schema = pa.schema({"a": pa.uint32()})
    arrow = util.make_arrow_from_pandas(data, schema)
    tbl = Table({"a": float})
    tbl.update(arrow)
    assert tbl.view().to_dict()["a"] == array
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy-array columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()), ('i4', pa.int32()),
              ('i8', pa.int64()), ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))
    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
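# Hedged usage sketch for dataframe_with_arrays: round-trip the generated
# frame through Arrow and check the schema survives. Assumes pyarrow's pandas
# integration is available in this environment.
df, schema = dataframe_with_arrays()
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
assert table.schema.equals(schema)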
def test_context():
    # Create
    platform = pf.Platform("echo")
    # Init
    platform.init()

    # Create a schema with some stuff
    fields = [
        pa.field("a", pa.uint64(), False),
        pa.field("b", pa.string(), False),
        pa.field("c", pa.uint64(), True),
        pa.field("d", pa.list_(pa.field("e", pa.uint32(), True)), False)
    ]
    schema = pa.schema(fields)

    a = pa.array([1, 2, 3, 4], type=pa.uint64())
    b = pa.array(["hello", "world", "fletcher", "arrow"], type=pa.string())
    c = pa.array([5, 6, 7, 8],
                 mask=np.array([True, False, True, True]),
                 type=pa.uint64())
    d = pa.array([[9, 10, 11, 12], [13, 14], [15, 16, 17], [18]],
                 type=pa.list_(pa.uint32()))
    f = pa.array([19, 20, 21, 22], type=pa.uint32())
    g = pa.array([23, 24, 25, 26], type=pa.uint32())

    rb = pa.RecordBatch.from_arrays([a, b, c, d], schema)

    context = pf.Context(platform)
    context.queue_record_batch(rb)
    context.queue_array(f)
    context.queue_array(g, field=pa.field("g", pa.uint32(), False))

    # Write buffers
    context.enable()

    # Terminate
    platform.terminate()
def normalize_arrow_dtype(dtype):  # noqa: C901
    if dtype in ['bool']:
        return pa.bool_()
    if dtype in ['int8_t', 'int8', 'byte']:
        return pa.int8()
    if dtype in ['uint8_t', 'uint8', 'char']:
        return pa.uint8()
    if dtype in ['int16_t', 'int16', 'short']:
        return pa.int16()
    if dtype in ['uint16_t', 'uint16']:
        return pa.uint16()
    if dtype in ['int32_t', 'int32', 'int']:
        return pa.int32()
    if dtype in ['uint32_t', 'uint32']:
        return pa.uint32()
    if dtype in ['int64_t', 'int64', 'long']:
        return pa.int64()
    if dtype in ['uint64_t', 'uint64']:
        return pa.uint64()
    if dtype in ['half']:
        return pa.float16()
    if dtype in ['float', 'float32']:
        return pa.float32()
    if dtype in ['double', 'float64']:
        return pa.float64()
    if dtype in ['string', 'std::string', 'std::__1::string', 'str']:
        return pa.large_string()
    if dtype in ['large_list<item: int32>']:
        return pa.large_list(pa.int32())
    if dtype in ['large_list<item: uint32>']:
        return pa.large_list(pa.uint32())
    if dtype in ['large_list<item: int64>']:
        return pa.large_list(pa.int64())
    if dtype in ['large_list<item: uint64>']:
        return pa.large_list(pa.uint64())
    if dtype in ['large_list<item: float>']:
        # pa.float() doesn't exist; float32 is the Arrow `float` type
        return pa.large_list(pa.float32())
    if dtype in ['large_list<item: double>']:
        # pa.double() doesn't exist; float64 is the Arrow `double` type
        return pa.large_list(pa.float64())
    if dtype in ['null', 'NULL', 'None', None]:
        return pa.null()
    raise ValueError('Unsupported data type: %s' % dtype)
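# Hedged usage sketch for normalize_arrow_dtype; the input spellings below
# mirror the aliases handled in the function.
assert normalize_arrow_dtype('uint32') == pa.uint32()
assert normalize_arrow_dtype('std::string') == pa.large_string()
assert normalize_arrow_dtype('large_list<item: double>') == pa.large_list(pa.float64())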
def list_array_builder(client, array, builder):
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::LargeListArray'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset
    if isinstance(array, pa.ListArray):
        # A plain ListArray carries 32-bit offsets, but the vineyard
        # LargeListArray layout expects 64-bit ones: reinterpret the raw
        # offsets buffer as uint32 values and widen them to uint64.
        buffer = array.buffers()[1]
        length = len(buffer) // (pa.uint32().bit_width // 8)
        offset_array = pa.Array.from_buffers(pa.uint32(), length, [None, buffer])
        offset_array = offset_array.cast(pa.uint64())
        offset_buffer = offset_array.buffers()[1]
    else:  # is pa.LargeListArray, offsets are already 64-bit
        offset_buffer = array.buffers()[1]
    meta.add_member('null_bitmap_',
                    buffer_builder(client, array.buffers()[0], builder))
    meta.add_member('buffer_offsets_',
                    buffer_builder(client, offset_buffer, builder))
    meta.add_member('values_', builder.run(client, array.values))
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
def read_product(self, keep_groups=None, drop_groups=None,
                 keep_modules=None, drop_modules=None):
    prod_cols = [
        'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
        'product_module_descr', 'product_group_code', 'product_group_descr',
        'brand_code_uc', 'brand_descr', 'multi', 'size1_code_uc',
        'size1_amount', 'size1_units', 'dataset_found_uc',
        'size1_change_flag_uc'
    ]
    prod_dict = {
        'upc': pa.int64(),
        'upc_ver_uc': pa.int8(),
        'product_module_code': pa.uint16(),
        'brand_code_uc': pa.uint32(),
        'multi': pa.uint16(),
        'size1_code_uc': pa.uint16()
    }
    prod_df = csv.read_csv(
        self.product_file,
        read_options=csv.ReadOptions(encoding='latin'),
        parse_options=csv.ParseOptions(delimiter='\t'),
        convert_options=csv.ConvertOptions(column_types=prod_dict,
                                           include_columns=prod_cols)
    ).to_pandas()

    if keep_groups:
        prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
    if drop_groups:
        prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
    if keep_modules:
        prod_df = prod_df[prod_df['product_module_code'].isin(keep_modules)]
    if drop_modules:
        prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules)]

    # dictionary encoding to save space
    prod_df['size1_units'] = prod_df['size1_units'].astype('category')
    prod_df['product_module_descr'] = \
        prod_df['product_module_descr'].astype('category')
    prod_df['product_group_code'] = \
        prod_df['product_group_code'].astype('category')

    # clean up product info
    prod_df['upc_descr'] = \
        prod_df['upc_descr'].str.strip().str.replace('RTE', '')
    prod_df['brand_descr'] = \
        prod_df['brand_descr'].str.strip().str.replace('CTL BR',
                                                       'Private Label')

    self.prod_df = prod_df.copy()
    return
def test_format_uint32_array():
    assert format_number_array(
        pa.array(
            [1, 1, 2, 2, 3_000, 3_000, 4_000_000, 4_000_000, None, None, 6, 6],
            pa.uint32(),
        ),
        parse_number_format("{:,d}"),
    ).to_pylist() == [
        "1",
        "1",
        "2",
        "2",
        "3,000",
        "3,000",
        "4,000,000",
        "4,000,000",
        None,
        None,
        "6",
        "6",
    ]
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]
    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
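# Hedged usage sketch for _map_arrow_type. The expected Java type names come
# straight from the mapping table above; the tz-aware timestamp falls through
# to the isinstance(TimestampType) branch.
assert _map_arrow_type(pa.int64()) == {"deephaven:type": "long"}
assert _map_arrow_type(pa.timestamp('ns', tz='UTC')) == \
    {"deephaven:type": "io.deephaven.time.DateTime"}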
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ]
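# Hedged usage sketch: this assumes the module-level `types` is numba's type
# system and `string_type` is defined elsewhere in the original module, so
# only a numeric type is probed here.
assert _get_numba_typ_from_pa_typ(pa.uint32()) == types.uint32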
def string_array_builder(client, array, builder):
    meta = ObjectMeta()
    meta['typename'] = 'vineyard::BaseBinaryArray<arrow::LargeStringArray>'
    meta['length_'] = len(array)
    meta['null_count_'] = array.null_count
    meta['offset_'] = array.offset
    null_bitmap = buffer_builder(client, array.buffers()[0], builder)
    if isinstance(array, pa.StringArray):
        # A plain StringArray carries 32-bit offsets, but the vineyard large
        # string layout expects 64-bit ones: reinterpret the raw offsets
        # buffer as uint32 values and widen them to uint64.
        buffer = array.buffers()[1]
        length = len(buffer) // (pa.uint32().bit_width // 8)
        offset_array = pa.Array.from_buffers(pa.uint32(), length, [None, buffer])
        offset_array = offset_array.cast(pa.uint64())
        offset_buffer = offset_array.buffers()[1]
    else:  # is pa.LargeStringArray, offsets are already 64-bit
        offset_buffer = array.buffers()[1]
    buffer_offsets = buffer_builder(client, offset_buffer, builder)
    buffer_data = buffer_builder(client, array.buffers()[2], builder)
    meta.add_member('buffer_offsets_', buffer_offsets)
    meta.add_member('buffer_data_', buffer_data)
    meta.add_member('null_bitmap_', null_bitmap)
    meta['nbytes'] = array.nbytes
    return client.create_metadata(meta)
def test_filter():
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0 * i for i in range(10000)]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field_by_name("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
def test_nested_ndarray_different_dtypes():
    data = [
        np.array([1, 2, 3], dtype='int64'),
        None,
        np.array([4, 5, 6], dtype='uint32')
    ]

    arr = pa.array(data)
    expected = pa.array([[1, 2, 3], None, [4, 5, 6]],
                        type=pa.list_(pa.int64()))
    assert arr.equals(expected)

    t2 = pa.list_(pa.uint32())
    arr2 = pa.array(data, type=t2)
    expected2 = expected.cast(t2)
    assert arr2.equals(expected2)
def test2DSparseTensor(self):
    tensor_representation = text_format.Parse(
        """
        sparse_tensor {
          value_column_name: "values"
          index_column_names: ["d0", "d1"]
          dense_shape {
            dim {
              size: 10
            }
            dim {
              size: 20
            }
          }
        }
        """, schema_pb2.TensorRepresentation())
    record_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([[1], None, [2], [3, 4, 5], []],
                     type=pa.list_(pa.int64())),
            # Also test that the index column can be of an integral type other
            # than int64.
            pa.array([[9], None, [9], [7, 8, 9], []],
                     type=pa.list_(pa.uint32())),
            pa.array([[0], None, [0], [0, 1, 2], []],
                     type=pa.list_(pa.int64()))
        ], ["values", "d0", "d1"])
    adapter = tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(record_batch.schema,
                                           {"output": tensor_representation}))
    converted = adapter.ToBatchTensors(record_batch)
    self.assertLen(converted, 1)
    self.assertIn("output", converted)
    actual_output = converted["output"]
    self.assertIsInstance(actual_output,
                          (tf.SparseTensor, tf.compat.v1.SparseTensorValue))
    self.assertSparseAllEqual(
        tf.compat.v1.SparseTensorValue(
            dense_shape=[5, 10, 20],
            indices=[[0, 9, 0], [2, 9, 0], [3, 7, 0], [3, 8, 1], [3, 9, 2]],
            values=tf.convert_to_tensor([1, 2, 3, 4, 5], dtype=tf.int64)),
        actual_output)
    self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
def test_is_integer():
    signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for t in signed_ints + unsigned_ints:
        assert types.is_integer(t)

    for t in signed_ints:
        assert types.is_signed_integer(t)
        assert not types.is_unsigned_integer(t)

    for t in unsigned_ints:
        assert types.is_unsigned_integer(t)
        assert not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
def _create_parquet_schema(dtypes):
    """Create parquet schema from Pandas dtypes

    Args:
        dtypes: A dict or Series of dtypes

    Returns:
        pyarrow.Schema
    """
    import pyarrow as pa

    dtypes = dict(dtypes)
    fields = []
    for varname, vartype in dtypes.items():
        if vartype == np.float16:
            fields.append(pa.field(varname, pa.float16()))
        elif vartype == np.float32:
            fields.append(pa.field(varname, pa.float32()))
        elif vartype == np.float64:
            fields.append(pa.field(varname, pa.float64()))
        elif vartype == np.int8:
            fields.append(pa.field(varname, pa.int8()))
        elif vartype == np.int16:
            fields.append(pa.field(varname, pa.int16()))
        elif vartype == np.int32:
            fields.append(pa.field(varname, pa.int32()))
        elif vartype == np.int64:
            fields.append(pa.field(varname, pa.int64()))
        elif vartype == np.uint8:
            fields.append(pa.field(varname, pa.uint8()))
        elif vartype == np.uint16:
            fields.append(pa.field(varname, pa.uint16()))
        elif vartype == np.uint32:
            fields.append(pa.field(varname, pa.uint32()))
        elif vartype == np.uint64:
            fields.append(pa.field(varname, pa.uint64()))
        elif vartype == np.bool_:
            fields.append(pa.field(varname, pa.bool_()))
        elif (vartype == object) | (vartype.name == 'category'):
            fields.append(pa.field(varname, pa.string()))
        elif np.issubdtype(vartype, np.datetime64):
            fields.append(pa.field(varname, pa.timestamp('ns')))

    assert len(dtypes) == len(fields)
    schema = pa.schema(fields)
    return schema
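# Hedged usage sketch for _create_parquet_schema, using a small dtypes
# mapping made up for illustration.
dtypes = pd.Series({'x': np.dtype('uint32'), 'y': np.dtype('float64')})
schema = _create_parquet_schema(dtypes)
assert schema.field('x').type == pa.uint32()
assert schema.field('y').type == pa.float64()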
def test_filter():
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])],
                                 ['a'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    filter = gandiva.make_filter(table.schema, condition)
    # Gandiva generates compute kernel function named `@expr_X`
    assert filter.llvm_ir.find("@expr_") != -1

    result = filter.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
def coerce_arrow(array: pa.Array, rechunk: bool = True) -> pa.Array:
    # note: Decimal256 could not be cast to float
    if isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1 and rechunk:
        # small integer keys can often not be combined, so let's already cast
        # to the uint32 used by polars
        if pa.types.is_dictionary(array.type) and (
            pa.types.is_int8(array.type.index_type)
            or pa.types.is_uint8(array.type.index_type)
            or pa.types.is_int16(array.type.index_type)
            or pa.types.is_uint16(array.type.index_type)
            or pa.types.is_int32(array.type.index_type)
        ):
            array = pa.compute.cast(
                array, pa.dictionary(pa.uint32(), pa.large_string())
            ).combine_chunks()
    return array
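# Hedged usage sketch for coerce_arrow, exercising the Decimal128 branch: a
# decimal array is cast down to float64 before being handed over.
from decimal import Decimal

dec = pa.array([Decimal("1.10"), Decimal("2.25")])
assert coerce_arrow(dec).type == pa.float64()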
def test_convert_uint8_uint16_uint32():
    # parquet only stores int32/int64 values natively. These are upcast to
    # be encoded.
    _test_convert_via_arrow(
        pyarrow.table({
            "u8": pyarrow.array([1, 138, None], type=pyarrow.uint8()),
            "u16": pyarrow.array([1, 38383, None], type=pyarrow.uint16()),
            "u32": pyarrow.array([1, 4294967291, None], type=pyarrow.uint32()),
        }),
        "u8,u16,u32\r\n1,1,1\r\n138,38383,4294967291\r\n,,",
        [
            dict(u8=1, u16=1, u32=1),
            dict(u8=138, u16=38383, u32=4294967291),
            dict(u8=None, u16=None, u32=None),
        ],
    )
def write_case1_pyarrow(size=1, page_version=1):
    data, path = case1(size)

    fields = [
        pa.field('int64', pa.int64()),
        pa.field('float64', pa.float64()),
        pa.field('string', pa.utf8()),
        pa.field('bool', pa.bool_()),
        pa.field('date', pa.timestamp('ms')),
        pa.field('uint32', pa.uint32()),
    ]
    schema = pa.schema(fields)

    base_path = f"{PYARROW_PATH}/v{page_version}"
    t = pa.table(data, schema=schema)
    os.makedirs(base_path, exist_ok=True)
    pa.parquet.write_table(t, f"{base_path}/{path}",
                           data_page_version=f"{page_version}.0")
def test_validate_schema_write_table(tempdir):
    # ARROW-2926
    simple_fields = [
        pa.field('POS', pa.uint32()),
        pa.field('desc', pa.string())
    ]

    simple_schema = pa.schema(simple_fields)

    # simple_table schema does not match simple_schema
    simple_from_array = [pa.array([1]), pa.array(['bla'])]
    simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])

    path = tempdir / 'simple_validate_schema.parquet'

    with pq.ParquetWriter(path, simple_schema, version='2.0',
                          compression='snappy', flavor='spark') as w:
        with pytest.raises(ValueError):
            w.write_table(simple_table)
def test_unsigned_roundtrip(self, duckdb_cursor):
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data',
        'unsigned.parquet')
    data = (pyarrow.array([1, 2, 3, 4, 5, 255], type=pyarrow.uint8()),
            pyarrow.array([1, 2, 3, 4, 5, 65535], type=pyarrow.uint16()),
            pyarrow.array([1, 2, 3, 4, 5, 4294967295],
                          type=pyarrow.uint32()),
            pyarrow.array([1, 2, 3, 4, 5, 18446744073709551615],
                          type=pyarrow.uint64()))

    tbl = pyarrow.Table.from_arrays([data[0], data[1], data[2], data[3]],
                                    ['a', 'b', 'c', 'd'])
    pyarrow.parquet.write_table(tbl, parquet_filename)

    cols = 'a, b, c, d'

    unsigned_parquet_table = pyarrow.parquet.read_table(parquet_filename)
    unsigned_parquet_table.validate(full=True)
    rel_from_arrow = duckdb.arrow(unsigned_parquet_table).project(cols).arrow()
    rel_from_arrow.validate(full=True)

    rel_from_duckdb = duckdb.from_parquet(parquet_filename).project(cols).arrow()
    rel_from_duckdb.validate(full=True)

    assert rel_from_arrow.equals(rel_from_duckdb, check_metadata=True)

    con = duckdb.connect()
    con.execute(
        "select NULL c_null, (c % 4 = 0)::bool c_bool, "
        "(c%128)::tinyint c_tinyint, c::smallint*1000 c_smallint, "
        "c::integer*100000 c_integer, c::bigint*1000000000000 c_bigint, "
        "c::float c_float, c::double c_double, 'c_' || c::string c_string "
        "from (select case when range % 2 == 0 then range else null end as c "
        "from range(-10000, 10000)) sq")
    arrow_result = con.fetch_arrow_table()
    arrow_result.validate(full=True)
    arrow_result.combine_chunks()
    arrow_result.validate(full=True)

    round_tripping = duckdb.from_arrow_table(arrow_result).to_arrow_table()
    round_tripping.validate(full=True)

    assert round_tripping.equals(arrow_result, check_metadata=True)
def test_integer_no_nulls(self):
    data = {}
    fields = []

    numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()),
                    ('i4', A.int32()), ('i8', A.int64()),
                    ('u1', A.uint8()), ('u2', A.uint16()),
                    ('u4', A.uint32()), ('u8', A.uint64())]
    num_values = 100

    for dtype, arrow_dtype in numpy_dtypes:
        info = np.iinfo(dtype)
        values = np.random.randint(info.min,
                                   min(info.max, np.iinfo('i8').max),
                                   size=num_values)
        data[dtype] = values.astype(dtype)
        fields.append(A.Field.from_py(dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = A.Schema.from_fields(fields)
    self._check_pandas_roundtrip(df, expected_schema=schema)
def get_schemas():
    variant_table = pa.schema([
        ('vId', pa.int64()),
        ('chrom', pa.string()),
        ('pos', pa.int32()),
        ('ref', pa.string()),
        ('alt', pa.string())
    ])

    annotations = pa.schema([
        ('vId', pa.int64()),
        ('geneSymbol', pa.string())
    ])

    gt_table = pa.schema([
        ('vId', pa.uint64()),
        ('callsetId', pa.uint32()),
        ('genotype', pa.uint8())
    ])

    return {"variants": variant_table,
            "annotations": annotations,
            "gts": gt_table}
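# Hedged usage sketch for get_schemas, assuming a pyarrow version that
# provides Schema.empty_table(): build an empty table per schema and verify
# the schema round-trips.
schemas = get_schemas()
for name, schema in schemas.items():
    empty = schema.empty_table()
    assert empty.schema.equals(schema)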
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.string())
    assert ty0.index_type == pa.int32()
    assert ty0.value_type == pa.string()
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.int8(), pa.float64(), ordered=True)
    assert ty1.index_type == pa.int8()
    assert ty1.value_type == pa.float64()
    assert ty1.ordered is True

    # construct from non-arrow objects
    ty2 = pa.dictionary('int8', 'string')
    assert ty2.index_type == pa.int8()
    assert ty2.value_type == pa.string()
    assert ty2.ordered is False

    # invalid index type raises
    with pytest.raises(TypeError):
        pa.dictionary(pa.string(), pa.int64())
    with pytest.raises(TypeError):
        pa.dictionary(pa.uint32(), pa.string())
def test_integer_no_nulls(self):
    data = OrderedDict()
    fields = []

    numpy_dtypes = [
        ('i1', pa.int8()), ('i2', pa.int16()),
        ('i4', pa.int32()), ('i8', pa.int64()),
        ('u1', pa.uint8()), ('u2', pa.uint16()),
        ('u4', pa.uint32()), ('u8', pa.uint64()),
        ('longlong', pa.int64()), ('ulonglong', pa.uint64())
    ]
    num_values = 100

    for dtype, arrow_dtype in numpy_dtypes:
        info = np.iinfo(dtype)
        values = np.random.randint(max(info.min, np.iinfo(np.int_).min),
                                   min(info.max, np.iinfo(np.int_).max),
                                   size=num_values)
        data[dtype] = values.astype(dtype)
        fields.append(pa.field(dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = pa.schema(fields)
    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_tensor_base_object():
    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
    n = sys.getrefcount(tensor)
    array = tensor.to_numpy()
    assert sys.getrefcount(tensor) == n + 1


@pytest.mark.parametrize('dtype_str,arrow_type', [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
])
def test_tensor_numpy_roundtrip(dtype_str, arrow_type):
    dtype = np.dtype(dtype_str)
    data = (100 * np.random.randn(10, 4)).astype(dtype)

    tensor = pa.Tensor.from_numpy(data)
    assert tensor.type == arrow_type

    repr(tensor)

    result = tensor.to_numpy()
# The specifications were created using:
#
#   om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
#   field = …  # Code to instantiate the field
#   jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
null_type = st.just(pa.null())
bool_type = st.just(pa.bool_())

binary_type = st.just(pa.binary())
string_type = st.just(pa.string())

signed_integer_types = st.sampled_from([
    pa.int8(),
    pa.int16(),
    pa.int32(),
    pa.int64()
])
unsigned_integer_types = st.sampled_from([
    pa.uint8(),
    pa.uint16(),
    pa.uint32(),
    pa.uint64()
])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(),
    pa.float32(),
    pa.float64()
])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)
numeric_types = st.one_of(integer_types, floating_types, decimal_type)
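# Hedged usage sketch: the test below is made up for illustration; it consumes
# the strategies above via hypothesis and relies only on pyarrow DataTypes
# being picklable.
from hypothesis import given


@given(ty=numeric_types)
def test_numeric_type_pickle_roundtrip(ty):
    import pickle
    assert pickle.loads(pickle.dumps(ty)) == ty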
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
import datetime
import decimal
import itertools

import numpy as np
import six
import pytz


int_type_pairs = [
    (np.int8, pa.int8()),
    (np.int16, pa.int16()),
    (np.int32, pa.int32()),
    (np.int64, pa.int64()),
    (np.uint8, pa.uint8()),
    (np.uint16, pa.uint16()),
    (np.uint32, pa.uint32()),
    (np.uint64, pa.uint64())]


np_int_types, _ = zip(*int_type_pairs)


class StrangeIterable:
    def __init__(self, lst):
        self.lst = lst

    def __iter__(self):
        return self.lst.__iter__()


def check_struct_type(ty, expected):
    result = pickle.loads(pickle.dumps(array))
    assert array.equals(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')