def _pandas_series_to_arrow(
    values: pd.Series | pd.DatetimeIndex,
    nan_to_none: bool = True,
    min_len: int | None = None,
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        In case of null values, this length will be used to create a dummy
        all-null array (with all values set to null).

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        first_non_none = _get_first_non_none(values.values)  # type: ignore[arg-type]

        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
        if first_non_none is None:
            return pa.nulls(min_len, pa.large_utf8())

        return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())
    # simplest solution is to cast to (large)-string arrays
    # this is a copy and expensive
    elif isinstance(array.type, pa.DictionaryType):
        if pa.types.is_string(array.type.value_type):
            array = pa.compute.cast(array, pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pa.large_list requires a value type; reuse the list's value type
            array = pa.compute.cast(array, pa.large_list(array.type.value_type))
        array = array.combine_chunks()
    return array
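# A hypothetical usage sketch (not from the source): coerce_arrow turns a
# dictionary-encoded string column into a plain large_utf8 array, assuming a
# pyarrow version that supports casting dictionary<string> to large_utf8.
import pyarrow as pa

dict_arr = pa.array(["x", "y", "x"]).dictionary_encode()
coerced = coerce_arrow(dict_arr)
assert coerced.type == pa.large_utf8()
assert coerced.to_pylist() == ["x", "y", "x"]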
def from_pandas(df: "pandas.DataFrame", rechunk: bool = True) -> "DataFrame":
    """
    Convert from a pandas DataFrame to a polars DataFrame.

    Parameters
    ----------
    df
        DataFrame to convert
    rechunk
        Make sure that all data is contiguous.

    Returns
    -------
    A Polars DataFrame
    """
    # Note: we first tried to infer the schema via pyarrow and then modify the schema if needed.
    # However arrow 3.0 determines the type of a string like this:
    #       pa.array(array).type
    # needlessly allocating and failing when the string is too large for the string dtype.
    data = {}

    for (name, dtype) in zip(df.columns, df.dtypes):
        if dtype == "object" and isinstance(df[name][0], str):
            data[name] = pa.array(df[name], pa.large_utf8())
        elif dtype == "datetime64[ns]":
            data[name] = pa.compute.cast(
                pa.array(np.array(df[name].values, dtype="datetime64[ms]")),
                pa.date64(),
            )
        else:
            data[name] = pa.array(df[name])

    table = pa.table(data)
    return from_arrow_table(table, rechunk)
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to large_list,
            # so we convert to large list ourselves and do the re-alloc on the
            # polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)

        array = array.combine_chunks()
    return array
def test_column_types_dict(self):
    # Ask for dict-encoded column types in ConvertOptions
    column_types = [('a', pa.dictionary(pa.int32(), pa.utf8())),
                    ('b', pa.dictionary(pa.int32(), pa.int64())),
                    ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
                    ('d', pa.dictionary(pa.int32(), pa.large_utf8()))]
    opts = ConvertOptions(column_types=dict(column_types))
    rows = (b"a,b,c,d\n"
            b"abc,123456,1.0,zz\n"
            b"defg,123456,0.5,xx\n"
            b"abc,N/A,1.0,xx\n")
    table = self.read_bytes(rows, convert_options=opts)

    schema = pa.schema(column_types)
    expected = {
        'a': ["abc", "defg", "abc"],
        'b': [123456, 123456, None],
        'c': [Decimal("1.00"), Decimal("0.50"), Decimal("1.00")],
        'd': ["zz", "xx", "xx"],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Unsupported index type
    column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))
    opts = ConvertOptions(column_types=dict(column_types))
    with pytest.raises(NotImplementedError):
        table = self.read_bytes(rows, convert_options=opts)
def test_type_ids():
    # Having this fixed is very important because internally we rely on this id
    # to parse from python
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        In case of null values, this length will be used to create a dummy
        f64 array (with all values set to null).

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        if isinstance(values.values[0], str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

        # array is null array, we set to a float64 array
        if values.values[0] is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        else:
            return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
def _resize_arrow_type(t):
    if t == pa.string():
        return pa.large_string()
    if t == pa.utf8():
        return pa.large_utf8()
    if t == pa.binary():
        return pa.large_binary()
    if isinstance(t, pa.lib.ListType):
        return pa.large_list(t.value_type)
    return t
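# A small usage sketch (hypothetical, not from the source): applying
# _resize_arrow_type over a schema's fields upsizes every 32-bit-offset type
# to its 64-bit-offset counterpart.
import pyarrow as pa

schema = pa.schema([("s", pa.utf8()), ("xs", pa.list_(pa.int32()))])
resized = pa.schema([pa.field(f.name, _resize_arrow_type(f.type)) for f in schema])
assert resized.field("s").type == pa.large_string()
assert resized.field("xs").type == pa.large_list(pa.int32())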
def test_arrow_dict_to_polars() -> None:
    pa_dict = pa.DictionaryArray.from_arrays(
        indices=np.array([0, 1, 2, 3, 1, 0, 2, 3, 3, 2]),
        dictionary=np.array(["AAA", "BBB", "CCC", "DDD"]),
    ).cast(pa.large_utf8())

    s = pl.Series(
        name="pa_dict",
        values=["AAA", "BBB", "CCC", "DDD", "BBB", "AAA", "CCC", "DDD", "DDD", "CCC"],
    )

    assert s.series_equal(pl.Series("pa_dict", pa_dict))
def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
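# Illustrative calls (assumed behavior based on the mapping above): known types
# resolve to their Deephaven type string, while tz-aware timestamps miss the
# tz=None dict entries and fall through to the DateTime branch.
assert _map_arrow_type(pa.int32()) == {"deephaven:type": "int"}
assert _map_arrow_type(pa.timestamp("ns", tz="UTC")) == {
    "deephaven:type": "io.deephaven.time.DateTime"
}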
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"]
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow array.
    """
    dtype = values.dtype
    if dtype == "datetime64[ns]":
        # We first cast to ms because that's the unit of Date64.
        # Then we cast via int64 to date64. Casting directly to Date64 leads to
        # loss of time information: https://github.com/ritchie46/polars/issues/476
        arr = pa.array(np.array(values.values, dtype="datetime64[ms]"))
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.date64())
    elif dtype == "object" and len(values) > 0 and isinstance(values.iloc[0], str):
        return pa.array(values, pa.large_utf8())
    else:
        return pa.array(values)
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        In case of null values, this length will be used to create a dummy
        f64 array (with all values set to null).

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "datetime64[ns]":
        # We first cast to ms because that's the unit of Datetime.
        # Then we cast via int64 to datetime. Casting directly to Datetime leads to
        # loss of time information: https://github.com/pola-rs/polars/issues/476
        arr = pa.array(
            np.array(values.values, dtype="datetime64[ms]"), from_pandas=nan_to_none
        )
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.timestamp("ms"))
    elif dtype == "object" and len(values) > 0:
        if isinstance(values.values[0], str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

        # array is null array, we set to a float64 array
        if values.values[0] is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        else:
            return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
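# A minimal sketch (hypothetical data) of the datetime path above: the series
# is truncated to millisecond precision and comes back as timestamp("ms"),
# preserving the time-of-day that a direct cast would drop.
import pandas as pd
import pyarrow as pa

s = pd.Series(pd.to_datetime(["2021-01-01 12:34:56.789"]))
arr = _pandas_series_to_arrow(s)
assert arr.type == pa.timestamp("ms")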
def from_pandas(
    df: "pandas.DataFrame", rechunk: bool = True  # noqa: F821
) -> "DataFrame":
    """
    Convert from a pandas DataFrame to a polars DataFrame.

    Parameters
    ----------
    df
        DataFrame to convert
    rechunk
        Make sure that all data is contiguous.

    Returns
    -------
    A Polars DataFrame
    """
    # Note: we first tried to infer the schema via pyarrow and then modify the schema if needed.
    # However arrow 3.0 determines the type of a string like this:
    #       pa.array(array).type
    # needlessly allocating and failing when the string is too large for the string dtype.
    data = {}

    for (name, dtype) in zip(df.columns, df.dtypes):
        if dtype == "object" and isinstance(df[name][0], str):
            data[name] = pa.array(df[name], pa.large_utf8())
        elif dtype == "datetime64[ns]":
            # We first cast to ms because that's the unit of Date64.
            # Then we cast via int64 to date64. Casting directly to Date64 leads to
            # loss of time information: https://github.com/ritchie46/polars/issues/476
            arr = pa.array(np.array(df[name].values, dtype="datetime64[ms]"))
            arr = pa.compute.cast(arr, pa.int64())
            data[name] = pa.compute.cast(arr, pa.date64())
        else:
            data[name] = pa.array(df[name])

    table = pa.table(data)
    return from_arrow(table, rechunk)
def convert_string_column(col: ColumnObject) -> pa.Array:
    """
    Convert a string column to an Arrow array.
    """
    # Missing
    if col.null_count > 0:
        if col.describe_null != (3, 0):
            raise TypeError("Only support arrow style mask data")

    # Retrieve the data buffers
    buffers = col.get_buffers()

    dbuffer, bdtype = buffers["data"]  # buffer containing the UTF-8 code units
    obuffer, odtype = buffers["offsets"]  # buffer containing the index offsets demarcating the beginning and end of each string
    mbuffer, mdtype = buffers["validity"]  # buffer indicating the presence of missing values

    # Convert the buffers to NumPy arrays
    dt = (_DtypeKind.UINT, 8, None, None)  # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    dbuf = buffer_to_ndarray(dbuffer, dt)

    obuf = buffer_to_ndarray(obuffer, odtype)
    mbuf = buffer_to_ndarray(mbuffer, mdtype)

    # not sure what the best way to communicate the two types of strings is
    if obuffer._x.dtype == "int64":
        arrow_type = pa.large_utf8()
    elif obuffer._x.dtype == "int32":
        arrow_type = pa.utf8()
    else:
        raise TypeError(f"unsupported offsets dtype: {obuffer._x.dtype}")
    length = obuf.size - 1

    buffers = [None, pa.py_buffer(obuf), pa.py_buffer(dbuf)]
    arrow_array = pa.Array.from_buffers(arrow_type, length, buffers)

    # Apply the mask
    if col.null_count > 0:
        arrow_array = pa.array(arrow_array.tolist(), mask=mbuf)

    return arrow_array, buffers
def coerce_arrow(array: "pa.Array") -> "pa.Array":
    if array.type == pa.timestamp("s"):
        array = pa.compute.cast(
            pa.compute.multiply(pa.compute.cast(array, pa.int64()), 1000),
            pa.date64(),
        )
    elif array.type == pa.timestamp("ms"):
        array = pa.compute.cast(pa.compute.cast(array, pa.int64()), pa.date64())
    elif array.type == pa.timestamp("us"):
        array = pa.compute.cast(
            pa.compute.divide(pa.compute.cast(array, pa.int64()), 1000),
            pa.date64(),
        )
    elif array.type == pa.timestamp("ns"):
        array = pa.compute.cast(
            pa.compute.divide(pa.compute.cast(array, pa.int64()), 1000000),
            pa.date64(),
        )
    # note: Decimal256 could not be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())
    # simplest solution is to cast to (large)-string arrays
    # this is a copy and expensive
    elif isinstance(array, pa.DictionaryArray):
        if array.dictionary.type == pa.string():
            array = pa.compute.cast(pa.compute.cast(array, pa.utf8()), pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        array = array.combine_chunks()
    return array
def test_no_mem_copy():
    strings = ["a", "", "cdef", "", "g"]
    # data for above string array
    dbuf = np.array([97, 99, 100, 101, 102, 103], dtype='uint8')
    obuf = np.array([0, 1, 1, 5, 5, 6], dtype='int64')
    length = 5
    buffers = [None, pa.py_buffer(obuf), pa.py_buffer(dbuf)]
    s = pa.Array.from_buffers(pa.large_utf8(), length, buffers)
    x = np.arange(0, 5)
    df = vaex.from_arrays(x=x, s=s)
    df2 = _from_dataframe_to_vaex(df.__dataframe__())

    # primitive data
    x[0] = 999
    assert df2.x.tolist() == [999, 1, 2, 3, 4]

    # strings
    assert df.s.tolist() == strings
    assert df2.s.tolist() == strings

    # mutate the buffer data (which arrow and vaex actually both don't support/want)
    strings[0] = "b"
    dbuf[0] += 1
    assert df.s.tolist() == strings
    assert df2.s.tolist() == strings
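# For reference, the buffer layout the test above hand-codes: a large_utf8
# array stores int64 offsets with len(strings) + 1 entries, where
# offsets[i]..offsets[i+1] delimits string i in the concatenated UTF-8 data
# buffer. A sketch deriving the same buffers from the string list:
import numpy as np
import pyarrow as pa

strings = ["a", "", "cdef", "", "g"]
data = np.frombuffer("".join(strings).encode(), dtype="uint8")
offsets = np.cumsum([0] + [len(s.encode()) for s in strings]).astype("int64")

arr = pa.Array.from_buffers(
    pa.large_utf8(), len(strings), [None, pa.py_buffer(offsets), pa.py_buffer(data)]
)
assert arr.to_pylist() == strings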
("decimal128(38,1)", pa.decimal128(38, 1)), ("decimal128(1,2)", pa.decimal128(1, 2)), ("time32(s)", pa.time32("s")), ("time32(ms)", pa.time32("ms")), ("time64(us)", pa.time64("us")), ("time64(ns)", pa.time64("ns")), ("timestamp(s)", pa.timestamp("s")), ("timestamp(ms)", pa.timestamp("ms")), ("timestamp(us)", pa.timestamp("us")), ("timestamp(ns)", pa.timestamp("ns")), ("date32", pa.date32()), ("date64", pa.date64()), ("string", pa.string()), ("large_string", pa.large_string()), ("utf8", pa.utf8()), ("large_utf8", pa.large_utf8()), ("binary", pa.binary()), ("binary(128)", pa.binary(128)), ("large_binary", pa.large_binary()), ("struct<num:int64>", pa.struct([("num", pa.int64())])), ("list<int64>", pa.list_(pa.int64())), ("list_<list<int64>>", pa.list_(pa.list_(pa.int64()))), ("list_<int64>", pa.list_(pa.int64())), ("list_<list_<int64>>", pa.list_(pa.list_(pa.int64()))), ("large_list<int64>", pa.large_list(pa.int64())), ("large_list<large_list<int64>>", pa.large_list(pa.large_list(pa.int64()))), ( "struct<num:int64, newnum:int64>", pa.struct([("num", pa.int64()), ("newnum", pa.int64())]), ),
def __init__(
    self,
    name: str,
    values: "Union[np.array, List[Optional[Any]]]" = None,
    nullable: bool = True,
    dtype: "Optional[DataType]" = None,
):
    """
    Parameters
    ----------
    name
        Name of the series
    values
        Values of the series
    nullable
        If nullable.
        None values in a list will be interpreted as missing.
        NaN values in a numpy array will be interpreted as missing.
        Note that missing and NaNs are not the same in Polars.
        Series creation may be faster if set to False and there are no null values.
    """
    # assume the first input were the values
    if values is None and not isinstance(name, str):
        values = name
        name = ""
    if values.__class__ == self.__class__:
        values.rename(name)
        self._s = values._s
        return

    self._s: PySeries

    # series path
    if isinstance(values, Series):
        self._from_pyseries(values)
        return
    elif isinstance(values, dict):
        raise ValueError(
            f"Constructing a Series with a dict is not supported for {values}"
        )
    elif isinstance(values, pa.Array):
        self._s = self.from_arrow(name, values)._s
        return

    # castable to numpy
    if not isinstance(values, np.ndarray) and not nullable:
        values = np.array(values)

    if dtype is not None:
        if dtype == Int8:
            self._s = PySeries.new_i8(name, values)
        elif dtype == Int16:
            self._s = PySeries.new_i16(name, values)
        elif dtype == Int32:
            self._s = PySeries.new_i32(name, values)
        elif dtype == Int64:
            self._s = PySeries.new_i64(name, values)
        elif dtype == UInt8:
            self._s = PySeries.new_u8(name, values)
        elif dtype == UInt16:
            self._s = PySeries.new_u16(name, values)
        elif dtype == UInt32:
            self._s = PySeries.new_u32(name, values)
        elif dtype == UInt64:
            self._s = PySeries.new_u64(name, values)
        elif dtype == Float32:
            self._s = PySeries.new_f32(name, values)
        elif dtype == Float64:
            self._s = PySeries.new_f64(name, values)
        elif dtype == Boolean:
            self._s = PySeries.new_bool(name, values)
        elif dtype == Utf8:
            self._s = PySeries.new_str(name, values)
        else:
            raise ValueError(
                f"dtype {dtype} not yet supported when creating a Series"
            )
        return

    # numpy path
    if isinstance(values, np.ndarray):
        if not values.data.contiguous:
            values = np.array(values)
        if len(values.shape) > 1:
            self._s = PySeries.new_object(name, values)
            return
        dtype = values.dtype
        if dtype == np.int64:
            self._s = PySeries.new_i64(name, values)
        elif dtype == np.int32:
            self._s = PySeries.new_i32(name, values)
        elif dtype == np.int16:
            self._s = PySeries.new_i16(name, values)
        elif dtype == np.int8:
            self._s = PySeries.new_i8(name, values)
        elif dtype == np.float32:
            self._s = PySeries.new_f32(name, values, nullable)
        elif dtype == np.float64:
            self._s = PySeries.new_f64(name, values, nullable)
        elif isinstance(values[0], str):
            self._s = PySeries.new_str(name, values)
        elif dtype == np.bool_:  # np.bool is deprecated in numpy; use np.bool_
            self._s = PySeries.new_bool(name, values)
        elif dtype == np.uint8:
            self._s = PySeries.new_u8(name, values)
        elif dtype == np.uint16:
            self._s = PySeries.new_u16(name, values)
        elif dtype == np.uint32:
            self._s = PySeries.new_u32(name, values)
        elif dtype == np.uint64:
            self._s = PySeries.new_u64(name, values)
        else:
            self._s = PySeries.new_object(name, values)
        return
    # list path
    else:
        dtype = _find_first_non_none(values)
        # order is important as booleans are instance of int in python.
        if isinstance(dtype, bool):
            self._s = PySeries.new_opt_bool(name, values)
        elif isinstance(dtype, int):
            self._s = PySeries.new_opt_i64(name, values)
        elif isinstance(dtype, float):
            self._s = PySeries.new_opt_f64(name, values)
        elif isinstance(dtype, str):
            self._s = PySeries.new_str(name, values)
        # make list array
        elif isinstance(dtype, (list, tuple)):
            value_dtype = _find_first_non_none(dtype)

            # we can expect a failure if we pass `[[12], "foo", 9]`
            # in that case we catch the exception and create an object type
            try:
                if isinstance(value_dtype, bool):
                    arrow_array = pa.array(values, pa.large_list(pa.bool_()))
                elif isinstance(value_dtype, int):
                    arrow_array = pa.array(values, pa.large_list(pa.int64()))
                elif isinstance(value_dtype, float):
                    arrow_array = pa.array(values, pa.large_list(pa.float64()))
                elif isinstance(value_dtype, str):
                    arrow_array = pa.array(values, pa.large_list(pa.large_utf8()))
                else:
                    self._s = PySeries.new_object(name, values)
                    return
                self._s = Series.from_arrow(name, arrow_array)._s
            except pa.lib.ArrowInvalid:
                self._s = PySeries.new_object(name, values)
        else:
            self._s = PySeries.new_object(name, values)
}


def py_type_to_constructor(dtype: Type[Any]) -> Callable[..., "PySeries"]:
    """
    Get the right PySeries constructor for the given Python dtype.
    """
    try:
        return _PY_TYPE_TO_CONSTRUCTOR[dtype]
    except KeyError:
        return PySeries.new_object


if _PYARROW_AVAILABLE and not _DOCUMENTING:
    _PY_TYPE_TO_ARROW_TYPE = {
        float: pa.float64(),
        int: pa.int64(),
        str: pa.large_utf8(),
        bool: pa.bool_(),
    }


def py_type_to_arrow_type(dtype: Type[Any]) -> "pa.lib.DataType":
    """
    Convert a Python dtype to an Arrow dtype.
    """
    try:
        return _PY_TYPE_TO_ARROW_TYPE[dtype]
    except KeyError:
        raise ValueError(f"Cannot parse dtype {dtype} into Arrow dtype.")
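# Usage sketch: plain Python scalar types map onto 64-bit Arrow types (note
# str -> large_utf8, not utf8); anything unmapped raises a ValueError.
assert py_type_to_arrow_type(str) == pa.large_utf8()
assert py_type_to_arrow_type(int) == pa.int64()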
def pickle_set_string(x):
    keys = x.key_array()
    keys = pa.array(keys.to_numpy(), type=pa.large_utf8())
    return create_set_string, (keys, x.null_value, x.nan_count, x.null_count, x.fingerprint)
class TestAbstractFileParserStatics:
    @pytest.mark.parametrize(
        # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        # Json -> PyArrow direction
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(
        # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(
        # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
            LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(
        # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
            LOGGER.debug(str(e_info))
def test_large_utf8(self):
    array = pyarrow.array(
        ["a", "b", "c"], pyarrow.large_utf8(), numpy.array([False, True, False])
    )
    self._test_data(array)
testcase_name="basic_types", inputs=[ { "bool": pa.array([False, None, True], type=pa.bool_()), "int64": pa.array([1, None, 3], type=pa.int64()), "uint64": pa.array([1, None, 3], type=pa.uint64()), "int32": pa.array([1, None, 3], type=pa.int32()), "uint32": pa.array([1, None, 3], type=pa.uint32()), "float": pa.array([1., None, 3.], type=pa.float32()), "double": pa.array([1., None, 3.], type=pa.float64()), "bytes": pa.array([b"abc", None, b"ghi"], type=pa.binary()), "large_bytes": pa.array([b"abc", None, b"ghi"], type=pa.large_binary()), "unicode": pa.array([u"abc", None, u"ghi"], type=pa.utf8()), "large_unicode": pa.array([u"abc", None, u"ghi"], type=pa.large_utf8()), }, { "bool": pa.array([None, False], type=pa.bool_()), "int64": pa.array([None, 4], type=pa.int64()), "uint64": pa.array([None, 4], type=pa.uint64()), "int32": pa.array([None, 4], type=pa.int32()), "uint32": pa.array([None, 4], type=pa.uint32()), "float": pa.array([None, 4.], type=pa.float32()), "double": pa.array([None, 4.], type=pa.float64()), "bytes": pa.array([None, b"jkl"], type=pa.binary()), "large_bytes": pa.array([None, b"jkl"], type=pa.large_binary()), "unicode": pa.array([None, u"jkl"], type=pa.utf8()), "large_unicode": pa.array([None, u"jkl"], type=pa.large_utf8()), }, ],
"int32": pa.array([1, None, 3], type=pa.int32()), "uint32": pa.array([1, None, 3], type=pa.uint32()), "float": pa.array([1., None, 3.], type=pa.float32()), "double": pa.array([1., None, 3.], type=pa.float64()), "bytes": pa.array([b"abc", None, b"ghi"], type=pa.binary()), "large_bytes": pa.array([b"abc", None, b"ghi"], type=pa.large_binary()), "unicode": pa.array([u"abc", None, u"ghi"], type=pa.utf8()), "large_unicode": pa.array([u"abc", None, u"ghi"], type=pa.large_utf8()), }, { "bool": pa.array([None, False], type=pa.bool_()), "int64": pa.array([None, 4], type=pa.int64()), "uint64": pa.array([None, 4], type=pa.uint64()), "int32": pa.array([None, 4], type=pa.int32()), "uint32": pa.array([None, 4], type=pa.uint32()), "float": pa.array([None, 4.], type=pa.float32()), "double": pa.array([None, 4.], type=pa.float64()), "bytes": pa.array([None, b"jkl"], type=pa.binary()), "large_bytes": pa.array([None, b"jkl"], type=pa.large_binary()), "unicode": pa.array([None, u"jkl"], type=pa.utf8()), "large_unicode": pa.array([None, u"jkl"], type=pa.large_utf8()),
ctx.register_udf("udf", fn, input_types, output_type) batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect() result = batches[0].column(0) assert result == pa.array(expected_values) _null_mask = np.array([False, True, False]) @pytest.mark.parametrize( "arr", [ pa.array(["a", "b", "c"], pa.utf8(), _null_mask), pa.array(["a", "b", "c"], pa.large_utf8(), _null_mask), pa.array([b"1", b"2", b"3"], pa.binary(), _null_mask), pa.array([b"1111", b"2222", b"3333"], pa.large_binary(), _null_mask), pa.array([False, True, True], None, _null_mask), pa.array([0, 1, 2], None), helpers.data_binary_other(), helpers.data_date32(), helpers.data_with_nans(), # C data interface missing pytest.param( pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask), marks=pytest.mark.xfail, ), pytest.param(helpers.data_datetime("s"), marks=pytest.mark.xfail), pytest.param(helpers.data_datetime("ms"), marks=pytest.mark.xfail), pytest.param(helpers.data_datetime("us"), marks=pytest.mark.xfail),
"i": Int32, ("q" if sys.platform == "win32" else "l"): Int64, "B": UInt8, "H": UInt16, "I": UInt32, ("Q" if sys.platform == "win32" else "L"): UInt64, "f": Float32, "d": Float64, "?": Boolean, } if _PYARROW_AVAILABLE: _PY_TYPE_TO_ARROW_TYPE: dict[type, pa.lib.DataType] = { float: pa.float64(), int: pa.int64(), str: pa.large_utf8(), bool: pa.bool_(), date: pa.date32(), time: pa.time64("us"), datetime: pa.timestamp("us"), timedelta: pa.duration("us"), } _DTYPE_TO_ARROW_TYPE = { Int8: pa.int8(), Int16: pa.int16(), Int32: pa.int32(), Int64: pa.int64(), UInt8: pa.uint8(), UInt16: pa.uint16(), UInt32: pa.uint32(),