def from_arrow_type(at):
    """Convert a pyarrow type to the equivalent Spark SQL data type.

    Parameters
    ----------
    at : pyarrow.DataType
        The Arrow type to convert. Nested types (list, map, struct,
        dictionary) are converted recursively.

    Returns
    -------
    DataType
        The corresponding Spark SQL data type.

    Raises
    ------
    TypeError
        If the Arrow type (or a nested component of it) has no supported
        Spark equivalent, or if a MapType is requested on pyarrow < 2.0.0.
    """
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        # Timestamps nested inside a list are rejected explicitly rather
        # than silently converted.
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        # distutils.version.LooseVersion is deprecated (PEP 632) and removed
        # in Python 3.12; the >= 2.0.0 gate only needs the major component.
        if int(pa.__version__.split(".", 1)[0]) < 2:
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        # Only one level of struct nesting is supported.
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        # Dictionary encoding is transparent to Spark: use the value type.
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    import pyarrow.types as types

    # Flat (non-parameterized) Arrow types map one-to-one onto Spark types;
    # resolve those through a predicate -> constructor table first. The
    # predicates are mutually exclusive, so lookup order does not matter.
    _SIMPLE = (
        (types.is_boolean, BooleanType),
        (types.is_int8, ByteType),
        (types.is_int16, ShortType),
        (types.is_int32, IntegerType),
        (types.is_int64, LongType),
        (types.is_float32, FloatType),
        (types.is_float64, DoubleType),
        (types.is_string, StringType),
        (types.is_binary, BinaryType),
        (types.is_date32, DateType),
        (types.is_timestamp, TimestampType),
    )
    for matches, make_spark_type in _SIMPLE:
        if matches(at):
            return make_spark_type()

    # Parameterized / nested types need per-kind handling.
    if types.is_decimal(at):
        return DecimalType(precision=at.precision, scale=at.scale)
    if types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        return ArrayType(from_arrow_type(at.value_type))
    if types.is_struct(at):
        if any(types.is_struct(inner.type) for inner in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType([
            StructField(inner.name, from_arrow_type(inner.type), nullable=inner.nullable)
            for inner in at
        ])
    if types.is_dictionary(at):
        # Dictionary encoding is transparent: convert the value type.
        return from_arrow_type(at.value_type)
    raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
    """Test round-tripping from rt.Categorical to pyarrow.Array/pyarrow.Table and back."""
    orig_cat_shape = rt_cat.shape
    if have_nulls:
        # riptable's filtering/masking uses a valid mask (where False means null/NA).
        # Mask out every third element (index % 3 == 1) so some nulls exist.
        indices = np.arange(len(rt_cat))
        valid_mask = indices % 3 != 1
        rt_cat = rt_cat.filter(valid_mask)
        # Filtering masks elements in place; the overall shape must not change.
        assert rt_cat.shape == orig_cat_shape
        # isfiltered() doesn't work as expected for Dictionary/IntEnum-mode Categorical as of riptable 1.1.0.
        filtered_element_count = (rt.isnan(rt_cat._fa) if rt_cat.category_mode in (rt.rt_enum.CategoryMode.Dictionary, rt.rt_enum.CategoryMode.IntEnum) else rt_cat.isfiltered()).sum()
        assert filtered_element_count == (len(rt_cat) - valid_mask.sum())
    # Convert riptable -> pyarrow.
    result_pa_arr = rt_cat.to_arrow()
    # Verify the pyarrow array has the correct length, number of categories, etc.
    assert len(rt_cat) == len(result_pa_arr)
    assert pat.is_dictionary(result_pa_arr.type)
    # NOTE(review): `>=` rather than `==` — the Arrow dictionary may contain
    # more entries than the Categorical has categories (e.g. unused entries).
    assert len(result_pa_arr.dictionary) >= len(next(iter(rt_cat.category_dict.values()))), \
        "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."
    if have_nulls:
        # The masked-out elements must surface as Arrow nulls.
        assert valid_mask.sum() > 0
        assert (len(rt_cat) - valid_mask.sum()) == result_pa_arr.null_count
    # TEMP: Certain cases are marked as XFAIL here due to issues in Categorical.
    # * Cannot create a pre-filtered (i.e. filtered at construction time) Dictionary- or IntEnum-mode Categorical.
    # * Filtering a Dictionary- or IntEnum-mode Categorical causes unused categories to be dropped,
    #   which is not the same behavior as for other Categorical modes.
    # * MultiKey Categoricals can't be created with an explicit list of category arrays + an index array,
    #   like what is supported for other Categorical modes.
    if rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey or (have_nulls and rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary):
        pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")
    # Convert pyarrow -> riptable to complete the round trip.
    result_cat = rt.Categorical.from_arrow(result_pa_arr, zero_copy_only=False, writable=output_writable)
    # relaxed_cat_check <==> rt_cat.ordered, because if the categories are ordered, we expect them to be
    # in the same position after being roundtripped, so they should be mapped to the same integer before/after.
    # multi-key cats always seem to be ordered, even if ordered=False is specified when creating them.
    # TODO: Remove CategoryMode.Dictionary from the relaxed_cat_check here -- it's failing because our encoding in
    # pyarrow doesn't currenly preserve unused entries from the name <-> code mapping. Once that's fixed
    # we should be able to use the stronger equality check.
    assert_array_or_cat_equal(rt_cat, result_cat, relaxed_cat_check=rt_cat.ordered or rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey or rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary)
def test_is_dictionary():
    """A dictionary-encoded type is recognized; a plain primitive type is not."""
    encoded = pa.dictionary(pa.int32(), pa.string())
    plain = pa.int32()
    assert types.is_dictionary(encoded)
    assert not types.is_dictionary(plain)
def is_complex(arrow_type: DataType, /) -> bool:
    """Check if data type is complex (dictionary-encoded or nested)."""
    # Short-circuits exactly like `is_dictionary(...) or is_nested(...)`.
    return True if is_dictionary(arrow_type) else is_nested(arrow_type)
def test_is_dictionary():
    """is_dictionary() is true only for dictionary-encoded types."""
    # pa.dictionary() takes (index_type, value_type). Passing a materialized
    # pa.array of values is the pre-0.14 pyarrow signature and raises a
    # TypeError on modern versions, so construct the type from pa.string().
    assert types.is_dictionary(pa.dictionary(pa.int32(), pa.string()))
    assert not types.is_dictionary(pa.int32())
def is_complex(arrow_type):
    """Return True when the Arrow type is dictionary-encoded or nested."""
    if is_dictionary(arrow_type):
        return True
    return is_nested(arrow_type)