import pyarrow as pa
import pyarrow.types as types
from pyarrow import Schema


def _traverse(typ, counter):
    """Walk a pyarrow Schema or DataType, yielding (path, flat column index) pairs."""
    if isinstance(typ, Schema) or types.is_struct(typ):
        for field in typ:
            path = (field.name,)
            yield path, next(counter)
            for sub, c in _traverse(field.type, counter):
                yield path + sub, c
    elif _is_map(typ):
        for sub_c in _traverse(typ.value_type, counter):
            yield sub_c
    elif types.is_list(typ):
        # Skip one index for list type, since this can never be selected
        # directly
        next(counter)
        for sub_c in _traverse(typ.value_type, counter):
            yield sub_c
    elif types.is_union(typ):
        # Union types not supported, just skip the indexes
        for dtype in typ:
            next(counter)
            # Drain the sub-generator purely to advance the counter
            for sub_c in _traverse(dtype, counter):
                pass
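# Usage sketch for _traverse (illustrative only): the counter is an
# itertools.count yielding flat, pre-order column indexes; the schema and
# the expected layout below are assumptions, not taken from the original
# source.
def test_traverse_struct():
    import itertools

    schema = pa.schema([
        ('id', pa.int64()),
        ('point', pa.struct([('x', pa.float64()), ('y', pa.float64())])),
    ])
    # Each field gets one index; nested struct fields follow their parent.
    assert list(_traverse(schema, itertools.count())) == [
        (('id',), 0),
        (('point',), 1),
        (('point', 'x'), 2),
        (('point', 'y'), 3),
    ]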
def test_is_list():
    assert types.is_list(pa.list_(pa.int32()))
    assert not types.is_list(pa.int32())
def _is_map(typ):
    # Maps are encoded in Parquet/Arrow as list<struct<key, value>>.
    # Note: num_fields replaces the deprecated num_children attribute.
    return (types.is_list(typ) and
            types.is_struct(typ.value_type) and
            typ.value_type.num_fields == 2 and
            typ.value_type[0].name == 'key' and
            typ.value_type[1].name == 'value')
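# Companion check for _is_map, sketched under the same imports; the
# list<struct<key, value>> encoding below mirrors what _is_map tests for.
def test_is_map():
    key_value = pa.struct([('key', pa.string()), ('value', pa.int32())])
    assert _is_map(pa.list_(key_value))
    assert not _is_map(pa.list_(pa.int32()))
    assert not _is_map(pa.int32())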
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [
                StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
                for field in at
            ]
        )
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
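# Usage sketch for from_arrow_type: a minimal check assuming the Spark type
# classes referenced above (IntegerType, ArrayType, StringType,
# TimestampNTZType, TimestampType) are imported from pyspark.sql.types; the
# cases below are illustrative, not exhaustive.
def test_from_arrow_type():
    assert from_arrow_type(pa.int32()) == IntegerType()
    assert from_arrow_type(pa.list_(pa.string())) == ArrayType(StringType())
    # A tz-naive timestamp maps to TimestampNTZType only when requested.
    assert from_arrow_type(pa.timestamp('us'), prefer_timestamp_ntz=True) == TimestampNTZType()
    assert from_arrow_type(pa.timestamp('us')) == TimestampType()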