import pyarrow as pa
import pyarrow.types as types


def test_is_nested_or_struct():
    struct_ex = pa.struct([pa.field('a', pa.int32()),
                           pa.field('b', pa.int8()),
                           pa.field('c', pa.string())])

    assert types.is_struct(struct_ex)
    assert not types.is_struct(pa.list_(pa.int32()))

    # Structs and lists are nested types; plain scalar types are not.
    assert types.is_nested(struct_ex)
    assert types.is_nested(pa.list_(pa.int32()))
    assert not types.is_nested(pa.int32())

def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type

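# A hedged usage sketch for the variant above, the only one of the two
# from_arrow_type versions here that handles Arrow map types (guarded by the
# pyarrow >= 2.0.0 check). Assumes the Spark type classes come from
# pyspark.sql.types; the printed representations are approximate.
import pyarrow as pa

print(from_arrow_type(pa.map_(pa.string(), pa.int64())))
# MapType(StringType(), LongType(), True)

# Timestamps are rejected as map keys or values.
try:
    from_arrow_type(pa.map_(pa.string(), pa.timestamp('us')))
except TypeError as e:
    print(e)  # Unsupported type in conversion from Arrow: map<string, timestamp[us]>
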
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType([
            StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
            for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type

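# A hedged sketch of how this variant behaves on common types. Assumes the
# Spark type classes (IntegerType, ArrayType, ...) come from pyspark.sql.types.
import pyarrow as pa

print(from_arrow_type(pa.int32()))             # IntegerType()
print(from_arrow_type(pa.list_(pa.string())))  # ArrayType(StringType(), True)

# Nested structs are rejected explicitly.
try:
    from_arrow_type(pa.struct([('outer', pa.struct([('inner', pa.int32())]))]))
except TypeError as e:
    print(e)  # Nested StructType not supported in conversion from Arrow: ...
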
import pyarrow as pa
import pyarrow.types as types


def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType) or types.is_large_unicode(pyarrowType)):
        return 'string'
    if types.is_int64(pyarrowType) or types.is_uint64(pyarrowType):
        return 'bigint'
    if types.is_binary(pyarrowType):
        return 'binary'
    if types.is_boolean(pyarrowType):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if types.is_decimal(pyarrowType):
        return 'decimal(16,2)'
    if types.is_float64(pyarrowType):
        return 'double'
    if types.is_float16(pyarrowType) or types.is_float32(pyarrowType):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if types.is_map(pyarrowType):
        return 'map'
    if types.is_struct(pyarrowType):
        return 'struct'
    if types.is_timestamp(pyarrowType):
        return 'timestamp'
    if types.is_union(pyarrowType):
        return 'union'
    return str(pyarrowType)

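# A hypothetical usage sketch for the Glue mapping above: walk an Arrow
# schema and print each field's Glue column type. The schema is illustrative
# only; convertPyArrowTypeToGlueType is the function defined above.
import pyarrow as pa

schema = pa.schema([('id', pa.int64()),
                    ('name', pa.string()),
                    ('price', pa.decimal128(16, 2))])
for field in schema:
    print(field.name, '->', convertPyArrowTypeToGlueType(field.type))
# id -> bigint
# name -> string
# price -> decimal(16,2)
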
def arrow_to_pandas(self, arrow_column):
    import pyarrow.types as types

    if self._df_for_struct and types.is_struct(arrow_column.type):
        import pandas as pd

        # Flatten the struct column: convert each child array to a pandas
        # Series named after its field, then concatenate into a DataFrame.
        series = [super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(column)
                  .rename(field.name)
                  for column, field in zip(arrow_column.flatten(), arrow_column.type)]
        s = pd.concat(series, axis=1)
    else:
        s = super(ArrowStreamPandasUDFSerializer, self).arrow_to_pandas(arrow_column)
    return s

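# Outside the serializer class, the same flattening idea can be sketched with
# plain pyarrow and pandas; this standalone example is an illustration, not
# the serializer's actual code path.
import pandas as pd
import pyarrow as pa

arr = pa.array([{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}])  # a StructArray

# Each child array becomes one Series named after its field; concatenating
# them along axis=1 yields a DataFrame with columns 'x' and 'y'.
series = [child.to_pandas().rename(field.name)
          for child, field in zip(arr.flatten(), arr.type)]
print(pd.concat(series, axis=1))
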
def _from_arrow_type(dt: pa.DataType) -> pt.DataType:
    if is_struct(dt):
        return pt.StructType([
            pt.StructField(
                # field.name, _from_arrow_type(field.type), nullable=field.nullable
                field.name,
                _from_arrow_type(field.type),
                nullable=True,
            )
            for field in dt
        ])
    elif is_list(dt):
        if is_timestamp(dt.value_type):
            raise TypeError(  # pragma: no cover
                "Spark: unsupported type in conversion from Arrow: " + str(dt))
        return pt.ArrayType(_from_arrow_type(dt.value_type))
    return from_arrow_type(dt)

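# A hedged sketch of calling the wrapper above. The unqualified names suggest
# pt is pyspark.sql.types, the is_* predicates come from pyarrow.types, and
# from_arrow_type is Spark's converter; all of that is assumed here.
import pyarrow as pa

# Struct fields come back nullable regardless of the Arrow schema, per the
# hard-coded nullable=True above.
dt = pa.struct([pa.field('a', pa.int32(), nullable=False)])
print(_from_arrow_type(dt))
# StructType([StructField('a', IntegerType(), True)]) (repr varies by version)
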
def _traverse(typ, counter):
    if isinstance(typ, Schema) or types.is_struct(typ):
        for field in typ:
            path = (field.name,)
            yield path, next(counter)
            for sub, c in _traverse(field.type, counter):
                yield path + sub, c
    elif _is_map(typ):
        yield from _traverse(typ.value_type, counter)
    elif types.is_list(typ):
        # Skip one index for list type, since this can never be selected
        # directly
        next(counter)
        yield from _traverse(typ.value_type, counter)
    elif types.is_union(typ):
        # Union types not supported, just skip the indexes
        for dtype in typ:
            next(counter)
            for sub_c in _traverse(dtype, counter):
                pass

def _traverse(typ, counter):
    if isinstance(typ, Schema) or types.is_struct(typ):
        for field in typ:
            path = (field.name,)
            yield path, next(counter)
            for sub, c in _traverse(field.type, counter):
                yield path + sub, c
    elif _is_map(typ):
        for sub_c in _traverse(typ.value_type, counter):
            yield sub_c
    elif types.is_list(typ):
        # Skip one index for list type, since this can never be selected
        # directly
        next(counter)
        for sub_c in _traverse(typ.value_type, counter):
            yield sub_c
    elif types.is_union(typ):
        # Union types not supported, just skip the indexes
        for dtype in typ:
            next(counter)
            for sub_c in _traverse(dtype, counter):
                pass

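# A hedged usage sketch for either _traverse variant: feed a schema and an
# itertools.count into it to enumerate field paths alongside their flat
# column indexes. Assumes Schema and types resolve to pyarrow's, and that the
# _is_map helper below is in scope.
import itertools
import pyarrow as pa

schema = pa.schema([('a', pa.int32()),
                    ('b', pa.struct([('c', pa.string())]))])
for path, index in _traverse(schema, itertools.count()):
    print(path, index)
# ('a',) 0
# ('b',) 1
# ('b', 'c') 2
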
def _is_map(typ):
    return (types.is_list(typ) and
            types.is_struct(typ.value_type) and
            typ.value_type.num_fields == 2 and
            typ.value_type[0].name == 'key' and
            typ.value_type[1].name == 'value')

def _is_map(typ):
    return (types.is_list(typ) and
            types.is_struct(typ.value_type) and
            typ.value_type.num_children == 2 and
            typ.value_type[0].name == 'key' and
            typ.value_type[1].name == 'value')

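# A small check sketch for the helper above: it recognizes the legacy
# list<struct<key, value>> encoding of maps (the num_fields vs. num_children
# difference between the two variants tracks a pyarrow API rename). Assumes
# pyarrow.types is imported as types.
import pyarrow as pa

legacy_map = pa.list_(pa.struct([('key', pa.string()),
                                 ('value', pa.int32())]))
print(_is_map(legacy_map))                        # True
print(_is_map(pa.list_(pa.int32())))              # False
print(_is_map(pa.map_(pa.string(), pa.int32())))  # False: MapType has its own type id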