def test_should_unpack_array_of_structs(spark_session: SparkSession):
    """Check ``unpack_nested`` on a column holding an array of structs.

    The expectation pinned here: every array element becomes its own output
    row, and the struct members show up as the columns ``struct__1`` ..
    ``struct__3`` next to the untouched ``id`` and ``text`` columns.
    """
    input_rows = [
        (1, "a", [Row(1, 'c', 3), Row(2, 'e', 5)]),
        (2, "b", [Row(4, 'd', 6), Row(7, 'f', 8)]),
    ]
    source_df = spark_session.createDataFrame(input_rows, ["id", "text", "struct"])

    result_df = UnpackNestedFields().unpack_nested(source_df)

    # Schema: (column name, type name) pairs in column order.
    expected_schema = [
        ("id", LongType.typeName()),
        ("text", StringType.typeName()),
        ("struct__1", LongType.typeName()),
        ("struct__2", StringType.typeName()),
        ("struct__3", LongType.typeName()),
    ]
    actual_schema = []
    for column in result_df.schema.fields:
        actual_schema.append((column.name, column.dataType.typeName()))
    assert actual_schema == expected_schema

    # Data: two output rows per input row, one for each struct in the array.
    expected_rows = [
        (1, "a", 1, 'c', 3),
        (1, "a", 2, 'e', 5),
        (2, "b", 4, 'd', 6),
        (2, "b", 7, 'f', 8),
    ]
    assert result_df.collect() == expected_rows
def test_should_unpack_array(spark_session: SparkSession):
    """Check ``unpack_nested`` on a column holding an array of integers.

    The expectation pinned here: each array element becomes its own output
    row, and the ``int_array`` column keeps its name but carries the element
    type instead of an array type.
    """
    input_rows = [
        (1, "a", [1, 2, 3]),
        (2, "b", [4, 5, 6]),
    ]
    source_df = spark_session.createDataFrame(input_rows, ["id", "text", "int_array"])

    result_df = UnpackNestedFields().unpack_nested(source_df)

    # Schema: (column name, type name) pairs in column order.
    expected_schema = [
        ("id", LongType.typeName()),
        ("text", StringType.typeName()),
        ("int_array", LongType.typeName()),
    ]
    actual_schema = []
    for column in result_df.schema.fields:
        actual_schema.append((column.name, column.dataType.typeName()))
    assert actual_schema == expected_schema

    # Data: three output rows per input row, one for each array element.
    expected_rows = [
        (1, "a", 1),
        (1, "a", 2),
        (1, "a", 3),
        (2, "b", 4),
        (2, "b", 5),
        (2, "b", 6),
    ]
    assert result_df.collect() == expected_rows
def test_should_not_unpack(spark_session: SparkSession):
    """Check ``unpack_nested`` on a DataFrame without nested columns.

    The expectation pinned here: the very same DataFrame object is handed
    back, with schema and rows unchanged.
    """
    input_rows = [
        (1, "a", "text"),
        (2, "b", "other_text"),
    ]
    source_df = spark_session.createDataFrame(input_rows, ["id", "text", "other_text"])

    result_df = UnpackNestedFields().unpack_nested(source_df)

    # Identity, not just equality: no new DataFrame may be created.
    assert result_df is source_df

    expected_schema = [
        ("id", LongType.typeName()),
        ("text", StringType.typeName()),
        ("other_text", StringType.typeName()),
    ]
    actual_schema = []
    for column in result_df.schema.fields:
        actual_schema.append((column.name, column.dataType.typeName()))
    assert actual_schema == expected_schema

    expected_rows = [
        (1, "a", "text"),
        (2, "b", "other_text"),
    ]
    assert result_df.collect() == expected_rows
def _print_type_api(data_type, internal_value=1, python_value=1, show_type_name=True):
    """Print the results of the conversion/description API of *data_type*.

    Output order: ``fromInternal`` result with its Python type,
    ``toInternal`` result with its Python type, ``json()``, ``jsonValue()``,
    ``needConversion()``, ``simpleString()`` and, optionally, ``typeName()``.

    Args:
        data_type: The type instance to exercise.
        internal_value: Argument passed to ``fromInternal``.
        python_value: Argument passed to ``toInternal``.
        show_type_name: When true, also print ``typeName()`` looked up on the
            instance's class.
    """
    python_obj = data_type.fromInternal(internal_value)
    print(python_obj, type(python_obj))
    sql_obj = data_type.toInternal(python_value)
    print(sql_obj, type(sql_obj))
    print(data_type.json())
    print(data_type.jsonValue())
    print(data_type.needConversion())
    print(data_type.simpleString())
    if show_type_name:
        # Called on the class, exactly as the hand-written stanzas did.
        print(type(data_type).typeName())


def sql_types_example(spark):
    """Exercise the types-module API and print every result.

    Args:
        spark: Not referenced by the body; kept so existing callers that pass
            a session keep working.
    """
    from datetime import datetime

    # Types exercised with the default arguments (1 in, 1 out).
    for type_cls in (
        DataType,
        NullType,
        AtomicType,
        NumericType,
        IntegralType,
        FractionalType,
        StringType,
        BinaryType,
        BooleanType,
    ):
        _print_type_api(type_cls())

    # DateType / TimestampType: converted to internal form from a datetime.
    _print_type_api(DateType(), 1000, datetime.today())
    _print_type_api(TimestampType(), 365000000, datetime.today())

    for type_cls in (
        DecimalType,
        DoubleType,
        FloatType,
        ByteType,
        IntegerType,
        LongType,
        ShortType,
    ):
        _print_type_api(type_cls())

    # ArrayType
    _print_type_api(ArrayType(DataType()))
    print(ArrayType.fromJson({"containsNull": True, "elementType": "string"}))

    # MapType
    _print_type_api(MapType(DataType(), DataType()))
    print(MapType.fromJson({"valueContainsNull": True, "keyType": "string", "valueType": "integer"}))

    # StructField: typeName() was never printed for it, so it is skipped here.
    _print_type_api(StructField("first_struct", DataType()), show_type_name=False)
    print(StructField.fromJson({"metadata": None, "nullable": True, "name": "first_struct", "type": "string"}))

    # StructType
    st = StructType([StructField("first_struct", StringType()), StructField("second_struct", DataType())])
    print("------")
    print(st.names)
    print(st.fields)
    print(st._needConversion)
    print(st._needSerializeAnyField)
    _print_type_api(
        st,
        ["first_struct", "second_struct"],
        ["first_struct", "second_struct"],
        show_type_name=False,
    )
    print(st.fieldNames())
    fields = {
        "fields": [
            {"metadata": None, "nullable": True, "name": "first", "type": "string"},
            {"metadata": None, "nullable": True, "name": "second", "type": "integer"},
        ]
    }
    print(st.fromJson(fields))
    st.add(StructField("first_struct", StringType()))
    print("st.add success!")

    print("Finish running types module API")
_array_signed_int_typecode_ctype_mappings, _array_unsigned_int_typecode_ctype_mappings, _array_type_mappings, _acceptable_types)

# Module-level type-name constants. Each one stores the simpleString() of a
# freshly constructed type instance; where an assert follows, it pins how that
# string relates to the same instance's typeName().

# Null, boolean, string and binary: simpleString() must equal typeName().
__null_type = NullType()
_NULL_TYPE = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()

__bool_type = BooleanType()
_BOOL_TYPE = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()

__str_type = StringType()
_STR_TYPE = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()

__binary_type = BinaryType()
_BINARY_TYPE = __binary_type.simpleString()
assert _BINARY_TYPE == __binary_type.typeName()

# Byte and short: only the simpleString() is recorded, with no check against
# typeName().
__byte_type = ByteType()
_TINYINT_TYPE = __byte_type.simpleString()

__short_type = ShortType()
_SMALLINT_TYPE = __short_type.simpleString()

# Integer: simpleString() must equal the name of Python's builtin int, and
# typeName() only has to start with it (so the two are allowed to differ).
__int_type = IntegerType()
_INT_TYPE = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)