def test_auto_mapper_number(spark_session: SparkSession) -> None:
    """
    A.number should cast a string source column to an integral Spark type,
    widening to long when the data needs it, and should map a None literal
    to a typed null.
    """
    # Arrange: member 3's age exceeds int range, forcing a long cast
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
            (3, "Old", "Methusela", "131026061001"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.number(A.column("my_age")),
        null_field=A.number(AutoMapperDataTypeLiteral(None)),
    )
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    # the chosen cast may be int or long depending on library version/data
    assert str(sql_expressions["age"]) in (
        str(col("b.my_age").cast("int").alias("age")),
        str(col("b.my_age").cast("long").alias("age")),
    )
    assert str(sql_expressions["null_field"]) == str(
        lit(None).cast("long").alias("null_field"))
    result_df: DataFrame = mapper.transform(df=df)
    # Assert
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 67
    # value larger than 32-bit int must survive the mapping
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            131026061001)
    assert (
        result_df.where("member_id == 1").select("null_field").collect()[0][0]
        is None)
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_if_list(spark_session: SparkSession) -> None:
    """A.if_ with a list check keeps the mapped age for values in the list
    and falls back to the else_ value (100) otherwise."""
    # Arrange: member 3 has a null age, outside the checked list
    rows = [
        (1, "Qureshi", "Imran", "54"),
        (2, "Qureshi", "Imran", "59"),
        (3, "Vidal", "Michael", None),
    ]
    spark_session.createDataFrame(
        rows, ["member_id", "last_name", "first_name", "my_age"]
    ).createOrReplaceTempView("patients")
    patients_df: DataFrame = spark_session.table("patients")
    members_df = patients_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.if_(
            column=A.column("my_age"),
            check=["54", "59"],
            value=A.number(A.column("my_age")),
            else_=A.number(A.text("100")),
        )
    )
    assert isinstance(mapper, AutoMapper)

    specs: Dict[str, Column] = mapper.get_column_specs(source_df=patients_df)
    for name, expression in specs.items():
        print(f"{name}: {expression}")

    # expected: CASE WHEN my_age IN ('54','59') THEN cast ELSE cast('100')
    expected = (
        when(col("b.my_age").isin(["54", "59"]), col("b.my_age").cast("long"))
        .otherwise(lit("100").cast(StringType()).cast(LongType()))
        .alias("age")
    )
    assert_compare_expressions(specs["age"], expected)

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()
    collected = {
        member_id: result_df.where(f"member_id == {member_id}")
        .select("age")
        .collect()[0][0]
        for member_id in (1, 2, 3)
    }
    assert collected[1] == 54
    assert collected[2] == 59
    assert collected[3] == 100
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_complex_with_skip_if_null(
        spark_session: SparkSession) -> None:
    """
    When skip_if_columns_null_or_empty is set, rows whose listed source
    columns are null or empty must be excluded from the mapped output, and
    every mapped column is wrapped in a null/empty guard on those columns.
    """
    # Arrange: member 2 has an empty first_name and should be skipped
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=True,
        skip_if_columns_null_or_empty=["first_name"],
    ).complex(
        MyClass(
            id_=A.column("member_id"),
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
        ))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=df)
    # Assert: each column expression carries the first_name null/empty guard
    assert str(sql_expressions["name"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None)).otherwise(col("b.last_name")).cast(
                StringType()).alias("name"))
    assert str(sql_expressions["age"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None)).otherwise(col("b.my_age")).cast(
                LongType()).alias("age"))
    result_df.printSchema()
    result_df.show()
    # only the row with a non-empty first_name survives
    assert result_df.count() == 1
    assert result_df.where("id == 1").select(
        "name").collect()[0][0] == "Qureshi"
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_if_regex(spark_session: SparkSession) -> None:
    """A.if_regex maps the source value when it matches the pattern and
    substitutes the else_ value otherwise (null never matches rlike)."""
    # Arrange
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", "54"), (2, "Vidal", "Michael", None)],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    patients: DataFrame = spark_session.table("patients")
    members = patients.select("member_id")
    members.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.if_regex(
            column=A.column("my_age"),
            check="5*",
            value=A.number(A.column("my_age")),
            else_=A.number(A.text("100")),
        )
    )
    assert isinstance(mapper, AutoMapper)

    specs: Dict[str, Column] = mapper.get_column_specs(source_df=patients)
    for name, expression in specs.items():
        print(f"{name}: {expression}")

    expected = (
        when(col("b.my_age").rlike("5*"), col("b.my_age").cast(IntegerType()))
        .otherwise(lit("100").cast(StringType()).cast(IntegerType()))
        .alias("age")
    )
    assert str(specs["age"]) == str(expected)

    result_df: DataFrame = mapper.transform(df=members)

    # Assert
    result_df.printSchema()
    result_df.show()
    ages = {
        member_id: result_df.where(f"member_id == {member_id}")
        .select("age")
        .collect()[0][0]
        for member_id in (1, 2)
    }
    assert ages[1] == 54
    assert ages[2] == 100
    assert dict(result_df.dtypes)["age"] == "int"
def test_auto_mapper_complex_with_extension(
        spark_session: SparkSession) -> None:
    """
    .complex() with a nested AutoMapperList of extension objects should map
    the scalar attributes normally and build the extension structure.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=AutoMapperList([
                MyProcessingStatusExtension(
                    processing_status=A.text("foo"),
                    request_id=A.text("bar"),
                    date_processed=A.date("2021-01-01"),
                )
            ]),
        ))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=df)
    # Assert: scalar columns keep their simple casts despite the extension
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name"))
    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("long").alias("age"))
    result_df.printSchema()
    result_df.show(truncate=False)
    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_optional_ifexists(spark_session: SparkSession) -> None:
    """AutoMapperIfColumnExistsType uses if_exists when the source column is
    present and if_not_exists when it is missing."""
    # Arrange
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", "54"), (2, "Vidal", "Michael", None)],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    patients: DataFrame = spark_session.table("patients")
    members = patients.select("member_id")
    members.createOrReplaceTempView("members")

    # Act: "my_age" exists in the source, "foo" does not
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        optional_age=AutoMapperIfColumnExistsType(
            column=A.column("my_age"),
            if_exists=A.number(A.column("my_age")),
            if_not_exists=A.text("no age"),
        ),
        optional_foo=AutoMapperIfColumnExistsType(
            column=A.column("foo"),
            if_exists=A.text("foo col is there"),
            if_not_exists=A.text("no foo"),
        ),
    )
    assert isinstance(mapper, AutoMapper)

    specs: Dict[str, Column] = mapper.get_column_specs(source_df=patients)
    for name, expression in specs.items():
        print(f"{name}: {expression}")

    result_df: DataFrame = mapper.transform(df=members)

    # Assert
    result_df.printSchema()
    result_df.show()
    first_row = (
        result_df.where("member_id == 1")
        .select("optional_age", "optional_foo")
        .collect()[0]
    )
    second_row = (
        result_df.where("member_id == 2")
        .select("optional_age", "optional_foo")
        .collect()[0]
    )
    assert first_row[:] == (54, "no foo")
    assert second_row[:] == (None, "no foo")
def test_auto_mapper_number_typed(spark_session: SparkSession) -> None:
    """A.number on a column that is already integer-typed should add no cast."""
    # Arrange
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", "54"), (2, "Vidal", "Michael", "67")],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    patients: DataFrame = spark_session.table("patients")
    # pre-type the age column so the mapper sees an int, not a string
    patients = patients.withColumn("my_age", col("my_age").cast("int"))
    members = patients.select("member_id")
    members.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.number(A.column("my_age")))
    assert isinstance(mapper, AutoMapper)

    specs: Dict[str, Column] = mapper.get_column_specs(source_df=patients)
    for name, expression in specs.items():
        print(f"{name}: {expression}")

    # already numeric, so the spec is a bare column reference
    assert_compare_expressions(specs["age"], col("b.my_age").alias("age"))

    result_df: DataFrame = mapper.transform(df=members)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select("age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == 67
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_complex_with_defined_class(
        spark_session: SparkSession) -> None:
    """.complex() with a defined class maps each attribute to its own
    typed output column."""
    # Arrange
    spark_session.createDataFrame(
        [(1, "Qureshi", "Imran", 45), (2, "Vidal", "Michael", 35)],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    patients: DataFrame = spark_session.table("patients")
    members = patients.select("member_id")
    members.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(name=A.column("last_name"), age=A.number(A.column("my_age")))
    )
    assert isinstance(mapper, AutoMapper)

    specs: Dict[str, Column] = mapper.get_column_specs(source_df=patients)
    for name, expression in specs.items():
        print(f"{name}: {expression}")

    result_df: DataFrame = mapper.transform(df=members)

    # Assert
    expected_name = col("b.last_name").cast("string").alias("name")
    expected_age = col("b.my_age").cast("int").alias("age")
    assert str(specs["name"]) == str(expected_name)
    assert str(specs["age"]) == str(expected_age)

    result_df.printSchema()
    result_df.show()
    mapped_name = (
        result_df.where("member_id == 1").select("name").collect()[0][0]
    )
    assert mapped_name == "Qureshi"
    assert dict(result_df.dtypes)["age"] == "int"
def test_auto_mapper_schema_pruning_with_defined_class(
    spark_session: SparkSession,
) -> None:
    """
    A mapper built without keys (schema-pruning path) should still map the
    defined class attributes when transforming the source frame directly.
    """
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    # Act: no keys provided — transform operates on the source frame itself
    mapper = AutoMapper(
        view="members",
        source_view="patients",
    ).complex(
        MyClass(name=A.column("last_name"), age=A.number(A.column("my_age"))))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=source_df)
    # Assert
    assert_compare_expressions(sql_expressions["name"],
                               col("b.last_name").cast("string").alias("name"))
    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").cast("long").alias("age"))
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_schema_pruning_with_extension(
    spark_session: SparkSession,
) -> None:
    """
    With enable_schema_pruning=True the mapped output schema (including the
    nested extension structure) must match the pruned expected schema.
    """
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        enable_schema_pruning=True,
        skip_schema_validation=[],
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=AutoMapperList([
                MyProcessingStatusExtension(
                    processing_status=A.text("foo"),
                    request_id=A.text("bar"),
                    date_processed=A.date("2021-01-01"),
                )
            ]),
        ))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=source_df)
    # Assert
    assert_compare_expressions(sql_expressions["name"],
                               col("b.last_name").cast("string").alias("name"))
    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").cast("long").alias("age"))
    result_df.printSchema()
    result_df.show(truncate=False)
    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
    # confirm schema: extension is an array of url/valueString structs
    expected_schema: StructType = StructType([
        StructField("name", StringType(), False),
        StructField("age", LongType(), True),
        StructField(
            "extension",
            ArrayType(
                StructType([
                    StructField("url", StringType()),
                    StructField(
                        "extension",
                        ArrayType(
                            StructType([
                                StructField("url", StringType()),
                                StructField("valueString", StringType()),
                            ])),
                    ),
                ])),
            True,
        ),
    ])
    result: SchemaComparerResult = SchemaComparer.compare_schema(
        parent_column_name=None,
        source_schema=result_df.schema,
        desired_schema=expected_schema,
    )
    # SchemaComparer reports differences as a list of errors; empty == match
    assert result.errors == [], str(result)
def test_automapper_first_valid_column_multiple_sources(
    spark_session: SparkSession,
) -> None:
    """
    A.first_valid_column must pick the first candidate column that exists
    in the current source, so one mapper works against sources with
    different column sets ("my_age" vs "age").

    NOTE(review): this test was previously defined with the same name as
    another test in this module (test_automapper_first_valid_column); the
    later definition shadowed this one, so pytest never collected or ran
    it. Renamed so both tests execute.
    """
    # Arrange: first source exposes the age under "my_age"
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df_1: DataFrame = spark_session.table("patients")
    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on
    # sources with different columns, and they both work as expected.

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.column("age")),
            A.number(A.column("my_age")),
            A.text(None),
        ),
        age2=A.first_valid_column(
            A.if_not_null(
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.number(A.text("100")),
            ),
            A.number(A.column("age")),
            A.number(A.column("his_age")),
            A.number(99999),
        ),
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions_1: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_1)
    for column_name, sql_expression in sql_expressions_1.items():
        print(f"{column_name}: {sql_expression}")
    # "age" is absent from the first source, so "my_age" is selected
    assert str(sql_expressions_1["age"]) == str(
        col("b.my_age").cast("long").alias("age"))
    result_df_1: DataFrame = mapper.transform(df=df)

    # Second source exposes the age under "age" (member 2 has no age)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "age"],
    ).createOrReplaceTempView("patients")
    source_df_2 = spark_session.table("patients")
    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    sql_expressions_2: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_2)
    # against the second source the "age" column itself is selected
    assert str(sql_expressions_2["age"]) == str(
        col("b.age").cast("long").alias("___age"))
    result_df_2 = mapper.transform(df=df)

    # Assert
    result_df_1.printSchema()
    result_df_1.show()
    result_df_2.printSchema()
    result_df_2.show()
    assert result_df_1.where("member_id == 1").select(
        "age", "age2").collect()[0][:] == (54, 54)
    assert result_df_1.where("member_id == 2").select(
        "age", "age2").collect()[0][:] == (33, 33)
    assert result_df_2.where("member_id == 1").select(
        "age", "age2").collect()[0][:] == (54, 54)
    # null age in the second source falls through to the else value 100
    assert result_df_2.where("member_id == 2").select(
        "age", "age2").collect()[0][:] == (None, 100)
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """
    A.first_valid_column combined with A.expression and A.map: the mapper
    should pick the first candidate whose referenced column exists in the
    current source ("my_age" here, since "age" is absent).
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")
    # The key thing in this test is that we are using the same mapper on
    # sources with different columns, and they both work as expected.
    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.expression("CAST (age AS BIGINT)")),
            A.number(A.expression("CAST (my_age AS BIGINT)")),
            A.text(None),
        ),
        is_young=A.first_valid_column(
            A.map(
                A.column("age"),
                {
                    "21": "yes",
                    "33": "yes",
                    "54": "no comment",
                    None: "not provided"
                },
            ),
            A.map(
                A.column("my_age"),
                {
                    "21": "yes",
                    "33": "yes",
                    "54": "no comment",
                    None: "not provided"
                },
            ),
        ),
    )
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    # double cast: one from the SQL CAST expression, one added by A.number
    assert_compare_expressions(
        sql_expressions["age"],
        col("my_age").cast("long").cast("long").alias("age"))
    result_df: DataFrame = mapper.transform(df=df)
    # Assert
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select(
        "age", "is_young").collect()[0][:] == (54, "no comment")
    assert result_df.where("member_id == 2").select(
        "age", "is_young").collect()[0][:] == (33, "yes")