def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """Apply one mapper to two ``patients`` schemas and check the results.

    The first ``patients`` view exposes ``my_age``; the second exposes ``age``
    (with a null for member 2). The same ``AutoMapper`` instance is used for
    both, and the test asserts the column spec generated for ``age`` as well
    as the transformed ``age`` / ``age2`` values for each member.

    :param spark_session: Spark session used to create the input views.
    """
    # NOTE(review): another test with this exact name is visible further down
    # in this source; if both live in one module the later definition shadows
    # this one and it is never collected -- confirm and rename if so.

    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df_1: DataFrame = spark_session.table("patients")

    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on
    # sources with different columns, and they both work as expected.

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.column("age")),
            A.number(A.column("my_age")),
            A.text(None),
        ),
        # age2 nests first_valid_column inside if_not_null: the nested result
        # is used when present, otherwise the literal "100" is the fallback.
        age2=A.first_valid_column(
            A.if_not_null(
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.number(A.text("100")),
            ),
            A.number(A.column("age")),
            A.number(A.column("his_age")),
            A.number(99999),
        ),
    )

    assert isinstance(mapper, AutoMapper)

    # --- First source: only ``my_age`` is available. ---
    sql_expressions_1: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_1
    )
    for column_name, sql_expression in sql_expressions_1.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions_1["age"]) == str(
        col("b.my_age").cast("long").alias("age")
    )
    result_df_1: DataFrame = mapper.transform(df=df)

    # --- Second source: ``age`` is available, null for member 2. ---
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "age"],
    ).createOrReplaceTempView("patients")

    source_df_2 = spark_session.table("patients")

    # Build the members view from the second source (previously this reused
    # the stale ``source_df_1``, which only worked because the keys matched).
    df = source_df_2.select("member_id")
    df.createOrReplaceTempView("members")

    sql_expressions_2: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_2
    )
    assert str(sql_expressions_2["age"]) == str(
        col("b.age").cast("long").alias("___age")
    )

    result_df_2 = mapper.transform(df=df)

    # Assert
    result_df_1.printSchema()
    result_df_1.show()

    result_df_2.printSchema()
    result_df_2.show()

    assert result_df_1.where("member_id == 1").select(
        "age", "age2"
    ).collect()[0][:] == (54, 54)
    assert result_df_1.where("member_id == 2").select(
        "age", "age2"
    ).collect()[0][:] == (33, 33)

    assert result_df_2.where("member_id == 1").select(
        "age", "age2"
    ).collect()[0][:] == (54, 54)
    # Member 2 has a null age in the second source, so age2 is expected to
    # fall back to the literal 100.
    assert result_df_2.where("member_id == 2").select(
        "age", "age2"
    ).collect()[0][:] == (None, 100)
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """Check ``A.first_valid_column`` with expression and map candidates.

    The ``patients`` view only carries ``my_age``, so both mapped columns list
    an ``age``-based candidate first and a ``my_age``-based candidate second.
    The test asserts the generated ``age`` column spec and the transformed
    ``age`` / ``is_young`` values for each member.

    :param spark_session: Spark session used to create the input views.
    """
    # Arrange
    patient_rows = [
        (1, "Qureshi", "Imran", "54"),
        (2, "Vidal", "Michael", "33"),
    ]
    patient_columns = ["member_id", "last_name", "first_name", "my_age"]
    spark_session.createDataFrame(
        patient_rows, patient_columns
    ).createOrReplaceTempView("patients")

    patients_df: DataFrame = spark_session.table("patients")

    members_df = patients_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    unmapped = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    )

    last_name_spec = A.column("last_name")

    # Candidates are listed in priority order; ``age`` does not exist in the
    # source view, ``my_age`` does.
    age_candidates = (
        A.number(A.expression("CAST (age AS BIGINT)")),
        A.number(A.expression("CAST (my_age AS BIGINT)")),
        A.text(None),
    )

    # Same lookup table for both candidates; each gets its own dict copy.
    label_by_age = {
        "21": "yes",
        "33": "yes",
        "54": "no comment",
        None: "not provided",
    }
    is_young_candidates = (
        A.map(A.column("age"), dict(label_by_age)),
        A.map(A.column("my_age"), dict(label_by_age)),
    )

    mapper = unmapped.columns(
        last_name=last_name_spec,
        age=A.first_valid_column(*age_candidates),
        is_young=A.first_valid_column(*is_young_candidates),
    )

    assert isinstance(mapper, AutoMapper)

    column_specs: Dict[str, Column] = mapper.get_column_specs(
        source_df=patients_df
    )
    for column_name, sql_expression in column_specs.items():
        print(f"{column_name}: {sql_expression}")

    expected_age_spec = col("my_age").cast("long").cast("long").alias("age")
    assert_compare_expressions(column_specs["age"], expected_age_spec)

    mapped_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    mapped_df.printSchema()
    mapped_df.show()

    expected_by_filter = {
        "member_id == 1": (54, "no comment"),
        "member_id == 2": (33, "yes"),
    }
    for row_filter, expected_values in expected_by_filter.items():
        selected = mapped_df.where(row_filter).select("age", "is_young")
        assert selected.collect()[0][:] == expected_values