def test_automapper_map(spark_session: SparkSession) -> None:
    """A.map() with a None key and a default translates every source value."""
    # Arrange: has_kids covers a mapped "Y", a mapped "N", an unmapped "f",
    # and a null row, so every branch of the mapping is exercised.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
            (4, "Qureshi", "Imran", None),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")
    patients_df: DataFrame = spark_session.table("patients")
    members_df = patients_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act: Y/N/None map to readable labels; anything else falls back to "unknown".
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        has_kids=A.map(
            A.column("has_kids"),
            {None: "Unspecified", "Y": "Yes", "N": "No"},
            "unknown",
        )
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=patients_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # The generated spec should be a null-safe when/otherwise chain.
    assert_compare_expressions(
        sql_expressions["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit(None)), lit("Unspecified"))
        .when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit("unknown"))
        .alias("___has_kids"),
    )

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert: each source value lands on its mapped label.
    result_df.printSchema()
    result_df.show()
    assert (
        result_df.where("member_id == 1").select("has_kids").collect()[0][0]
        == "Yes"
    )
    assert (
        result_df.where("member_id == 2").select("has_kids").collect()[0][0]
        == "No"
    )
    assert (
        result_df.where("member_id == 3").select("has_kids").collect()[0][0]
        == "unknown"
    )
    assert (
        result_df.where("member_id == 4").select("has_kids").collect()[0][0]
        == "Unspecified"
    )
def test_automapper_map_no_default(spark_session: SparkSession) -> None:
    """A.map() without a default maps unmatched values to null."""
    # Arrange: "f" has no mapping entry, so it should come out as null.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act: keep_null_rows=True keeps member 3's null-mapped row in the output.
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        keep_null_rows=True,
    ).columns(has_kids=A.map(A.column("has_kids"), {"Y": "Yes", "N": "No"}))
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Compare expression trees with the shared helper instead of str(Column):
    # Column reprs vary across Spark versions, and the sibling tests already
    # use assert_compare_expressions for this check.
    assert_compare_expressions(
        sql_expressions["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit(None))
        .alias("___has_kids"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: mapped values translate; the unmapped value is null.
    result_df.printSchema()
    result_df.show()
    assert (
        result_df.where("member_id == 1").select("has_kids").collect()[0][0]
        == "Yes"
    )
    assert (
        result_df.where("member_id == 2").select("has_kids").collect()[0][0]
        == "No"
    )
    assert (
        result_df.where("member_id == 3").select("has_kids").collect()[0][0]
        is None
    )
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """first_valid_column() picks whichever candidate column exists in the source."""
    # Arrange: the source has "my_age" but NOT "age", so the mapper must fall
    # through to the second candidate in each first_valid_column call.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    patients_df: DataFrame = spark_session.table("patients")
    members_df = patients_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on
    # sources with different columns, and they both work as expected.

    # Act
    age_lookup = {
        "21": "yes",
        "33": "yes",
        "54": "no comment",
        None: "not provided",
    }
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.expression("CAST (age AS BIGINT)")),
            A.number(A.expression("CAST (my_age AS BIGINT)")),
            A.text(None),
        ),
        is_young=A.first_valid_column(
            A.map(A.column("age"), dict(age_lookup)),
            A.map(A.column("my_age"), dict(age_lookup)),
        ),
    )
    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=patients_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # "age" does not exist in this source, so the spec resolves to "my_age".
    assert_compare_expressions(
        sql_expressions["age"],
        col("my_age").cast("long").cast("long").alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert result_df.where("member_id == 1").select("age", "is_young").collect()[
        0
    ][:] == (54, "no comment")
    assert result_df.where("member_id == 2").select("age", "is_young").collect()[
        0
    ][:] == (33, "yes")