def test_auto_mapper_datetime_regex_replace_format(
    spark_session: SparkSession,
) -> None:
    """Verify ``A.datetime`` applied to a regex-normalised string column.

    A lone digit that sits at a word boundary and is directly followed by
    ``/`` gets a ``0`` put in front of it by the regex replacement. The
    result is then handed to ``A.datetime`` with the ``M/dd/yyyy`` format and
    the test expects the final column to read as ``yyyy-M-dd``.
    """
    # Arrange
    rows = [
        (1, "1/13/1995"),
        (2, "1/3/1995"),
        (3, "11/3/1995"),
    ]
    schema = ["member_id", "opening_date"]
    spark_session.createDataFrame(rows, schema).createOrReplaceTempView(
        "patients"
    )

    source_df: DataFrame = spark_session.table("patients")

    # Act
    # The mapper is created before the column expressions, matching the
    # evaluation order of the single chained expression it replaces.
    base_mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    )
    padded_date = A.regex_replace(
        A.column("opening_date"),
        pattern=r"\b(\d)(?=/)",
        replacement="0$1",
    )
    parsed_date = A.datetime(value=padded_date, formats=["M/dd/yyyy"])
    mapper = base_mapper.columns(
        formatted_date=parsed_date.to_date_format("yyyy-M-dd")
    )

    assert isinstance(mapper, AutoMapper)

    # Dump the generated column specs so they show up in the test output.
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    # (row filter, expected formatted_date) pairs, checked in this order.
    expectations = [
        ("member_id == 1", "1995-1-13"),
        ("member_id == 2", "1995-1-03"),
        ("member_id == 3", "1995-11-03"),
    ]
    for row_filter, expected_date in expectations:
        matching_rows = (
            result_df.where(row_filter).select("formatted_date").collect()
        )
        assert matching_rows[0][0] == expected_date
def test_auto_mapper_regex_replace(spark_session: SparkSession) -> None:
    """Verify ``A.regex_replace`` on a plain string column.

    Every lowercase ``i`` in ``last_name`` is replaced by ``f``. The test
    checks both the generated Spark column expression and the values in the
    transformed data frame.
    """
    # Arrange
    rows = [
        (1, "Qureshi", "Imran", "1970-01-01"),
        (2, "Vidal", "Michael", "1970-02-02"),
    ]
    schema = ["member_id", "last_name", "first_name", "date_of_birth"]
    spark_session.createDataFrame(rows, schema).createOrReplaceTempView(
        "patients"
    )

    source_df: DataFrame = spark_session.table("patients")

    # The destination view starts out holding only the key column.
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    # The mapper is created before the column expression, matching the
    # evaluation order of the single chained expression it replaces.
    base_mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    )
    replaced_last_name = A.regex_replace(A.column("last_name"), "i", "f")
    mapper = base_mapper.columns(my_column=replaced_last_name)

    assert isinstance(mapper, AutoMapper)

    # Dump the generated column specs so they show up in the test output.
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    actual_expression = sql_expressions["my_column"]
    expected_expression = regexp_replace(col("b.last_name"), "i", "f").alias(
        "my_column"
    )
    assert_compare_expressions(actual_expression, expected_expression)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    # (row filter, expected my_column) pairs, checked in this order.
    # noinspection SpellCheckingInspection
    expectations = [
        ("member_id == 1", "Qureshf"),
        ("member_id == 2", "Vfdal"),
    ]
    for row_filter, expected_value in expectations:
        matching_rows = (
            result_df.where(row_filter).select("my_column").collect()
        )
        assert matching_rows[0][0] == expected_value