def test_auto_mapper_datetime_regex_replace_format(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1/13/1995"),
            (2, "1/3/1995"),
            (3, "11/3/1995"),
        ],
        ["member_id", "opening_date"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).columns(formatted_date=A.datetime(
                            value=A.regex_replace(A.column("opening_date"),
                                                  pattern=r"\b(\d)(?=/)",
                                                  replacement="0$1"),
                            formats=["M/dd/yyyy"],
                        ).to_date_format("yyyy-M-dd"))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=source_df)

    assert (result_df.where("member_id == 1").select(
        "formatted_date").collect()[0][0] == "1995-1-13")
    assert (result_df.where("member_id == 2").select(
        "formatted_date").collect()[0][0] == "1995-1-03")
    assert (result_df.where("member_id == 3").select(
        "formatted_date").collect()[0][0] == "1995-11-03")
예제 #2
0
def test_auto_mapper_regex_replace(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=[
            "member_id"
        ]).columns(my_column=A.regex_replace(A.column("last_name"), "i", "f"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        regexp_replace(col("b.last_name"), "i", "f").alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    # noinspection SpellCheckingInspection
    assert (result_df.where("member_id == 1").select("my_column").collect()[0]
            [0] == "Qureshf")
    # noinspection SpellCheckingInspection
    assert (result_df.where("member_id == 2").select("my_column").collect()[0]
            [0] == "Vfdal")