# Example 1
def test_automapper_map(spark_session: SparkSession) -> None:
    """A.map should translate source values via the dict, handle a None key,
    and fall back to the supplied default for unmatched values."""
    # Arrange
    rows = [
        (1, "Qureshi", "Imran", "Y"),
        (2, "Vidal", "Michael", "N"),
        (3, "Vidal", "Michael", "f"),
        (4, "Qureshi", "Imran", None),
    ]
    columns = ["member_id", "last_name", "first_name", "has_kids"]
    spark_session.createDataFrame(rows, columns).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        has_kids=A.map(
            A.column("has_kids"),
            {None: "Unspecified", "Y": "Yes", "N": "No"},
            "unknown",
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Expect a null-safe when/otherwise chain ending in the default value.
    expected = (
        when(col("b.has_kids").eqNullSafe(lit(None)), lit("Unspecified"))
        .when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit("unknown"))
        .alias("___has_kids")
    )
    assert_compare_expressions(sql_expressions["has_kids"], expected)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def mapped_value(member_id: int) -> str:
        # Fetch the translated has_kids value for one member.
        return (
            result_df.where(f"member_id == {member_id}")
            .select("has_kids")
            .collect()[0][0]
        )

    assert mapped_value(1) == "Yes"
    assert mapped_value(2) == "No"
    assert mapped_value(3) == "unknown"
    assert mapped_value(4) == "Unspecified"
# Example 2
def test_automapper_map_no_default(spark_session: SparkSession) -> None:
    """A.map without a default should map unmatched values to null.

    keep_null_rows=True is required so the row whose mapped value is null
    is retained in the output view.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        keep_null_rows=True,
    ).columns(has_kids=A.map(A.column("has_kids"), {"Y": "Yes", "N": "No"}))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Compare expressions with assert_compare_expressions, as the other
    # tests in this file do, instead of fragile str() equality on Columns.
    assert_compare_expressions(
        sql_expressions["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit(None))
        .alias("___has_kids"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "has_kids").collect()[0][0] == "Yes"
    assert result_df.where("member_id == 2").select(
        "has_kids").collect()[0][0] == "No"
    # "f" has no mapping and there is no default, so the result is null.
    assert result_df.where("member_id == 3").select(
        "has_kids").collect()[0][0] is None
# Example 3
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """A.first_valid_column should pick the first alternative whose columns
    actually exist in the source view."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on
    # sources with different columns, and they both work as expected.

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.expression("CAST (age AS BIGINT)")),
            A.number(A.expression("CAST (my_age AS BIGINT)")),
            A.text(None),
        ),
        is_young=A.first_valid_column(
            A.map(
                A.column("age"),
                {"21": "yes", "33": "yes", "54": "no comment", None: "not provided"},
            ),
            A.map(
                A.column("my_age"),
                {"21": "yes", "33": "yes", "54": "no comment", None: "not provided"},
            ),
        ),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # "age" is not in the source, so the second alternative (my_age) wins.
    assert_compare_expressions(
        sql_expressions["age"],
        col("my_age").cast("long").cast("long").alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    row_one = result_df.where("member_id == 1").select("age", "is_young").collect()[0]
    assert row_one[:] == (54, "no comment")
    row_two = result_df.where("member_id == 2").select("age", "is_young").collect()[0]
    assert row_two[:] == (33, "yes")