def test_auto_mapper_number(spark_session: SparkSession) -> None:
    """A.number() maps string columns to integral values, including a value
    beyond 32-bit range, and maps a null literal to a typed null column."""
    # Arrange: source view with string ages (one larger than int32 max)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
            (3, "Old", "Methusela", "131026061001"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.number(A.column("my_age")),
        null_field=A.number(AutoMapperDataTypeLiteral(None)),
    )

    assert isinstance(mapper, AutoMapper)
    specs: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for spec_name, spec in specs.items():
        print(f"{spec_name}: {spec}")

    # Either an int or a long cast is acceptable depending on mapper version.
    acceptable_age_specs = {
        str(col("b.my_age").cast("int").alias("age")),
        str(col("b.my_age").cast("long").alias("age")),
    }
    assert str(specs["age"]) in acceptable_age_specs

    assert str(specs["null_field"]) == str(
        lit(None).cast("long").alias("null_field"))

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    for member_id, expected_age in ((1, 54), (2, 67), (3, 131026061001)):
        actual_age = result_df.where(f"member_id == {member_id}").select(
            "age").collect()[0][0]
        assert actual_age == expected_age

    assert (
        result_df.where("member_id == 1").select("null_field").collect()[0][0]
        is None)

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
# Example 2
def test_automapper_if_list(spark_session: SparkSession) -> None:
    """A.if_ with a list check keeps the mapped value when the source column
    is in the list and falls back to else_ otherwise (including for null)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Qureshi", "Imran", "59"),
            (3, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
    ).columns(
        age=A.if_(
            column=A.column("my_age"),
            check=["54", "59"],
            value=A.number(A.column("my_age")),
            else_=A.number(A.text("100")),
        ))

    assert isinstance(mapper, AutoMapper)
    specs: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for spec_name, spec in specs.items():
        print(f"{spec_name}: {spec}")

    assert_compare_expressions(
        specs["age"],
        when(col("b.my_age").isin(["54", "59"]),
             col("b.my_age").cast("long")).otherwise(
                 lit("100").cast(StringType()).cast(LongType())).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    # Members 1 and 2 match the check list; member 3 (null) takes else_.
    for member_id, expected_age in ((1, 54), (2, 59), (3, 100)):
        actual_age = result_df.where(f"member_id == {member_id}").select(
            "age").collect()[0][0]
        assert actual_age == expected_age
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_complex_with_skip_if_null(
        spark_session: SparkSession) -> None:
    """Rows whose ``skip_if_columns_null_or_empty`` column is null or empty
    are excluded from the mapped output entirely.

    Member 2 has an empty ``first_name``, so only member 1 survives.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=True,
        skip_if_columns_null_or_empty=["first_name"],
    ).complex(
        MyClass(
            id_=A.column("member_id"),
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert: each column spec wraps its mapping in a null/empty guard on
    # first_name that yields null for rows to be skipped.
    assert str(sql_expressions["name"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None)).otherwise(col("b.last_name")).cast(
                StringType()).alias("name"))
    assert str(sql_expressions["age"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None)).otherwise(col("b.my_age")).cast(
                LongType()).alias("age"))

    result_df.printSchema()

    result_df.show()

    # Only the row with a non-empty first_name remains.
    assert result_df.count() == 1
    assert result_df.where("id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
# Example 4
def test_automapper_if_regex(spark_session: SparkSession) -> None:
    """A.if_regex maps the value when the column matches the pattern and
    falls back to else_ otherwise (a null column does not match)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
    ).columns(
        age=A.if_regex(
            column=A.column("my_age"),
            check="5*",
            value=A.number(A.column("my_age")),
            else_=A.number(A.text("100")),
        ))

    assert isinstance(mapper, AutoMapper)
    specs: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for spec_name, spec in specs.items():
        print(f"{spec_name}: {spec}")

    expected_age_spec = when(
        col("b.my_age").rlike("5*"),
        col("b.my_age").cast(IntegerType())).otherwise(
            lit("100").cast(StringType()).cast(IntegerType())).alias("age")
    assert str(specs["age"]) == str(expected_age_spec)

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] == "int"
def test_auto_mapper_complex_with_extension(
        spark_session: SparkSession) -> None:
    """.complex() maps a class that carries an extension list built from
    literal text/date values alongside the regular mapped columns."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            # Extension entries are built purely from literals, so they do
            # not depend on the source rows.
            extension=AutoMapperList([
                MyProcessingStatusExtension(
                    processing_status=A.text("foo"),
                    request_id=A.text("bar"),
                    date_processed=A.date("2021-01-01"),
                )
            ]),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name"))
    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
# Example 6
def test_automapper_optional_ifexists(spark_session: SparkSession) -> None:
    """AutoMapperIfColumnExistsType uses if_exists when the source column is
    present and if_not_exists when it is missing."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act: my_age exists in the source; foo does not
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        optional_age=AutoMapperIfColumnExistsType(
            column=A.column("my_age"),
            if_exists=A.number(A.column("my_age")),
            if_not_exists=A.text("no age"),
        ),
        optional_foo=AutoMapperIfColumnExistsType(
            column=A.column("foo"),
            if_exists=A.text("foo col is there"),
            if_not_exists=A.text("no foo"),
        ),
    )

    assert isinstance(mapper, AutoMapper)
    specs: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for spec_name, spec in specs.items():
        print(f"{spec_name}: {spec}")

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    first_row = result_df.where("member_id == 1").select(
        "optional_age", "optional_foo").collect()[0]
    assert first_row[:] == (54, "no foo")

    second_row = result_df.where("member_id == 2").select(
        "optional_age", "optional_foo").collect()[0]
    assert second_row[:] == (None, "no foo")
def test_auto_mapper_number_typed(spark_session: SparkSession) -> None:
    """A.number() on an already-numeric column produces a spec without a
    cast."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Cast my_age to int on the local DataFrame so get_column_specs sees a
    # numeric column.  NOTE(review): the "patients" temp view still holds the
    # original string column, so transform() reads uncast data — confirm the
    # mapper handles that as intended.
    source_df = source_df.withColumn("my_age", col("my_age").cast("int"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"
                              ]).columns(age=A.number(A.column("my_age")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # No cast in the expression since the source column is already numeric.
    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 67

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_complex_with_defined_class(
        spark_session: SparkSession) -> None:
    """.complex() with a defined class maps each constructor argument to an
    output column typed per the class declaration (age is int here)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
        ))

    assert isinstance(mapper, AutoMapper)
    specs: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for spec_name, spec in specs.items():
        print(f"{spec_name}: {spec}")

    result_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    assert str(specs["name"]) == str(
        col("b.last_name").cast("string").alias("name"))
    assert str(specs["age"]) == str(
        col("b.my_age").cast("int").alias("age"))

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] == "int"
# Example 9
def test_auto_mapper_schema_pruning_with_defined_class(
    spark_session: SparkSession, ) -> None:
    """Maps a defined class without keys or drop settings.

    NOTE(review): despite the name, no schema-pruning flag is passed here —
    presumably pruning is the default behaviour; confirm against AutoMapper.
    """
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
    ).complex(
        MyClass(name=A.column("last_name"), age=A.number(A.column("my_age"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    assert_compare_expressions(sql_expressions["name"],
                               col("b.last_name").cast("string").alias("name"))
    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
# Example 10
def test_auto_mapper_schema_pruning_with_extension(
    spark_session: SparkSession, ) -> None:
    """With ``enable_schema_pruning=True`` the output schema retains only the
    extension fields actually populated; verified via SchemaComparer."""
    # Arrange
    clean_spark_session(spark_session)

    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        enable_schema_pruning=True,
        skip_schema_validation=[],
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
            extension=AutoMapperList([
                MyProcessingStatusExtension(
                    processing_status=A.text("foo"),
                    request_id=A.text("bar"),
                    date_processed=A.date("2021-01-01"),
                )
            ]),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    assert_compare_expressions(sql_expressions["name"],
                               col("b.last_name").cast("string").alias("name"))
    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

    # confirm schema: the extension array is pruned down to url plus the
    # nested (url, valueString) extension pairs that were populated above.
    expected_schema: StructType = StructType([
        StructField("name", StringType(), False),
        StructField("age", LongType(), True),
        StructField(
            "extension",
            ArrayType(
                StructType([
                    StructField("url", StringType()),
                    StructField(
                        "extension",
                        ArrayType(
                            StructType([
                                StructField("url", StringType()),
                                StructField("valueString", StringType()),
                            ])),
                    ),
                ])),
            True,
        ),
    ])

    result: SchemaComparerResult = SchemaComparer.compare_schema(
        parent_column_name=None,
        source_schema=result_df.schema,
        desired_schema=expected_schema,
    )

    assert result.errors == [], str(result)
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """first_valid_column selects the first candidate whose source columns
    exist, letting one mapper serve sources with different column sets."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df_1: DataFrame = spark_session.table("patients")

    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on sources with different columns, and they both
    # work as expected.

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        # Candidates are tried in order until one's columns exist.
        age=A.first_valid_column(
            A.number(A.column("age")),
            A.number(A.column("my_age")),
            A.text(None),
        ),
        # Nested candidates with an if_not_null fallback of 100 for nulls.
        age2=A.first_valid_column(
            A.if_not_null(
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.first_valid_column(
                    A.number(A.column("age")),
                    A.number(A.column("my_age")),
                    A.text(None),
                ),
                A.number(A.text("100")),
            ),
            A.number(A.column("age")),
            A.number(A.column("his_age")),
            A.number(99999),
        ),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions_1: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_1)
    for column_name, sql_expression in sql_expressions_1.items():
        print(f"{column_name}: {sql_expression}")

    # First source has my_age but not age, so the second candidate wins.
    assert str(sql_expressions_1["age"]) == str(
        col("b.my_age").cast("long").alias("age"))
    result_df_1: DataFrame = mapper.transform(df=df)

    # Second source: the age column is named "age" and one value is null.
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "age"],
    ).createOrReplaceTempView("patients")

    source_df_2 = spark_session.table("patients")

    df = source_df_1.select("member_id")
    df.createOrReplaceTempView("members")

    sql_expressions_2: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df_2)
    # NOTE(review): the "___age" alias is presumably an internal temporary
    # name produced when specs are re-derived — confirm against AutoMapper.
    assert str(sql_expressions_2["age"]) == str(
        col("b.age").cast("long").alias("___age"))

    result_df_2 = mapper.transform(df=df)

    # Assert
    result_df_1.printSchema()
    result_df_1.show()

    result_df_2.printSchema()
    result_df_2.show()

    assert result_df_1.where("member_id == 1").select(
        "age", "age2").collect()[0][:] == (54, 54)
    assert result_df_1.where("member_id == 2").select(
        "age", "age2").collect()[0][:] == (33, 33)

    assert result_df_2.where("member_id == 1").select(
        "age", "age2").collect()[0][:] == (54, 54)
    assert result_df_2.where("member_id == 2").select(
        "age", "age2").collect()[0][:] == (None, 100)
# Example 12
def test_automapper_first_valid_column_with_map(
        spark_session: SparkSession) -> None:
    """first_valid_column picks the first candidate whose source columns
    exist; here candidates are SQL expressions and A.map lookups.

    NOTE: this function previously shared the name
    ``test_automapper_first_valid_column`` with an earlier test in this
    module, so the earlier definition was shadowed and never collected by
    pytest.  Renamed so both tests run.
    """
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "33"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # The key thing in this test is that we are using the same mapper on
    # sources with different columns, and they both work as expected.

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        last_name=A.column("last_name"),
        # "age" does not exist in the source, so the second candidate
        # (based on "my_age") is chosen.
        age=A.first_valid_column(
            A.number(A.expression("CAST (age AS BIGINT)")),
            A.number(A.expression("CAST (my_age AS BIGINT)")),
            A.text(None),
        ),
        # Same fallback behaviour for a dictionary lookup; the None key
        # supplies the value used for null ages.
        is_young=A.first_valid_column(
            A.map(
                A.column("age"),
                {
                    "21": "yes",
                    "33": "yes",
                    "54": "no comment",
                    None: "not provided"
                },
            ),
            A.map(
                A.column("my_age"),
                {
                    "21": "yes",
                    "33": "yes",
                    "54": "no comment",
                    None: "not provided"
                },
            ),
        ),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        col("my_age").cast("long").cast("long").alias("age"))
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age", "is_young").collect()[0][:] == (54, "no comment")
    assert result_df.where("member_id == 2").select(
        "age", "is_young").collect()[0][:] == (33, "yes")