# NOTE: shared imports assumed by the test excerpts below (pyspark >= 3.1 for
# functions.filter / functions.transform). Helpers such as clean_spark_session,
# assert_compare_expressions, MyClass, MyObject, Patient, and FhirId are
# fixtures defined elsewhere in the surrounding test project.
from datetime import date, datetime
from pathlib import Path
from typing import Dict

from pyspark.sql import Column, DataFrame, SparkSession
from pyspark.sql.functions import (array, array_join, coalesce, col, concat,
                                   expr, filter, from_unixtime, lit, lpad,
                                   regexp_extract, regexp_replace, struct,
                                   substring_index, to_date, to_timestamp,
                                   transform, when)
from pyspark.sql.types import IntegerType, LongType, StringType
from pytest import approx

from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.data_types.list import AutoMapperList
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A


def test_auto_mapper_array_single_item(spark_session: SparkSession) -> None:
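    # AutoMapperList with a single item should produce a one-element array with nulls filtered out.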
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1"]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(expr("address1")).isNotNull(),
            filter(coalesce(array(expr("address1")), array()),
                   lambda x: x.isNotNull()),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            == "address1")
def test_auto_mapper_date_literal(spark_session: SparkSession) -> None:
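    # A.date on a literal string coalesces several parse formats (y-M-d, yyyyMMdd, M/d/y) into a date.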
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(birthDate=A.date("1970-01-01"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["birthDate"],
        coalesce(
            to_date(lit("1970-01-01"), format="y-M-d"),
            to_date(lit("1970-01-01"), format="yyyyMMdd"),
            to_date(lit("1970-01-01"), format="M/d/y"),
        ).alias("birthDate"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "birthDate").collect()[0][0] == date(1970, 1, 1)
    assert result_df.where("member_id == 2").select(
        "birthDate").collect()[0][0] == date(1970, 1, 1)
def test_automapper_if_regex(spark_session: SparkSession) -> None:
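    # A.if_regex compiles to when(rlike(...)) with the else_ value as the fallback.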
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.if_regex(
        column=A.column("my_age"),
        check="5*",
        value=A.number(A.column("my_age")),
        else_=A.number(A.text("100")),
    ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(col("b.my_age").rlike("5*"),
             col("b.my_age").cast(LongType())).otherwise(
                 lit("100").cast(StringType()).cast(LongType())).alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_date_column_typed(spark_session: SparkSession) -> None:
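    # When the source column is already date-typed, A.date passes it through without re-parsing.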
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn(
        "date_of_birth", to_date("date_of_birth", format="yyyy-MM-dd"))

    assert dict(source_df.dtypes)["date_of_birth"] == "date"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(sql_expressions["birthDate"],
                               col("b.date_of_birth").alias("birthDate"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "birthDate").collect()[0][0] == date(1970, 1, 1)
    assert result_df.where("member_id == 2").select(
        "birthDate").collect()[0][0] == date(1970, 2, 2)

    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_lpad(spark_session: SparkSession) -> None:
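    # A.lpad left-pads to a fixed length; values already at that length are unchanged.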
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1234"),
            (2, "1234567"),
            (3, "123456789"),
        ],
        ["member_id", "empi"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.lpad(column=A.column("empi"), length=9, pad="0"))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)

    assert str(sql_expressions["my_column"]) == str(
        lpad(col=col("b.empi"), len=9, pad="0").alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "000001234"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "001234567"
    )

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "123456789"
    )
def test_auto_mapper_complex(spark_session: SparkSession) -> None:
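    # A.complex with literal keyword arguments builds a struct column.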
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family="imran"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert_compare_expressions(
        sql_expressions["dst2"],
        struct(lit("usual").alias("use"),
               lit("imran").alias("family")).alias("dst2"),
    )

    result_df.printSchema()
    result_df.show()

    result_df.where("member_id == 1").select("dst2").show()
    result_df.where("member_id == 1").select("dst2").printSchema()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1] == "imran"
def test_automapper_map_no_default(spark_session: SparkSession) -> None:
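    # A.map without a default maps values missing from the mapping to null.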
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        keep_null_rows=True,
    ).columns(has_kids=A.map(A.column("has_kids"), {"Y": "Yes", "N": "No"}))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["has_kids"]) == str(
        when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes")).when(
            col("b.has_kids").eqNullSafe(lit("N")),
            lit("No")).otherwise(lit(None)).alias("___has_kids"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "has_kids").collect()[0][0] == "Yes"
    assert result_df.where("member_id == 2").select(
        "has_kids").collect()[0][0] == "No"
    assert result_df.where("member_id == 3").select(
        "has_kids").collect()[0][0] is None
def test_auto_mapper_datetime_column_default(
        spark_session: SparkSession) -> None:
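    # A.unix_timestamp converts epoch-second strings into a timestamp column.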
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "18922"),
            (2, "Vidal", "Michael", "1609390500"),
        ],
        ["member_id", "last_name", "first_name", "ts"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"]).columns(timestamp=A.unix_timestamp(A.column("ts")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["timestamp"],
        to_timestamp(from_unixtime(col("b.ts"), "yyyy-MM-dd HH:mm:ss"),
                     "yyyy-MM-dd HH:mm:ss").alias("timestamp"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 2

    assert result_df.where("member_id == 1").select(
        "timestamp").collect()[0][0] == datetime(1970, 1, 1, 5, 15, 22)
    assert result_df.where("member_id == 2").select(
        "timestamp").collect()[0][0] == datetime(2020, 12, 31, 4, 55, 0)

    assert dict(result_df.dtypes)["timestamp"] == "timestamp"
def test_auto_mapper_split_by_delimiter_and_transform(
    spark_session: SparkSession,
) -> None:
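    # Splits last_name on "|" and transforms each element into a struct.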
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal|Bates", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]).complex(
            MyObject(my_column=A.transform(
                A.split_by_delimiter(A.column("last_name"), "|"),
                A.complex(bar=A.field("_"), bar2=A.field("_")),
            )))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["my_column"]) == str(
    #     split(col("b.last_name"), "[|]", -1).alias("my_column")
    # )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("my_column").collect()[0]
            [0][0]["bar"] == "Qureshi")

    assert (result_df.where("member_id == 2").select("my_column").collect()[0]
            [0][0]["bar"] == "Vidal")
    assert (result_df.where("member_id == 2").select("my_column").collect()[0]
            [0][1]["bar"] == "Bates")
def test_auto_mapper_complex_with_defined_class(
        spark_session: SparkSession) -> None:
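    # A user-defined complex class drives the casts: name to string, age to long.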
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(name=A.column("last_name"), age=A.number(A.column("my_age"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name"))
    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_map(spark_session: SparkSession) -> None:
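    # A.map with a default returns "unknown" for values missing from the mapping.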
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(has_kids=A.map(A.column("has_kids"), {"Y": "Yes", "N": "No"}, "unknown"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit("unknown"))
        .alias("___has_kids"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("has_kids").collect()[0][0] == "Yes"
    assert result_df.where("member_id == 2").select("has_kids").collect()[0][0] == "No"
    assert (
        result_df.where("member_id == 3").select("has_kids").collect()[0][0]
        == "unknown"
    )
def test_auto_mapper_array_multiple_items_with_null(
        spark_session: SparkSession) -> None:
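    # AutoMapperList drops the None entry, so only the two addresses survive.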
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False).columns(
            dst2=AutoMapperList(["address1", "address2", None]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["dst2"]) == str(
        filter(array(lit("address1"), lit("address2"), lit(None)),
               lambda x: x.isNotNull()).alias("dst2"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][1] == "address2"
    assert result_df.where("member_id == 2").select(
        "dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 2").select(
        "dst2").collect()[0][0][1] == "address2"
def test_automapper_concat_array(spark_session: SparkSession) -> None:
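    # concat appends a one-element array ("foo") to an existing array column.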
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        drop_key_columns=False).columns(
            age=A.column("identifier").concat(A.text("foo").to_array()))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        concat(col("b.identifier"),
               array(lit("foo").cast("string"))).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.where("id == 1730325416").select(
        "age").collect()[0][0] == [
            "bar",
            "foo",
        ]

    assert result_df.where("id == 1467734301").select(
        "age").collect()[0][0] == [
            "John",
            "foo",
        ]
def test_auto_mapper_amount(spark_session: SparkSession) -> None:
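    # A.amount casts the source string to float.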
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "67.67"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.amount(A.column("my_age")))

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("float").alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert approx(
        result_df.where("member_id == 1").select("age").collect()[0]
        [0]) == approx(54.45)
    assert approx(
        result_df.where("member_id == 2").select("age").collect()[0]
        [0]) == approx(67.67)

    assert dict(result_df.dtypes)["age"] == "float"
def test_auto_mapper_array_typed(spark_session: SparkSession) -> None:
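    # A.array wraps the source column in a one-element array.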
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "0"),
            (2, "Vidal", "Michael", "1"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df = source_df.withColumn("my_age", col("my_age").cast("boolean"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.array(A.column("my_age")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        array(col("b.my_age")).alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == [
        "0"
    ]
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == [
        "1"
    ]

    assert dict(result_df.dtypes)["age"] == "array<string>"
def test_auto_mapper_complex_with_mappers(spark_session: SparkSession) -> None:
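    # Nested A.complex calls produce nested struct columns.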
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ], ['member_id', 'last_name', 'first_name']
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False
    ).columns(dst2=A.complex(use="usual", family=A.complex(given="foo")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["dst2"]) == str(
        struct(
            expr("usual").alias("use"),
            struct(expr("foo").alias("given")).alias("family")
        ).alias("dst2")
    )

    result_df.printSchema()
    result_df.show()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1][0] == "foo"
def test_automapper_if_not_null(spark_session: SparkSession) -> None:
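    # A.if_not_null uses the column value when present, otherwise the fallback (here 100).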
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.if_not_null(
        A.column("my_age"),
        A.number(A.column("my_age")),
        A.number(A.text("100")),
    ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull(),
            lit("100").cast(StringType()).cast(IntegerType())).otherwise(
                col("b.my_age").cast(IntegerType())).alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] == "int"
def test_auto_mapper_date_column(spark_session: SparkSession) -> None:
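    # A.date on a string column tries multiple formats, so both ISO and US dates parse.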
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "12/31/2020"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["birthDate"]) == str(
        coalesce(to_date(col("b.date_of_birth"), format='y-M-d'),
                 to_date(col("b.date_of_birth"), format='yyyyMMdd'),
                 to_date(col("b.date_of_birth"),
                         format='M/d/y')).alias("birthDate"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "birthDate").collect()[0][0] == date(1970, 1, 1)
    assert result_df.where("member_id == 2").select(
        "birthDate").collect()[0][0] == date(2020, 12, 31)

    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_substring_by_delimiter(
    spark_session: SparkSession
) -> None:
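    # A.substring_by_delimiter maps to substring_index: text before the first "s", or the whole string if absent.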
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "1970-01-01"),
            (2, 'Vidal', 'Michael', "1970-02-02"),
        ], ['member_id', 'last_name', 'first_name', "date_of_birth"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.substring_by_delimiter(A.column("last_name"), "s", 1)
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["my_column"]) == str(
        substring_index(col("b.last_name"), "s", 1).alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("my_column"
                                                    ).collect()[0][0] == "Qure"
    assert result_df.where("member_id == 2"
                           ).select("my_column").collect()[0][0] == "Vidal"
def test_auto_mapper_regex_replace(spark_session: SparkSession) -> None:
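    # A.regex_replace substitutes every match of the pattern ("i" -> "f").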
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.regex_replace(A.column("last_name"), "i", "f"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        regexp_replace(col("b.last_name"), "i", "f").alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    # noinspection SpellCheckingInspection
    assert (result_df.where("member_id == 1").select("my_column").collect()[0]
            [0] == "Qureshf")
    # noinspection SpellCheckingInspection
    assert (result_df.where("member_id == 2").select("my_column").collect()[0]
            [0] == "Vfdal")
def test_auto_mapper_with_column_literal(spark_session: SparkSession) -> None:
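    # A bare string on the right-hand side is treated as a literal, not a column reference.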
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ], ['member_id', 'last_name', 'first_name']
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False
    ).columns(lname="last_name")

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["lname"]
               ) == str(lit("last_name").alias("lname"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1"
                           ).select("lname").collect()[0][0] == "last_name"
    assert result_df.where("member_id == 2"
                           ).select("lname").collect()[0][0] == "last_name"
def test_automapper_null_remover(spark_session: SparkSession) -> None:
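    # remove_null_or_empty drops null or empty entries from the nested address lines.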
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients").columns(address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line").select(
                        A.current().sanitize()).remove_null_or_empty(),
                )),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(
    #     filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age")
    # )
    result_df: DataFrame = mapper.transform(df=source_df)

    print(result_df.select("address").collect()[0][0])
    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]
    result_df.show(truncate=False)
def test_automapper_select_one(spark_session: SparkSession) -> None:
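    # filter plus select_one picks the first array element matching the predicate and projects one field.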
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"
def test_auto_mapper_with_filter(spark_session: SparkSession) -> None:
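    # filter_by restricts the mapped output rows with a SQL predicate.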
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        filter_by="left(last_name,2) == 'Vi'",
    ).columns(lname=A.column("last_name"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["lname"]) == str(
        col("b.last_name").alias("lname"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 1
    assert result_df.where("member_id == 2").select(
        "lname").collect()[0][0] == "Vidal"
def test_automapper_transform(spark_session: SparkSession) -> None:
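    # A.transform maps each array element to a struct of the selected fields.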
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(age=A.transform(
            A.column("identifier"),
            A.complex(bar=A.field("value"), bar2=A.field("system")),
        )))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            "b.identifier",
            lambda x: struct(
                col("x[value]").alias("bar"),
                col("x[system]").alias("bar2")),
        ).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0][0][0] == "123"
def test_auto_mapper_regex_extract(spark_session: SparkSession) -> None:
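    # A.regex_extract pulls capture group 1 (the four-digit year) from the date string.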
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1980-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.regex_extract(A.column("date_of_birth"), r"^(\d{4}).*", 1))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)

    assert_compare_expressions(
        sql_expressions["my_column"],
        regexp_extract(col("b.date_of_birth"), r"^(\d{4}).*", 1).alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0] == "1970"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0] == "1980"
    )
def test_auto_mapper_fhir_id(spark_session: SparkSession) -> None:
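    # FhirId replaces characters that are invalid in a FHIR id with hyphens.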
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi- / &&"),
            (2, " Vidal. "),
        ],
        ["member_id", "last_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(patient=Patient(id_=FhirId(A.column("last_name"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show(truncate=False)

    assert (
        result_df.where("member_id == 1").selectExpr("patient.id").collect()[0][0]
        == "Qureshi------"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("patient.id").collect()[0][0]
        == "-Vidal.-"
    )
def test_automapper_flatten(spark_session: SparkSession) -> None:
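    # A.flatten collapses an array of arrays into a single array.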
    clean_spark_session(spark_session)

    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"
    source_df = spark_session.createDataFrame([([[1], [2, 3, 4], [3, 5]],)], ["column"])
    source_df.printSchema()
    source_df.createOrReplaceTempView(source_view_name)

    # Act
    mapper = AutoMapper(view=result_view_name, source_view=source_view_name).columns(
        column_flat=A.flatten(A.column("column"))
    )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # assert
    assert result_df.select("column_flat").collect()[0][0] == [1, 2, 3, 4, 3, 5]
def test_auto_mapper_join_using_delimiter(spark_session: SparkSession) -> None:
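    # A.join_using_delimiter maps to array_join with the given separator.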
    # Arrange
    spark_session.createDataFrame(
        [
            (123456789, "Gagan", "Chawla", ["MD", "PhD"]),
        ],
        ["npi", "first_name", "last_name", "suffix"],
    ).createOrReplaceTempView("practitioners")

    source_df: DataFrame = spark_session.table("practitioners")

    df = source_df.select("npi")
    df.createOrReplaceTempView("physicians")

    # Act
    mapper = AutoMapper(
        view="physicians", source_view="practitioners", keys=["npi"]
    ).columns(my_column=A.join_using_delimiter(A.column("suffix"), ", "))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        array_join(col("b.suffix"), ", ").alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert (result_df.where("npi == 123456789").select("my_column").collect()
            [0][0] == "MD, PhD")
def test_automapper_copy_unmapped_properties(
        spark_session: SparkSession) -> None:
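    # copy_all_unmapped_properties copies source columns not explicitly mapped, minus the excluded ones.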
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            ("Qureshi", "Imran", "Iqbal"),
            ("Vidal", "Michael", "Lweis"),
        ],
        ["last_name", "first_name", "middle_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        copy_all_unmapped_properties=True,
        copy_all_unmapped_properties_exclude=["first_name"],
    ).columns(last_name="last_name")

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 2, list(result_df.columns)

    assert result_df.columns == ["last_name", "middle_name"]