Example #1
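The examples below come from one test suite, so the shared imports are consolidated here once instead of being repeated per example. The spark_auto_mapper module paths follow the library's usual layout but are assumptions, as are the bodies of the clean_spark_session and assert_compare_expressions test helpers sketched below; adjust both to match your installed version of the library.

from os import mkdir, path
from pathlib import Path
from shutil import rmtree
from typing import Any, Dict, List

from pyspark.sql import Column, DataFrame, SparkSession
# filter/transform as SQL higher-order functions require pyspark >= 3.1
from pyspark.sql.functions import (array, coalesce, col, expr, filter, lit,
                                   struct, transform, when)

# Assumed module paths inside the spark_auto_mapper package:
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.automappers.automapper_base import AutoMapperBase
from spark_auto_mapper.data_types.list import AutoMapperList
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A


def clean_spark_session(session: SparkSession) -> None:
    # Minimal sketch of the test helper: drop any temp views left over from a
    # previous test so each example starts with a clean catalog.
    for table in session.catalog.listTables():
        session.catalog.dropTempView(table.name)


def assert_compare_expressions(actual: Column, expected: Column) -> None:
    # Minimal sketch of the test helper: compare two Column expressions by
    # their string form.
    assert str(actual) == str(expected), f"{actual} != {expected}"
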
def mapping(parameters: Dict[str, Any]) -> List[AutoMapperBase]:
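    # Build two mappers from the "patients" source view into the destination
    # views named in `parameters`; the code that calls mapping() is assumed to
    # run the returned mappers.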
    # example of a variable
    client_address_variable: str = "address1"
    mapper = AutoMapper(view=parameters["view"],
                        source_view="patients",
                        keys=["member_id"]).columns(
                            patient_id=A.column("member_id"),
                            dst1="src1",
                            dst2=AutoMapperList([client_address_variable]),
                            dst3=AutoMapperList(
                                [client_address_variable, "address2"]),
                            dst4=AutoMapperList([
                                A.complex(use="usual",
                                          family=A.column("last_name"))
                            ]),
                        )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(dst5=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]))

    mapper2 = AutoMapper(view=parameters["view2"],
                         source_view="patients",
                         keys=["member_id"]).columns(
                             patient_id=A.column("member_id"),
                             dst1="src2",
                             dst22=AutoMapperList([client_address_variable]),
                         )

    return [mapper, mapper2]
Example #2
def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame([
        ('Qureshi', 'Imran'),
        ('Vidal', 'Michael'),
    ], ['last_name', 'first_name']).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]))

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]),
            dst5=AutoMapperList(
                [A.complex(use="usual", first=A.column("first_name"))]),
        )

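    # get_column_specs returns the Spark Column expression generated for each
    # mapped column without running the transform, which is useful for
    # inspecting what the mapper will do.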
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst5").collect()[0][0][0][1] == "Imran"
Example #3
def test_automapper_filter_and_transform(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
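    # Keep only the identifier entries whose `use` field equals "usual", then
    # reshape each surviving entry into a struct with fields bar (from value)
    # and bar2 (from system).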
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(age=A.transform(
            A.filter(column=A.column("identifier"),
                     func=lambda x: x["use"] == lit("usual")),
            A.complex(bar=A.field("value"), bar2=A.field("system")))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"),
                             x["system"].alias("bar2")),
        ).alias("age"))
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)
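
The MyObject type used above (and again in Example #8) is not defined in these examples. A minimal sketch, assuming it extends the library's complex-type base class (import path assumed) and simply forwards its keyword arguments:

from spark_auto_mapper.data_types.complex.complex_base import AutoMapperDataTypeComplexBase  # assumed path


class MyObject(AutoMapperDataTypeComplexBase):
    # Forward the mapped fields (age=..., my_column=...) to the base class,
    # which renders them as struct columns.
    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)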
Example #4
def test_auto_mapper_complex_with_mappers(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
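        # keep the member_id key column in the output instead of dropping it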
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family=A.complex(given="foo")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert_compare_expressions(
        sql_expressions["dst2"],
        struct(
            expr("usual").alias("use"),
            struct(expr("foo").alias("given")).alias("family"),
        ).alias("dst2"),
    )

    result_df.printSchema()
    result_df.show()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1][0] == "foo"
Example #5
def test_auto_mapper_array_single_item_with_mapper(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList([A.complex(addr="address1")]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(struct(lit("address1").alias("addr"))).isNotNull(),
            filter(
                coalesce(array(struct(lit("address1").alias("addr"))),
                         array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [0] == "address1")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            [0] == "address1")
Example #6
def test_auto_mapper_multiple_columns(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ],
        ['member_id', 'last_name', 'first_name'],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = (
        AutoMapper(
            view="members",
            source_view="patients",
            keys=["member_id"],
            drop_key_columns=False,
        )
        .columns(dst1="src1")
        .columns(dst2=AutoMapperList(["address1"]))
        .columns(dst3=AutoMapperList(["address1", "address2"]))
        .columns(dst4=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
Example #7
def test_auto_mapper_full_no_views(spark_session: SparkSession) -> None:
    # Arrange
    source_df = spark_session.createDataFrame([
        (1, 'Qureshi', 'Imran'),
        (2, 'Vidal', 'Michael'),
    ], ['member_id', 'last_name', 'first_name'])

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(keys=["member_id"], drop_key_columns=False).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]))

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(dst4=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
Example #8
def test_auto_mapper_split_by_delimiter_and_transform(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal|Bates", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
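    # Split last_name on "|" and map each resulting piece (addressed inside
    # the transform as A.field("_")) into a struct with fields bar and bar2.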
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        MyObject(
            my_column=A.transform(
                A.split_by_delimiter(A.column("last_name"), "|"),
                A.complex(bar=A.field("_"), bar2=A.field("_")),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["my_column"]) == str(
    #     split(col("b.last_name"), "[|]", -1).alias("my_column")
    # )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("my_column").collect()[0]
            [0][0]["bar"] == "Qureshi")

    assert (result_df.where("member_id == 2").select("my_column").collect()[0]
            [0][0]["bar"] == "Vidal")
    assert (result_df.where("member_id == 2").select("my_column").collect()[0]
            [0][1]["bar"] == "Bates")
Example #9
def test_auto_mapper_complex(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ],
        ['member_id', 'last_name', 'first_name'],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"],
                        drop_key_columns=False).columns(
                            dst2=A.complex(use="usual", family="imran"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["dst2"]) == str(
        struct(lit("usual").alias("use"),
               lit("imran").alias("family")).alias("dst2"))

    result_df.printSchema()
    result_df.show()

    result_df.where("member_id == 1").select("dst2").show()
    result_df.where("member_id == 1").select("dst2").printSchema()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1] == "imran"
Example #10
def test_auto_mapper_full(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "First"),
            (2, "Vidal", "Michael", "Second"),
        ],
        ["member_id", "last_name", "first_name", "class"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
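        # the trailing underscore avoids the Python keyword; the mapper strips
        # it and writes the column as "class" (asserted below)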
        class_=A.column("class"),
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(dst4=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 6
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")

    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][1]
            == "address2")

    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [0] == "usual")
    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [1] == "Qureshi")

    assert result_df.columns[4] == "class"
    assert result_df.where("member_id == 1").select(
        "class").collect()[0][0] == "First"
Example #11
def test_automapper_full_checkpointing(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    data_dir: Path = Path(__file__).parent.joinpath('./')

    temp_folder = data_dir.joinpath('./temp')
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    mkdir(temp_folder)

    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ], ['member_id', 'last_name', 'first_name']
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
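        # assumed semantics: checkpoint intermediate results to temp_folder
        # after every 2 mapped columns to keep the Spark query plan small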
        checkpoint_after_columns=2,
        checkpoint_path=temp_folder
    ).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"])
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select("dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0] == "address1"

    assert result_df.where("member_id == 1").select("dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select("dst3").collect()[0][0][1] == "address2"

    assert result_df.where("member_id == 1").select("dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select("dst4").collect()[0][0][0][1] == "Qureshi"
Example #12
def test_auto_mapper_columns(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList(["address1"]),
        dst3=AutoMapperList(["address1", "address2"]),
        dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))]),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Assert
    assert len(sql_expressions) == 4
    assert str(sql_expressions["dst1"]) == str(lit("src1").alias("dst1"))
    assert str(sql_expressions["dst2"]) == str(
        filter(array(lit("address1")), lambda x: x.isNotNull()).alias("dst2")
    )
    assert str(sql_expressions["dst3"]) == str(
        filter(array(lit("address1"), lit("address2")), lambda x: x.isNotNull()).alias(
            "dst3"
        )
    )
    assert str(sql_expressions["dst4"]) == str(
        filter(
            array(
                struct(lit("usual").alias("use"), col("b.last_name").alias("family"))
            ),
            lambda x: x.isNotNull(),
        ).alias("dst4")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select("dst1").collect()[0][0] == "src1"
    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
        == "address1"
    )

    assert (
        result_df.where("member_id == 1").select("dst3").collect()[0][0][0]
        == "address1"
    )
    assert (
        result_df.where("member_id == 1").select("dst3").collect()[0][0][1]
        == "address2"
    )

    assert (
        result_df.where("member_id == 1").select("dst4").collect()[0][0][0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 1").select("dst4").collect()[0][0][0][1]
        == "Qureshi"
    )