# NOTE: These tests are excerpted from the spark_auto_mapper test suite; the
# imports below are a best-effort reconstruction for this listing. Test-local
# fixtures (clean_spark_session, assert_compare_expressions, MyObject,
# create_jsonl_files) and the FHIR/ElasticSearch resource classes used further
# down come from the project's test helpers and companion packages and are
# not reproduced here.
import datetime
from os import environ, path
from pathlib import Path
from shutil import rmtree
from typing import Dict, Union

from pyspark.sql import Column, DataFrame, SparkSession

# note: `filter` here shadows the builtin, as the expression asserts expect
from pyspark.sql.functions import col, exists, filter, lit, struct, transform
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A


def test_automapper_filter_and_transform(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
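    # Keep only identifier entries whose `use` is "usual", then reshape each
    # surviving element into a struct with fields `bar` and `bar2`.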
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(age=A.transform(
            A.filter(column=A.column("identifier"),
                     func=lambda x: x["use"] == lit("usual")),
            A.complex(bar=A.field("value"), bar2=A.field("system")))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

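    # The generated column spec should match a hand-built pyspark expression;
    # "b." appears to be the alias AutoMapper gives the source view.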
    assert str(sql_expressions["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")),
        ).alias("age")
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)


def test_automapper_field(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
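    # Pick a single element from the identifier array and read the nested
    # type.coding[0].code value out of it.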
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier").select_one(A.field("type.coding[0].code"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]
    #            ) == str(col("b.identifier[0]").alias("age"))
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "PRN"


def test_auto_mapper_split_by_delimiter_and_transform(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal|Bates", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
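    # Split last_name on "|" into an array, then map each piece to a struct;
    # A.field("_") appears to refer to the current array element.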
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        MyObject(
            my_column=A.transform(
                A.split_by_delimiter(A.column("last_name"), "|"),
                A.complex(bar=A.field("_"), bar2=A.field("_")),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["my_column"]) == str(
    #     split(col("b.last_name"), "[|]", -1).alias("my_column")
    # )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0][0]["bar"]
        == "Qureshi"
    )

    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0][0]["bar"]
        == "Vidal"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0][1]["bar"]
        == "Bates"
    )


def test_automapper_null_remover(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
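    # Only map address when it is present; within each address, sanitize the
    # line entries and drop any that end up null or empty.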
    mapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line")
                    .select(A.current().sanitize())
                    .remove_null_or_empty(),
                )
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(
    #     filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age")
    # )
    result_df: DataFrame = mapper.transform(df=source_df)

    print(result_df.select("address").collect()[0][0])
    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]
    result_df.show(truncate=False)


def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession,
) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    environ["LOGLEVEL"] = "DEBUG"

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
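    # Keep only the array1 entries whose nested array2 contains an element
    # with reference == "bar".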
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

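    # The expected expression mirrors nested_array_filter: an outer filter()
    # over array1 with an inner exists() over array2.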
    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"],
                lambda x: x["reference"] == lit("bar").cast("string"),
            ),
        ).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert (
        result_df.select("age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
    )


def test_automapper_select_one(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
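    # Filter the identifier array down to the us-npi system entry and take
    # the first matching value as a scalar.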
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"


def test_auto_mapper_fhir_period_uses_date(spark_session: SparkSession) -> None:
    data_dir: Path = Path(__file__).parent.joinpath("./")

    temp_folder = data_dir.joinpath("./temp")
    if path.isdir(temp_folder):
        rmtree(temp_folder)

    encounter_test_folder: Path = data_dir.joinpath("test_files").joinpath(
        "encounter.json")

    minified_json_path: Path = create_jsonl_files(
        src_file=encounter_test_folder,
        dst_folder=temp_folder.joinpath("minified_period"),
        dst_file_name="1.json",
    )

    df = spark_session.read.json(str(minified_json_path))
    df.createOrReplaceTempView("encounters")

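    # Map raw encounter rows to FHIR Encounter resources; per the flag names,
    # use_date_for coerces the listed period fields to dates and
    # copy_all_unmapped_properties carries the remaining source columns over.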
    mapper = AutoMapper(
        view="fhir_encounters",
        source_view="encounters",
        copy_all_unmapped_properties=True,
    ).complex(
        Encounter(
            use_date_for=["encounter.period.start", "encounter.period.end"],
            id_=FhirId(A.concat("pat", A.column("id"))),
            status=EncounterStatusCode(A.column("status")),
            class_=Coding(
                system=A.column("class.system"),
                code=ActEncounterCode(A.column("class.code")),
                display=A.column("class.display"),
            ),
            subject=Reference(
                display=A.column("subject.display"),
                reference=FhirReference(
                    resource="Patient",
                    column=A.concat(
                        "pat",
                        A.string_after_delimiter(A.column("subject.reference"),
                                                 "/"),
                    ),
                ),
            ),
            period=Period(start=A.column("period.start"),
                          end=A.column("period.end")),
            participant=FhirList(
                A.column("participant").select(  # type: ignore
                    EncounterParticipant(
                        individual=Reference[Union[Practitioner]](  # type: ignore
                            display=A.field("individual.display"),
                            reference=FhirReference(
                                resource="Practitioner",
                                column=A.concat(
                                    "pat",
                                    A.string_after_delimiter(
                                        A.field("individual.reference"), "/"
                                    ),
                                ),
                            ),
                        ),
                        type_=FhirList(
                            A.field("type").select(  # type: ignore
                                CodeableConcept(
                                    coding=FhirList(  # type: ignore
                                        A.field("coding").select(  # type: ignore
                                            Coding(
                                                system=A.field("system"),
                                                code=ParticipantTypeCode(A.field("code")),
                                                display=A.field("display"),
                                            )
                                        )
                                    ),
                                    text=A.field("text"),
                                )
                            )
                        ),
                        period=Period(start=A.field("period.start")),
                    )
                )
            ),
        ))

    assert isinstance(mapper, AutoMapper)
    result_df: DataFrame = mapper.transform(df=df)
    assert result_df
    fhir_encounters_df = df.sql_ctx.table("fhir_encounters")
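    # Because of use_date_for, period.start and period.end should come back
    # as datetime.date rather than datetime.datetime.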
    assert isinstance(
        fhir_encounters_df.select(
            fhir_encounters_df.period.start).collect()[0][0],
        datetime.date,
    )
    assert isinstance(
        fhir_encounters_df.select(
            fhir_encounters_df.period.end).collect()[0][0],
        datetime.date,
    )


def test_automapper_nested_array_filter_with_parent_column(
    spark_session: SparkSession,
) -> None:
    schema = StructType(
        [
            StructField("row_id", dataType=IntegerType(), nullable=False),
            StructField(
                "location",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                        ]
                    )
                ),
            ),
            StructField(
                "schedule",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                            StructField(
                                "actor",
                                ArrayType(
                                    StructType(
                                        [StructField("reference", StringType(), True)]
                                    ),
                                    True,
                                ),
                            ),
                        ]
                    )
                ),
            ),
            StructField(
                "single_level",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("reference", StringType(), True),
                        ]
                    )
                ),
            ),
        ]
    )
    spark_session.createDataFrame(
        [
            (
                1,
                [{"name": "location-100"}, {"name": "location-200"}],
                [
                    {
                        "name": "schedule-1",
                        "actor": [
                            {"reference": "location-100"},
                            {"reference": "practitioner-role-100"},
                        ],
                    },
                    {
                        "name": "schedule-2",
                        "actor": [
                            {"reference": "location-200"},
                            {"reference": "practitioner-role-200"},
                        ],
                    },
                ],
                [{"reference": "location-100"}, {"reference": "location-200"}],
            )
        ],
        schema,
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

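    # For each location, keep only the schedules whose actor array references
    # that location's name; "{parent}" appears to let the inner match reach
    # the enclosing location element.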
    mapper = AutoMapper(
        view="schedule", source_view="patients", keys=["row_id"]
    ).columns(
        location=A.column("location").select(
            AutoMapperElasticSearchLocation(
                name=A.field("name"),
                scheduling=A.nested_array_filter(
                    array_field=A.column("schedule"),
                    inner_array_field=A.field("actor"),
                    match_property="reference",
                    match_value=A.field("{parent}.name"),
                ).select_one(AutoMapperElasticSearchSchedule(name=A.field("name"))),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    print("------COLUMN SPECS------")
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    assert_compare_expressions(
        sql_expressions["location"],
        transform(
            col("b.location"),
            lambda l: (
                struct(
                    l["name"].alias("name"),
                    transform(
                        filter(
                            col("b.schedule"),
                            lambda s: exists(
                                s["actor"],
                                lambda a: a["reference"] == l["name"],  # type: ignore
                            ),
                        ),
                        lambda s: struct(s["name"].alias("name")),
                    )[0].alias("scheduling"),
                )
            ),
        ).alias("___location"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    # result_df.printSchema()
    # result_df.show(truncate=False)
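    # Each location should pair with exactly one schedule whose actor
    # references it: location-100 -> schedule-1, location-200 -> schedule-2.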
    location_row = result_df.collect()[0].location
    for index, location in enumerate(location_row):
        location_name = location.name
        location_scheduling = location.scheduling
        assert location_name == f"location-{index + 1}00"
        assert len(location_scheduling) == 1
        assert location_scheduling.name == f"schedule-{index + 1}"