def test_automapper_filter_and_transform(spark_session: SparkSession) -> None:
    """A.filter + A.transform should compile to the equivalent Spark filter/transform expression."""
    clean_spark_session(spark_session)

    # Arrange: load the shared patients fixture from data.json next to this test.
    json_path: Path = Path(__file__).parent.joinpath("./").joinpath("data.json")
    patients_df: DataFrame = spark_session.read.json(str(json_path), multiLine=True)
    patients_df.createOrReplaceTempView("patients")
    patients_df.show(truncate=False)

    # Act: keep only identifiers whose `use` is "usual", then reshape each
    # surviving element into a {bar, bar2} struct.
    automapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.transform(
                A.filter(
                    column=A.column("identifier"),
                    func=lambda x: x["use"] == lit("usual"),
                ),
                A.complex(bar=A.field("value"), bar2=A.field("system")),
            )
        )
    )
    assert isinstance(automapper, AutoMapper)

    column_specs: Dict[str, Column] = automapper.get_column_specs(source_df=patients_df)
    for column_name, sql_expression in column_specs.items():
        print(f"{column_name}: {sql_expression}")

    # Assert: the generated spec matches the hand-written Spark equivalent.
    expected_age = transform(
        filter("b.identifier", lambda x: x["use"] == lit("usual")),
        lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")),
    ).alias("age")
    assert str(column_specs["age"]) == str(expected_age)

    result_df: DataFrame = automapper.transform(df=patients_df)
    result_df.show(truncate=False)
def test_automapper_field(spark_session: SparkSession) -> None:
    """select_one + A.field should extract a single nested value from an array column."""
    clean_spark_session(spark_session)

    # Arrange: load the shared patients fixture from data.json next to this test.
    json_path: Path = Path(__file__).parent.joinpath("./").joinpath("data.json")
    patients_df: DataFrame = spark_session.read.json(str(json_path), multiLine=True)
    patients_df.createOrReplaceTempView("patients")
    patients_df.show(truncate=False)

    # Act: map "age" to the first identifier's type.coding[0].code.
    automapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier").select_one(A.field("type.coding[0].code"))
    )
    assert isinstance(automapper, AutoMapper)

    column_specs: Dict[str, Column] = automapper.get_column_specs(source_df=patients_df)
    for column_name, sql_expression in column_specs.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = automapper.transform(df=patients_df)
    result_df.show(truncate=False)

    # Assert: the first row's extracted code value.
    assert result_df.select("age").collect()[0][0] == "PRN"
def test_auto_mapper_split_by_delimiter_and_transform(
    spark_session: SparkSession,
) -> None:
    """split_by_delimiter + transform should produce one struct per delimited token."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal|Bates", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")

    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act: split last_name on "|" and wrap every token into a {bar, bar2} struct
    # (A.field("_") refers to the current array element).
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        MyObject(
            my_column=A.transform(
                A.split_by_delimiter(A.column("last_name"), "|"),
                A.complex(bar=A.field("_"), bar2=A.field("_")),
            )
        )
    )
    assert isinstance(automapper, AutoMapper)

    sql_expressions: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0][0]["bar"]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0][0]["bar"]
        == "Vidal"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0][1]["bar"]
        == "Bates"
    )
def test_automapper_null_remover(spark_session: SparkSession) -> None:
    """remove_null_or_empty should drop null/empty entries from a nested address.line array."""
    clean_spark_session(spark_session)

    # Arrange: load the shared patients fixture from data.json next to this test.
    json_path: Path = Path(__file__).parent.joinpath("./").joinpath("data.json")
    patients_df: DataFrame = spark_session.read.json(str(json_path), multiLine=True)
    patients_df.createOrReplaceTempView("patients")
    patients_df.show(truncate=False)

    # Act: for each address, sanitize every line and strip null/empty results,
    # guarding both the address array and its line array with if_not_null.
    automapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line")
                    .select(A.current().sanitize())
                    .remove_null_or_empty(),
                )
            ),
        )
    )
    assert isinstance(automapper, AutoMapper)

    sql_expressions: Dict[str, Column] = automapper.get_column_specs(source_df=patients_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = automapper.transform(df=patients_df)

    # Assert: the first patient's first address keeps exactly the two
    # non-empty, sanitized lines.
    print(result_df.select("address").collect()[0][0])
    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]
    result_df.show(truncate=False)
def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession,
) -> None:
    """nested_array_filter should keep outer elements whose inner array has a matching reference."""
    clean_spark_session(spark_session)
    environ["LOGLEVEL"] = "DEBUG"

    # Arrange: load the shared patients fixture from data.json next to this test.
    json_path: Path = Path(__file__).parent.joinpath("./").joinpath("data.json")
    patients_df: DataFrame = spark_session.read.json(str(json_path), multiLine=True)
    patients_df.createOrReplaceTempView("patients")
    patients_df.show(truncate=False)

    # Act: keep array1 entries whose array2 contains a reference equal to "bar".
    automapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        )
    )
    assert isinstance(automapper, AutoMapper)

    sql_expressions: Dict[str, Column] = automapper.get_column_specs(source_df=patients_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # The generated spec must be a filter-with-exists over the nested arrays.
    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"],
                lambda x: x["reference"] == lit("bar").cast("string"),
            ),
        ).alias("age"),
    )

    result_df: DataFrame = automapper.transform(df=patients_df)

    # Assert: first row has no match (empty result), second row matches "bar".
    result_df.printSchema()
    result_df.show(truncate=False)
    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert result_df.select("age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
def test_automapper_select_one(spark_session: SparkSession) -> None:
    """filter(...).select_one should yield the first matching element's value."""
    clean_spark_session(spark_session)

    # Arrange: load the shared patients fixture from data.json next to this test.
    json_path: Path = Path(__file__).parent.joinpath("./").joinpath("data.json")
    patients_df: DataFrame = spark_session.read.json(str(json_path), multiLine=True)
    patients_df.createOrReplaceTempView("patients")
    patients_df.show(truncate=False)

    # Act: pick the value of the first identifier whose system is the NPI system.
    automapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )
    assert isinstance(automapper, AutoMapper)

    sql_expressions: Dict[str, Column] = automapper.get_column_specs(source_df=patients_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # The generated spec must be transform(filter(...), value)[0].
    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )

    result_df: DataFrame = automapper.transform(df=patients_df)
    result_df.show(truncate=False)

    # Assert: each patient row resolves to its NPI value.
    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"
def test_auto_mapper_hir_period_uses_date(spark_session: SparkSession) -> None:
    """Map raw encounter JSON to a FHIR Encounter and verify that the
    `use_date_for` option makes period.start/period.end come out as dates."""
    # Arrange: rebuild a clean ./temp folder, then convert the multi-object
    # test file into JSONL so Spark can read it line-by-line.
    data_dir: Path = Path(__file__).parent.joinpath("./")
    temp_folder: Path = data_dir.joinpath("./temp")
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    encounter_test_folder: Path = data_dir.joinpath("test_files").joinpath(
        "encounter.json")
    minified_json_path: Path = create_jsonl_files(
        src_file=encounter_test_folder,
        dst_folder=temp_folder.joinpath("minified_period"),
        dst_file_name="1.json",
    )
    df = spark_session.read.json(str(minified_json_path))
    df.createOrReplaceTempView("encounters")
    # Act: build the Encounter mapping.  use_date_for forces the two period
    # fields to be treated as dates instead of datetimes (the behavior under
    # test); copy_all_unmapped_properties passes through everything else.
    mapper = AutoMapper(
        view="fhir_encounters",
        source_view="encounters",
        copy_all_unmapped_properties=True,
    ).complex(
        Encounter(
            use_date_for=["encounter.period.start", "encounter.period.end"],
            # FhirId sanitizes the concatenated id into a valid FHIR id.
            id_=FhirId(A.concat("pat", A.column("id"))),
            status=EncounterStatusCode(A.column("status")),
            class_=Coding(
                system=A.column("class.system"),
                code=ActEncounterCode(A.column("class.code")),
                display=A.column("class.display"),
            ),
            subject=Reference(
                display=A.column("subject.display"),
                reference=FhirReference(
                    resource="Patient",
                    # "Patient/123" -> take "123" and prefix with "pat".
                    column=A.concat(
                        "pat",
                        A.string_after_delimiter(A.column("subject.reference"),
                                                 "/"),
                    ),
                ),
            ),
            period=Period(start=A.column("period.start"),
                          end=A.column("period.end")),
            # One EncounterParticipant per element of the source participant array.
            participant=FhirList(
                A.column("participant").select(  # type: ignore
                    EncounterParticipant(
                        individual=Reference[Union[Practitioner]]
                        (  # type: ignore
                            display=A.field("individual.display"),
                            reference=FhirReference(
                                resource="Practitioner",
                                column=A.concat(
                                    "pat",
                                    A.string_after_delimiter(
                                        A.field("individual.reference"), "/"),
                                ),
                            ),
                        ),
                        type_=FhirList(
                            A.field("type").select(  # type: ignore
                                CodeableConcept(
                                    coding=FhirList(  # type: ignore
                                        A.field(
                                            "coding").select(  # type: ignore
                                                Coding(
                                                    system=A.field("system"),
                                                    code=ParticipantTypeCode(
                                                        A.field("code")),
                                                    display=A.field("display"),
                                                ))),
                                    text=A.field("text"),
                                ),
                            )),
                        # Only start is mapped here; participant period end is
                        # not present in the fixture, presumably — TODO confirm.
                        period=Period(start=A.field("period.start"), ),
                    ),
                ),
            ),
        ))
    assert isinstance(mapper, AutoMapper)
    result_df: DataFrame = mapper.transform(df=df)
    assert result_df
    # Assert: read the materialized view back and check that both period
    # bounds are datetime.date instances (not timestamps).
    fhir_encounters_df = df.sql_ctx.table("fhir_encounters")
    assert isinstance(
        fhir_encounters_df.select(
            fhir_encounters_df.period.start).collect()[0][0],
        datetime.date,
    )
    assert isinstance(
        fhir_encounters_df.select(
            fhir_encounters_df.period.end).collect()[0][0],
        datetime.date,
    )
def test_automapper_nested_array_filter_with_parent_column(
    spark_session: SparkSession,
) -> None:
    """nested_array_filter with a "{parent}" match_value: filter an inner array
    by a field of the element currently being iterated in the outer select."""
    # Arrange: explicit schema — locations, schedules (each with an actor
    # array), and an unused single-level array kept for schema coverage.
    schema = StructType(
        [
            StructField("row_id", dataType=IntegerType(), nullable=False),
            StructField(
                "location",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                        ]
                    )
                ),
            ),
            StructField(
                "schedule",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                            StructField(
                                "actor",
                                ArrayType(
                                    StructType(
                                        [StructField("reference", StringType(), True)]
                                    ),
                                    True,
                                ),
                            ),
                        ]
                    )
                ),
            ),
            StructField(
                "single_level",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("reference", StringType(), True),
                        ]
                    )
                ),
            ),
        ]
    )
    # One row: each location-N00 is referenced by exactly one schedule-N actor,
    # so the filter below should match one schedule per location.
    spark_session.createDataFrame(
        [
            (
                1,
                [{"name": "location-100"}, {"name": "location-200"}],
                [
                    {
                        "name": "schedule-1",
                        "actor": [
                            {"reference": "location-100"},
                            {"reference": "practitioner-role-100"},
                        ],
                    },
                    {
                        "name": "schedule-2",
                        "actor": [
                            {"reference": "location-200"},
                            {"reference": "practitioner-role-200"},
                        ],
                    },
                ],
                [{"reference": "location-100"}, {"reference": "location-200"}],
            )
        ],
        schema,
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    # Act: per location, pick the single schedule whose actor array contains a
    # reference equal to that location's name ("{parent}.name" refers to the
    # enclosing location element of the outer select).
    mapper = AutoMapper(
        view="schedule", source_view="patients", keys=["row_id"]
    ).columns(
        location=A.column("location").select(
            AutoMapperElasticSearchLocation(
                name=A.field("name"),
                scheduling=A.nested_array_filter(
                    array_field=A.column("schedule"),
                    inner_array_field=A.field("actor"),
                    match_property="reference",
                    match_value=A.field("{parent}.name"),
                ).select_one(AutoMapperElasticSearchSchedule(name=A.field("name"))),
            )
        )
    )
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    print("------COLUMN SPECS------")
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    # Assert the generated expression: transform over locations, with a
    # filter+exists over schedules keyed on the outer lambda variable.
    # NOTE(review): the expected alias is "___location" rather than
    # "location" — presumably the mapper's internal column prefix; confirm
    # against AutoMapper's column-naming convention.
    assert_compare_expressions(
        sql_expressions["location"],
        transform(
            col("b.location"),
            lambda l: (
                struct(
                    l["name"].alias("name"),
                    transform(
                        filter(
                            col("b.schedule"),
                            lambda s: exists(
                                s["actor"],
                                lambda a: a["reference"] == l["name"],  # type: ignore
                            ),
                        ),
                        lambda s: struct(s["name"].alias("name")),
                    )[0].alias("scheduling"),
                )
            ),
        ).alias("___location"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)
    # Assert
    # result_df.printSchema()
    # result_df.show(truncate=False)
    location_row = result_df.collect()[0].location
    for index, location in enumerate(location_row):
        location_name = location.name
        location_scheduling = location.scheduling
        assert location_name == f"location-{index + 1}00"
        # select_one yields a single struct; len() here counts its fields —
        # presumably just "name" — TODO confirm.
        assert len(location_scheduling) == 1
        assert location_scheduling.name == f"schedule-{index + 1}"