def test_auto_mapper_datetime_regex_replace_format(
    spark_session: SparkSession,
) -> None:
    """Zero-pads single-digit month values via regex before parsing dates."""
    # Arrange: dates with one- and two-digit month components
    spark_session.createDataFrame(
        [
            (1, "1/13/1995"),
            (2, "1/3/1995"),
            (3, "11/3/1995"),
        ],
        ["member_id", "opening_date"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")

    # Act: prefix any lone digit before a slash with "0", then parse/format
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        formatted_date=A.datetime(
            value=A.regex_replace(
                A.column("opening_date"),
                pattern=r"\b(\d)(?=/)",
                replacement="0$1",
            ),
            formats=["M/dd/yyyy"],
        ).to_date_format("yyyy-M-dd")
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=src_df)

    # Assert: day keeps zero-padding, month keeps minimal width
    expected = {1: "1995-1-13", 2: "1995-1-03", 3: "1995-11-03"}
    for member_id, value in expected.items():
        assert (
            out_df.where(f"member_id == {member_id}")
            .select("formatted_date")
            .collect()[0][0]
            == value
        )
def test_automapper_flatten(spark_session: SparkSession) -> None:
    """Flattens a nested list column into a single-level list."""
    clean_spark_session(spark_session)
    src_view = "cascaded_list_view"
    dst_view = "flatten_list_view"
    src_df = spark_session.createDataFrame(
        [([[1], [2, 3, 4], [3, 5]],)], ["column"]
    )
    src_df.createOrReplaceTempView(src_view)

    # Act
    automapper = AutoMapper(view=dst_view, source_view=src_view).columns(
        column_flat=A.flatten(A.column("column"))
    )
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=src_df)

    # Assert: element order preserved, duplicates kept
    assert out_df.select("column_flat").collect()[0][0] == [1, 2, 3, 4, 3, 5]
def test_automapper_transform(spark_session: SparkSession) -> None:
    """A.transform maps each array element into a struct of selected fields."""
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")
    src_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    src_df.createOrReplaceTempView("patients")
    src_df.show(truncate=False)

    # Act
    automapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.transform(
                A.column("identifier"),
                A.complex(bar=A.field("value"), bar2=A.field("system")),
            )
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    # Generated spec should be the equivalent pyspark transform() expression
    assert str(specs["age"]) == str(
        transform(
            "b.identifier",
            lambda x: struct(
                col("x[value]").alias("bar"), col("x[system]").alias("bar2")
            ),
        ).alias("age")
    )

    out_df: DataFrame = automapper.transform(df=src_df)
    out_df.show(truncate=False)

    assert out_df.select("age").collect()[0][0][0][0] == "123"
def test_auto_mapper_handles_duplicates(spark_session: SparkSession) -> None:
    """Rows that map to identical destination values are collapsed."""
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Qureshi", "Imran"),
            (3, "Qureshi", "Imran2"),
            (4, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        dst1="src1",
        dst2=A.column("last_name"),
        dst3=A.column("first_name"),
    )
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    automapper.transform(df=src_df)
    out_df: DataFrame = spark_session.table("members")

    # Assert: rows 1 and 2 produce the same output row, so only 3 remain
    out_df.printSchema()
    out_df.show()
    assert out_df.count() == 3
def test_automapper_filter(spark_session: SparkSession) -> None:
    """A.filter keeps only array elements that satisfy the predicate."""
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")
    src_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    src_df.createOrReplaceTempView("patients")
    src_df.show(truncate=False)

    # Act
    automapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.filter(
            column=A.column("identifier"),
            func=lambda x: x["use"] == lit("usual"),
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    # Spec should match pyspark's higher-order filter() expression
    assert_compare_expressions(
        specs["age"],
        filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age"),
    )

    out_df: DataFrame = automapper.transform(df=src_df)
    out_df.show(truncate=False)
def test_automapper_field(spark_session: SparkSession) -> None:
    """select_one with A.field pulls a nested value out of an array of structs."""
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")
    src_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    src_df.createOrReplaceTempView("patients")
    src_df.show(truncate=False)

    # Act
    automapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier").select_one(A.field("type.coding[0].code"))
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=src_df)
    out_df.show(truncate=False)

    assert out_df.select("age").collect()[0][0] == "PRN"
def test_auto_mapper_amount(spark_session: SparkSession) -> None:
    """A.amount casts strings to double and maps literal None to a typed null."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "67.67"),
            (3, "Alex", "Hearn", "1286782.17"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.amount(A.column("my_age")),
        null_col=A.amount(AutoMapperDataTypeLiteral(None)),
    )
    debug_text: str = automapper.to_debug_string()
    print(debug_text)
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["age"], col("b.my_age").cast("double").alias("age")
    )
    assert_compare_expressions(
        specs["null_col"], lit(None).cast("double").alias("null_col")
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    row1 = out_df.where("member_id == 1").select("age", "null_col").collect()[0][:]
    assert approx(row1) == (approx(54.45), None)
    row2 = out_df.where("member_id == 2").select("age", "null_col").collect()[0][:]
    assert approx(row2) == (approx(67.67), None)
    # Exact string match guards against float representation drift
    assert (
        str(out_df.where("member_id == 3").select("age").collect()[0][0])
        == "1286782.17"
    )
    assert dict(out_df.dtypes)["age"] == "double"
    assert dict(out_df.dtypes)["null_col"] == "double"
def test_automapper_flatten_with_null(spark_session: SparkSession) -> None:
    """Flatten handles a null inner array: output is the flat non-null values."""
    clean_spark_session(spark_session)
    src_view = "cascaded_list_view"
    dst_view = "flatten_list_view"
    schema = StructType(
        [
            StructField(
                "column",
                ArrayType(elementType=ArrayType(elementType=IntegerType())),
            )
        ]
    )
    src_df = spark_session.createDataFrame(
        [([[1], [2, 3, 4], [3, 5], None],)], schema=schema
    )
    src_df.printSchema()
    src_df.createOrReplaceTempView(src_view)

    # Act
    automapper = AutoMapper(view=dst_view, source_view=src_view).columns(
        column_flat=A.flatten(A.column("column"))
    )
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=src_df)

    # Assert
    assert out_df.select("column_flat").collect()[0][0] == [1, 2, 3, 4, 3, 5]
def test_automapper_complex_with_skip_if_null(
    spark_session: SparkSession,
) -> None:
    """Rows whose skip columns are null or empty are dropped from the output."""
    # Arrange: member 2 has an empty first_name and should be skipped
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=True,
        skip_if_columns_null_or_empty=["first_name"],
    ).complex(
        MyClass(
            id_=A.column("member_id"),
            name=A.column("last_name"),
            age=A.number(A.column("my_age")),
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert: every mapped column is wrapped in a null/empty guard
    assert str(specs["name"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None),
        )
        .otherwise(col("b.last_name"))
        .cast(StringType())
        .alias("name")
    )
    assert str(specs["age"]) == str(
        when(
            col("b.first_name").isNull() | col("b.first_name").eqNullSafe(""),
            lit(None),
        )
        .otherwise(col("b.my_age"))
        .cast(LongType())
        .alias("age")
    )

    out_df.printSchema()
    out_df.show()

    assert out_df.count() == 1
    assert out_df.where("id == 1").select("name").collect()[0][0] == "Qureshi"
    assert dict(out_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    """Mapping without keys still builds list and complex columns."""
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            ("Qureshi", "Imran"),
            ("Vidal", "Michael"),
        ],
        ["last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    automapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
    )

    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        automapper = automapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            ),
            dst5=AutoMapperList(
                [A.complex(use="usual", first=A.column("first_name"))]
            ),
        )

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    automapper.transform(df=src_df)
    out_df: DataFrame = spark_session.table("members")

    # Assert
    out_df.printSchema()
    out_df.show()

    assert len(out_df.columns) == 5, list(out_df.columns)
    qureshi = out_df.where("dst4[0].family == 'Qureshi'")
    assert qureshi.select("dst1").collect()[0][0] == "src1"
    assert qureshi.select("dst2").collect()[0][0][0] == "address1"
    assert qureshi.select("dst3").collect()[0][0][0] == "address1"
    assert qureshi.select("dst3").collect()[0][0][1] == "address2"
    assert qureshi.select("dst4").collect()[0][0][0][0] == "usual"
    assert qureshi.select("dst4").collect()[0][0][0][1] == "Qureshi"
    assert qureshi.select("dst5").collect()[0][0][0][1] == "Imran"
def test_automapper_map(spark_session: SparkSession) -> None:
    """A.map translates values via a lookup table with a default for misses."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
            (4, "Qureshi", "Imran", None),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act: None key handles null input; "unknown" is the fall-through
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        has_kids=A.map(
            A.column("has_kids"),
            {None: "Unspecified", "Y": "Yes", "N": "No"},
            "unknown",
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit(None)), lit("Unspecified"))
        .when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit("unknown"))
        .alias("___has_kids"),
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    expected = {1: "Yes", 2: "No", 3: "unknown", 4: "Unspecified"}
    for member_id, value in expected.items():
        assert (
            out_df.where(f"member_id == {member_id}")
            .select("has_kids")
            .collect()[0][0]
            == value
        )
def test_auto_mapper_datetime_column_default(
    spark_session: SparkSession,
) -> None:
    """A.unix_timestamp converts epoch seconds from a column or a literal."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "18922"),
            (2, "Vidal", "Michael", "1609390500"),
        ],
        ["member_id", "last_name", "first_name", "ts"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        timestamp=A.unix_timestamp(A.column("ts")),
        literal_val=A.unix_timestamp("1609390500"),
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert str(specs["literal_val"]) == str(
        to_timestamp(
            from_unixtime("1609390500", "yyyy-MM-dd HH:mm:ss"),
            "yyyy-MM-dd HH:mm:ss",
        ).alias("literal_val")
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    assert out_df.count() == 2
    assert out_df.where("member_id == 1").select("timestamp").collect()[0][
        0
    ] == datetime(1970, 1, 1, 5, 15, 22)
    assert out_df.where("member_id == 2").select("timestamp").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)
    # The literal resolves to the same instant for every row
    assert out_df.where("member_id == 1").select("literal_val").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)
    assert out_df.where("member_id == 2").select("literal_val").collect()[0][
        0
    ] == datetime(2020, 12, 31, 4, 55, 0)
    assert dict(out_df.dtypes)["timestamp"] == "timestamp"
    assert dict(out_df.dtypes)["literal_val"] == "timestamp"
def test_auto_mapper_number(spark_session: SparkSession) -> None:
    """A.number casts strings to integer types and None to a typed null."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
            (3, "Old", "Methusela", "131026061001"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.number(A.column("my_age")),
        null_field=A.number(AutoMapperDataTypeLiteral(None)),
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    # Either int or long is acceptable for the generated cast
    assert str(specs["age"]) in (
        str(col("b.my_age").cast("int").alias("age")),
        str(col("b.my_age").cast("long").alias("age")),
    )
    assert str(specs["null_field"]) == str(
        lit(None).cast("long").alias("null_field")
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    assert out_df.where("member_id == 1").select("age").collect()[0][0] == 54
    assert out_df.where("member_id == 2").select("age").collect()[0][0] == 67
    # Value beyond 32-bit range must survive without overflow
    assert (
        out_df.where("member_id == 3").select("age").collect()[0][0]
        == 131026061001
    )
    assert (
        out_df.where("member_id == 1").select("null_field").collect()[0][0]
        is None
    )
    assert dict(out_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_array_multiple_items_structs(
    spark_session: SparkSession,
) -> None:
    """A list of complex items keeps null properties when requested."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df: DataFrame = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst2=AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(
                    a=A.column("first_name"), b=A.column("last_name")
                ),
                AutoMapperDataTypeComplexBase(a=A.column("first_name"), b=None),
            ],
            include_null_properties=True,
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    first_item = out_df.where("member_id == 1").select("dst2").collect()[0][0][0]
    assert first_item[0] == "Imran"
    assert first_item[1] == "Qureshi"
    second_item = out_df.where("member_id == 2").select("dst2").collect()[0][0][0]
    assert second_item[0] == "Michael"
    assert second_item[1] is None
def test_auto_mapper_coalesce(spark_session: SparkSession) -> None:
    """A.coalesce returns the first non-null of its candidate values."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", None),
            (2, None, "Michael", "1970-02-02"),
            (3, None, "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act: fall back from last_name to date_of_birth to a literal
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.coalesce(
            A.column("last_name"), A.column("date_of_birth"), A.text("last_resort")
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["my_column"],
        coalesce(
            col("b.last_name"),
            col("b.date_of_birth"),
            lit("last_resort").cast(StringType()),
        ).alias("my_column"),
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    expected = {1: "Qureshi", 2: "1970-02-02", 3: "last_resort"}
    for member_id, value in expected.items():
        assert (
            out_df.where(f"member_id == {member_id}")
            .select("my_column")
            .collect()[0][0]
            == value
        )
def test_automapper_if_not_null_or_empty(spark_session: SparkSession) -> None:
    """if_not_null_or_empty substitutes a default for null or empty values."""
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", ""),
            (3, "Vidal3", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    src_df.show()
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.if_not_null_or_empty(
            A.column("my_age"), A.column("my_age"), A.text("100")
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert str(specs["age"]) == str(
        when(
            col("b.my_age").isNull() | col("b.my_age").eqNullSafe(""),
            lit("100").cast(StringType()),
        )
        .otherwise(col("b.my_age"))
        .alias("age")
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    expected = {1: "54", 2: "100", 3: "100"}
    for member_id, value in expected.items():
        assert (
            out_df.where(f"member_id == {member_id}").select("age").collect()[0][0]
            == value
        )
    assert dict(out_df.dtypes)["age"] == "string"
def test_auto_mapper_cast(spark_session: SparkSession) -> None:
    """.cast() coerces columns to AutoMapper number and list types."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    # add an empty array column and refresh the view so the mapper sees it
    src_df = src_df.withColumn("an_array", array())
    src_df.createOrReplaceTempView("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(
        MyClass(
            name=A.column("last_name"),
            age=A.column("my_age").cast(AutoMapperNumberDataType),
            my_array=A.column("an_array").cast(
                AutoMapperList[AutoMapperNumberDataType]
            ),
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    assert str(specs["name"]) == str(
        col("b.last_name").cast("string").alias("name")
    )
    assert str(specs["age"]) == str(col("b.my_age").cast("long").alias("age"))

    out_df.printSchema()
    out_df.show()

    assert (
        out_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"
    )
    assert dict(out_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_fhir_group_resource(spark_session: SparkSession) -> None:
    """Builds a complete FHIR Group resource from a practitioner row."""
    spark_session.createDataFrame(
        [(1, "practitioner", "affiliated practitioner", 2)],
        ["practitioner_id", "type", "name", "affiliated_id"],
    ).createOrReplaceTempView("groups")
    src_df: DataFrame = spark_session.table("groups")
    keys_df = src_df.select("practitioner_id")
    keys_df.createOrReplaceTempView("view_group")

    automapper = AutoMapper(
        view="view_group", source_view="groups", keys=["practitioner_id"]
    ).complex(
        Group(
            id_=FhirId(A.column("practitioner_id")),
            meta=Meta(source="http://medstarhealth.org/provider"),
            identifier=FhirList(
                [
                    Identifier(
                        value=A.column("practitioner_id"),
                        type_=CodeableConcept(
                            coding=FhirList(
                                [
                                    Coding(
                                        system=IdentifierTypeCodesCode.codeset,
                                        code=IdentifierTypeCodesCode(
                                            A.text("PractitionerAffiliation")
                                        ),
                                    )
                                ]
                            )
                        ),
                        system="http://medstarhealth.org",
                    )
                ]
            ),
            type_=GroupTypeCodeValues.Practitioner,
            actual=True,
            name=A.text("Medstar Affiliated Practitioner"),
            member=FhirList(
                [
                    GroupMember(
                        entity=Reference(
                            reference=FhirReference(
                                "Practitioner",
                                A.column("affiliated_id"),
                            )
                        ),
                        # inactive=False,
                    ),
                ]
            ),
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=keys_df)
    out_df.printSchema()
    out_df.show(truncate=False)
def test_auto_mapper_decimal(spark_session: SparkSession) -> None:
    """A.decimal casts to decimal(10,2), rounding and zero-padding as needed."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "123467.678"),
            (3, "Paul", "Kyle", "13"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.decimal(A.column("my_age"), 10, 2))
    debug_text: str = automapper.to_debug_string()
    print(debug_text)
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["age"], col("b.my_age").cast("decimal(10,2)").alias("age")
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert: 123467.678 rounds to .68, 13 pads to 13.00
    out_df.printSchema()
    out_df.show()

    expected = {
        1: Decimal("54.45"),
        2: Decimal("123467.68"),
        3: Decimal("13.00"),
    }
    for member_id, value in expected.items():
        assert (
            out_df.where(f"member_id == {member_id}").select("age").collect()[0][0]
            == value
        )
    assert dict(out_df.dtypes)["age"] == "decimal(10,2)"
def test_auto_mapper_hash(spark_session: SparkSession) -> None:
    """A.hash produces a stable string hash over multiple columns."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "54"),
            (2, "Vidal", "67"),
            (3, "Vidal", None),
            (4, None, None),
        ],
        ["member_id", "last_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    src_df = src_df.withColumn("my_age", col("my_age").cast("int"))
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.hash(A.column("my_age"), A.column("last_name")))
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["age"],
        hash(col("b.my_age"), col("b.last_name")).cast("string").alias("age"),
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert: all-null input yields "42" (Spark hash seed)
    out_df.printSchema()
    out_df.show()

    expected = {1: "-543157534", 2: "2048196121", 3: "-80001407", 4: "42"}
    for member_id, value in expected.items():
        assert (
            out_df.where(f"member_id == {member_id}").select("age").collect()[0][0]
            == value
        )
    assert dict(out_df.dtypes)["age"] == "string"
def test_auto_mapper_date_format(spark_session: SparkSession) -> None:
    """to_date_format renders a timestamp column as an hh:mm:ss string."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01 12:30"),
            (2, "Vidal", "Michael", "1970-02-02 06:30"),
        ],
        ["member_id", "last_name", "first_name", "opening_time"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    src_df = src_df.withColumn(
        "opening_time", to_timestamp("opening_time", format="yyyy-MM-dd hh:mm")
    )
    assert dict(src_df.dtypes)["opening_time"] == "timestamp"
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        openingTime=A.datetime(A.column("opening_time")).to_date_format("hh:mm:ss")
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["openingTime"],
        date_format(
            coalesce(to_timestamp(col("b.opening_time"))), "hh:mm:ss"
        ).alias("openingTime"),
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    assert (
        out_df.where("member_id == 1").select("openingTime").collect()[0][0]
        == "12:30:00"
    )
    assert (
        out_df.where("member_id == 2").select("openingTime").collect()[0][0]
        == "06:30:00"
    )
    # Output is a plain string, not a timestamp
    assert dict(out_df.dtypes)["openingTime"] == "string"
def test_auto_mapper_boolean(spark_session: SparkSession) -> None:
    """A.boolean casts column values and string literals to booleans."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "0"),
            (2, "Vidal", "Michael", "1"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.boolean(A.column("my_age")),
        is_active=A.boolean("False"),
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    assert_compare_expressions(
        specs["age"], col("b.my_age").cast("boolean").alias("age")
    )
    assert_compare_expressions(
        specs["is_active"], lit("False").cast("boolean").alias("is_active")
    )

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert: "0"/"1" cast to False/True; the literal is False everywhere
    out_df.printSchema()
    out_df.show()

    assert out_df.where("member_id == 1").select("age", "is_active").collect()[0][
        :
    ] == (False, False)
    assert out_df.where("member_id == 2").select("age", "is_active").collect()[0][
        :
    ] == (True, False)
    assert dict(out_df.dtypes)["age"] == "boolean"
    assert dict(out_df.dtypes)["is_active"] == "boolean"
def test_auto_mapper_multiple_columns(spark_session: SparkSession) -> None:
    """Chained .columns() calls accumulate mappings on one AutoMapper."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    src_df: DataFrame = spark_session.table("patients")
    keys_df = src_df.select("member_id")
    keys_df.createOrReplaceTempView("members")

    # Act: each destination column added in a separate chained call
    automapper = (
        AutoMapper(
            view="members",
            source_view="patients",
            keys=["member_id"],
            drop_key_columns=False,
        )
        .columns(dst1="src1")
        .columns(dst2=AutoMapperList(["address1"]))
        .columns(dst3=AutoMapperList(["address1", "address2"]))
        .columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    out_df: DataFrame = automapper.transform(df=keys_df)

    # Assert
    out_df.printSchema()
    out_df.show()

    assert len(out_df.columns) == 5, list(out_df.columns)
    member_one = out_df.where("member_id == 1")
    assert member_one.select("dst1").collect()[0][0] == "src1"
    assert member_one.select("dst2").collect()[0][0][0] == "address1"
    assert member_one.select("dst3").collect()[0][0][0] == "address1"
    assert member_one.select("dst3").collect()[0][0][1] == "address2"
    assert member_one.select("dst4").collect()[0][0][0][0] == "usual"
    assert member_one.select("dst4").collect()[0][0][0][1] == "Qureshi"
def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession,
) -> None:
    """nested_array_filter keeps outer items whose inner array has a match."""
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    environ["LOGLEVEL"] = "DEBUG"
    data_json_file: Path = data_dir.joinpath("data.json")
    src_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    src_df.createOrReplaceTempView("patients")
    src_df.show(truncate=False)

    # Act
    automapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        )
    )
    assert isinstance(automapper, AutoMapper)
    specs: Dict[str, Column] = automapper.get_column_specs(source_df=src_df)
    for name, expr in specs.items():
        print(f"{name}: {expr}")

    # Spec should be filter() over the outer array with exists() on the inner
    assert_compare_expressions(
        specs["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"], lambda x: x["reference"] == lit("bar").cast("string")
            ),
        ).alias("age"),
    )

    out_df: DataFrame = automapper.transform(df=src_df)
    out_df.printSchema()
    out_df.show(truncate=False)

    assert out_df.count() == 2
    # First row has no match; second row keeps the matching nested entry
    assert out_df.select("age").collect()[0][0] == []
    assert (
        out_df.select("age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
    )
def test_auto_mapper_fhir_reference(spark_session: SparkSession) -> None:
    """FhirReference should render as '<resource_type>/<id>' in the mapped output."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi"),
            (2, "Vidal"),
        ],
        ["member_id", "last_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        patient=Patient(
            id_=FhirId(A.column("last_name")),
            managingOrganization=Reference(
                reference=FhirReference("Organization", A.column("last_name"))
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show(truncate=False)

    def reference_of(predicate: str) -> str:  # reference string for matching member
        return (
            result_df.where(predicate)
            .selectExpr("patient.managingOrganization.reference")
            .collect()[0][0]
        )

    assert reference_of("member_id == 1") == "Organization/Qureshi"
    assert reference_of("member_id == 2") == "Organization/Vidal"
def test_auto_mapper_date_column_typed(spark_session: SparkSession) -> None:
    """A.date on a column that is already a Spark `date` should pass it through
    unchanged (no extra conversion in the generated expression)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    # Convert the string column to a real date so the mapper sees a typed input.
    source_df = source_df.withColumn(
        "date_of_birth", to_date("date_of_birth", format="yyyy-MM-dd")
    )
    assert dict(source_df.dtypes)["date_of_birth"] == "date"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Typed input ⇒ the spec is just a pass-through alias.
    assert str(sql_expressions["birthDate"]) == str(
        col("b.date_of_birth").alias("birthDate")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def birth_date(predicate: str):  # birthDate value for the matching member
        return result_df.where(predicate).select("birthDate").collect()[0][0]

    assert birth_date("member_id == 1") == date(1970, 1, 1)
    assert birth_date("member_id == 2") == date(1970, 2, 2)
    assert dict(result_df.dtypes)["birthDate"] == "date"
def test_auto_mapper_full_no_views(spark_session: SparkSession) -> None:
    """AutoMapper created without view names should map directly on the
    DataFrame passed to transform()."""
    # Arrange
    source_df = spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ],
        ['member_id', 'last_name', 'first_name'],
    )

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(keys=["member_id"], drop_key_columns=False).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
    )

    # Demonstrates conditionally extending an existing mapper.
    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    def member_one(column: str):  # mapped value of `column` for member 1
        return result_df.where("member_id == 1").select(column).collect()[0][0]

    assert len(result_df.columns) == 5
    assert member_one("dst1") == "src1"
    assert member_one("dst2")[0] == "address1"
    assert member_one("dst3")[0] == "address1"
    assert member_one("dst3")[1] == "address2"
    assert member_one("dst4")[0][0] == "usual"
    assert member_one("dst4")[0][1] == "Qureshi"
def test_auto_mapper_fhir_plan_definition(spark_session: SparkSession) -> None:
    """Mapping to a FHIR PlanDefinition resource should build specs and
    transform without raising (smoke test; no value assertions)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        PlanDefinition(
            id_=FhirId(A.column("member_id")),
            status=PublicationStatusCodeValues.Active,
        )
    )

    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)
    result_df.printSchema()
    result_df.show()
def test_automapper_if_not(spark_session: SparkSession) -> None:
    """A.if_not should produce the value only when the column does NOT equal the
    check value (null-safe comparison), and null when it does."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.if_not(
            column=A.column("my_age"),
            check=A.text("55"),
            value=A.number(A.text("100")),
        )
    )

    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # eqNullSafe ⇒ a null column compares unequal to "55" without propagating null.
    expected_spec = (
        when(col("b.my_age").eqNullSafe(lit("55").cast(StringType())), lit(None))
        .otherwise(lit("100").cast(StringType()).cast(LongType()))
        .alias("age")
    )
    assert str(sql_expressions["age"]) == str(expected_spec)

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 1
    assert result_df.where("member_id == 1").select("age").collect()[0][0] == 100
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_auto_mapper_lpad(spark_session: SparkSession) -> None:
    """A.lpad should left-pad the source column with the pad character up to the
    requested width, leaving already-long values untouched."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1234"),
            (2, "1234567"),
            (3, "123456789"),
        ],
        ["member_id", "empi"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.lpad(column=A.column("empi"), length=9, pad="0"))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    assert str(sql_expressions["my_column"]) == str(
        lpad(col=col("b.empi"), len=9, pad="0").alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    expected_by_predicate = {
        "member_id == 1": "000001234",
        "member_id == 2": "001234567",
        "member_id == 3": "123456789",
    }
    for predicate, padded_value in expected_by_predicate.items():
        assert (
            result_df.where(predicate).select("my_column").collect()[0][0]
            == padded_value
        )