def test_auto_mapper_with_column_expression(
    spark_session: SparkSession,
) -> None:
    """Check that ``A.expression`` produces the expected column spec and values.

    A two-row ``patients`` view is registered, a ``members`` view holding only
    ``member_id`` is derived from it, and a mapper with a single ``lname``
    column defined by a ``SUBSTRING`` SQL expression is run against it.
    """
    # Arrange
    patient_rows = [
        (1, "Qureshi", "Imran"),
        (2, "Vidal", "Michael"),
    ]
    patient_columns = ["member_id", "last_name", "first_name"]
    spark_session.createDataFrame(
        patient_rows, patient_columns
    ).createOrReplaceTempView("patients")

    patients_df: DataFrame = spark_session.table("patients")
    members_df = patients_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    base_mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    )
    mapper = base_mapper.columns(
        lname=A.expression("SUBSTRING(last_name,1,3)")
    )
    assert isinstance(mapper, AutoMapper)

    column_specs: Dict[str, Column] = mapper.get_column_specs(
        source_df=patients_df
    )
    for spec_name, spec in column_specs.items():
        print(f"{spec_name}: {spec}")

    # The generated spec should render the same as a hand-written aliased expr.
    expected_spec = expr("SUBSTRING(last_name,1,3)").alias("lname")
    assert str(column_specs["lname"]) == str(expected_spec)

    mapped_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    mapped_df.printSchema()
    mapped_df.show()

    expected_by_filter = (
        ("member_id == 1", "Qur"),
        ("member_id == 2", "Vid"),
    )
    for member_filter, expected_lname in expected_by_filter:
        matching_rows = mapped_df.where(member_filter).select("lname").collect()
        assert matching_rows[0][0] == expected_lname
def test_automapper_first_valid_column(spark_session: SparkSession) -> None:
    """Check that ``A.first_valid_column`` picks a candidate the source can serve.

    The ``patients`` view created here has a ``my_age`` column but no ``age``
    column, while the mapper lists candidates based on both. The point of the
    test is that one mapper definition can be used on sources with different
    columns and still work as expected.
    """
    # Arrange
    patient_rows = [
        (1, "Qureshi", "Imran", "54"),
        (2, "Vidal", "Michael", "33"),
    ]
    patient_columns = ["member_id", "last_name", "first_name", "my_age"]
    spark_session.createDataFrame(
        patient_rows, patient_columns
    ).createOrReplaceTempView("patients")

    patients_df: DataFrame = spark_session.table("patients")
    members_df = patients_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    def young_label_from(age_column: str):  # type: ignore[no-untyped-def]
        # Builds a fresh lookup dict on every call so the two candidates
        # never share one mapping object.
        return A.map(
            A.column(age_column),
            {
                "21": "yes",
                "33": "yes",
                "54": "no comment",
                None: "not provided",
            },
        )

    # Act
    base_mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    )
    mapper = base_mapper.columns(
        last_name=A.column("last_name"),
        age=A.first_valid_column(
            A.number(A.expression("CAST (age AS BIGINT)")),
            A.number(A.expression("CAST (my_age AS BIGINT)")),
            A.text(None),
        ),
        is_young=A.first_valid_column(
            young_label_from("age"),
            young_label_from("my_age"),
        ),
    )
    assert isinstance(mapper, AutoMapper)

    column_specs: Dict[str, Column] = mapper.get_column_specs(
        source_df=patients_df
    )
    for spec_name, spec in column_specs.items():
        print(f"{spec_name}: {spec}")

    # The expected spec is based on ``my_age``, the column this source has.
    expected_age_spec = col("my_age").cast("long").cast("long").alias("age")
    assert_compare_expressions(column_specs["age"], expected_age_spec)

    mapped_df: DataFrame = mapper.transform(df=members_df)

    # Assert
    mapped_df.printSchema()
    mapped_df.show()

    expected_by_filter = (
        ("member_id == 1", (54, "no comment")),
        ("member_id == 2", (33, "yes")),
    )
    for member_filter, expected_values in expected_by_filter:
        matching_rows = (
            mapped_df.where(member_filter).select("age", "is_young").collect()
        )
        assert matching_rows[0][:] == expected_values
def mapping(parameters: Dict[str, Any]) -> AutoMapper:
    """Build the mapper that turns provider feed rows into FHIR Practitioners.

    Args:
        parameters: Mapping options; ``parameters["view"]`` is used as the
            mapper's destination view.

    Returns:
        The mapper produced by ``AutoMapper(...).complex(Practitioner(...))``,
        reading from ``bwellProviderFeed_08122020`` and keyed on
        ``gecb_provider_number``.

    Raises:
        KeyError: If ``parameters`` has no ``"view"`` entry.
    """
    # Built first so a missing "view" key fails before any FHIR object exists.
    base_mapper = AutoMapper(
        view=parameters["view"],
        source_view="bwellProviderFeed_08122020",  # file name
        keys=["gecb_provider_number"],
    )

    practitioner_id = FhirId(A.column("b.gecb_provider_number"))

    # One security coding per label system, all carrying the "bwell" code.
    security_systems = (
        "https://www.icanbwell.com/owner",
        "https://www.icanbwell.com/access",
        "https://www.icanbwell.com/vendor",
    )
    meta = Meta(
        source=A.text("http://www.icanbwell.com"),
        security=FhirList(
            [
                Coding(
                    system=system_url,
                    code=AllSecurityLabelsCode(A.text("bwell")),
                )
                for system_url in security_systems
            ]
        ),
    )

    # NOTE(review): both identifier type codings use IdentifierUseCode.codeset
    # as their system while the code comes from IdentifierTypeCodesCodeValues;
    # confirm that this pairing is intended.
    provider_number_identifier = Identifier(
        use=IdentifierUseCodeValues.Usual,
        value=A.column("b.gecb_provider_number"),
        type_=CodeableConcept(
            coding=FhirList(
                [
                    Coding(
                        system=IdentifierUseCode.codeset,
                        code=IdentifierTypeCodesCodeValues.ProviderNumber,
                    )
                ]
            )
        ),
        system="medstarhealth.org",
    )
    npi_identifier = Identifier(
        use=IdentifierUseCodeValues.Official,
        value=A.column("provider_npi"),
        type_=CodeableConcept(
            coding=FhirList(
                [
                    Coding(
                        system=IdentifierUseCode.codeset,
                        code=IdentifierTypeCodesCodeValues.NationalProviderIdentifier,
                    )
                ]
            )
        ),
        system="http://hl7.org/fhir/sid/us-npi",
    )
    identifiers = FhirList([provider_number_identifier, npi_identifier])

    given_names = FhirList(
        [
            A.column("provider_first_name"),
            A.column("provider_middle_name"),
        ]
    )
    names = FhirList(
        [
            HumanName(
                given=given_names,
                family=A.column("provider_last_name"),
                suffix=FhirList([A.column("provider_title")]),
                use=NameUseCodeValues.Usual,
                text=A.text(" "),
            )
        ]
    )

    # Feed values MALE / FEMALE become 'male' / 'female'; anything else
    # becomes 'unknown'.
    gender = AdministrativeGenderCode(
        A.expression(
            " CASE WHEN `provider_gender` = 'MALE' THEN 'male'"
            " WHEN `provider_gender` = 'FEMALE' THEN 'female'"
            " ELSE 'unknown' END "
        )
    )

    telecom = FhirList([])

    address_lines = FhirList(
        [
            A.column("scheduling_location_address1"),
            A.column("scheduling_location_address2"),
        ]
    )
    addresses = FhirList(
        [
            Address(
                use=AddressUseCodeValues.Work,
                type_=AddressTypeCode(A.text("physical")),
                text=A.column("practice_name"),
                line=address_lines,
                city=A.column("scheduling_location_city"),
                district=A.text(" "),
                state=A.column("scheduling_location_state"),
                postalCode=A.column("scheduling_location_zip"),
                country=A.text("USA"),
            )
        ]
    )

    # birthdate is not mapped here.
    practitioner = Practitioner(
        id_=practitioner_id,
        meta=meta,
        identifier=identifiers,
        active=True,
        name=names,
        gender=gender,
        telecom=telecom,
        address=addresses,
    )

    mapper = base_mapper.complex(practitioner)
    return mapper