from os import mkdir, path
from pathlib import Path
from shutil import rmtree
from typing import Any, Dict, List

from pyspark.sql import Column, DataFrame, SparkSession

# NOTE: the import paths below assume the standard spark_auto_mapper package
# layout; adjust them if this project arranges the modules differently.
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.automappers.automapper_base import AutoMapperBase
from spark_auto_mapper.data_types.list import AutoMapperList
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

# clean_spark_session is assumed to be the shared helper from this test
# suite's conftest; point this import at its actual location if it differs.
from tests.conftest import clean_spark_session


def mapping(parameters: Dict[str, Any]) -> List[AutoMapperBase]:
    """Builds two AutoMappers (one per destination view) from runtime parameters."""
    # example of a variable
    client_address_variable: str = "address1"

    mapper = AutoMapper(
        view=parameters["view"], source_view="patients", keys=["member_id"]
    ).columns(
        patient_id=A.column("member_id"),
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
        dst4=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]
        ),
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        # columns can be added conditionally after the mapper is created
        mapper = mapper.columns(
            dst5=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    mapper2 = AutoMapper(
        view=parameters["view2"], source_view="patients", keys=["member_id"]
    ).columns(
        patient_id=A.column("member_id"),
        dst1="src2",
        dst22=AutoMapperList([client_address_variable]),
    )

    return [mapper, mapper2]
def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            ('Qureshi', 'Imran'),
            ('Vidal', 'Michael'),
        ],
        ['last_name', 'first_name'],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            ),
            dst5=AutoMapperList(
                [A.complex(use="usual", first=A.column("first_name"))]
            ),
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst2").collect()[0][0][0] == "address1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][1] == "address2"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst5").collect()[0][0][0][1] == "Imran"
def test_auto_mapper_full_no_views(spark_session: SparkSession) -> None:
    # Arrange
    source_df = spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ],
        ['member_id', 'last_name', 'first_name'],
    )

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(keys=["member_id"], drop_key_columns=False).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
def test_auto_mapper_full(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "First"),
            (2, "Vidal", "Michael", "Second"),
        ],
        ["member_id", "last_name", "first_name", "class"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
        class_=A.column("class"),
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 6
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
    assert result_df.columns[4] == "class"
    assert result_df.where("member_id == 1").select(
        "class").collect()[0][0] == "First"
def test_automapper_full_checkpointing(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)

    data_dir: Path = Path(__file__).parent.joinpath('./')
    temp_folder = data_dir.joinpath('./temp')
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    mkdir(temp_folder)

    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran'),
            (2, 'Vidal', 'Michael'),
        ],
        ['member_id', 'last_name', 'first_name'],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
        checkpoint_after_columns=2,
        checkpoint_path=temp_folder,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
    )

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select(
        "dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select(
        "dst3").collect()[0][0][1] == "address2"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"