def mapping(parameters: Dict[str, Any]) -> List[AutoMapperBase]:
    """Build two example AutoMappers over the patients view and return both."""
    # example of a variable
    address_value: str = "address1"

    first_mapper = AutoMapper(
        view=parameters["view"], source_view="patients", keys=["member_id"]
    ).columns(
        patient_id=A.column("member_id"),
        dst1="src1",
        dst2=AutoMapperList([address_value]),
        dst3=AutoMapperList([address_value, "address2"]),
        dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))]),
    )

    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        # columns() can be called again to extend an already-built mapper
        first_mapper = first_mapper.columns(
            dst5=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            )
        )

    second_mapper = AutoMapper(
        view=parameters["view2"], source_view="patients", keys=["member_id"]
    ).columns(
        patient_id=A.column("member_id"),
        dst1="src2",
        dst22=AutoMapperList([address_value]),
    )

    return [first_mapper, second_mapper]
def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    """Mapping with no keys: rows are matched by data, not by a key column."""
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            ("Qureshi", "Imran"),
            ("Vidal", "Michael"),
        ],
        ["last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    address_value: str = "address1"

    # Act
    automapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([address_value]),
        dst3=AutoMapperList([address_value, "address2"]),
    )
    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        automapper = automapper.columns(
            dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))]),
            dst5=AutoMapperList([A.complex(use="usual", first=A.column("first_name"))]),
        )

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    automapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    # with no key column, locate the Qureshi row via the mapped dst4 struct
    qureshi_row = result_df.where("dst4[0].family == 'Qureshi'")
    assert qureshi_row.select("dst1").collect()[0][0] == "src1"
    assert qureshi_row.select("dst2").collect()[0][0][0] == "address1"
    assert qureshi_row.select("dst3").collect()[0][0][0] == "address1"
    assert qureshi_row.select("dst3").collect()[0][0][1] == "address2"
    assert qureshi_row.select("dst4").collect()[0][0][0][0] == "usual"
    assert qureshi_row.select("dst4").collect()[0][0][0][1] == "Qureshi"
    assert qureshi_row.select("dst5").collect()[0][0][0][1] == "Imran"
def test_automapper_filter_and_transform(spark_session: SparkSession) -> None:
    """filter + transform: keep 'usual' identifiers and reshape them."""
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    automapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.transform(
                A.filter(
                    column=A.column("identifier"),
                    func=lambda x: x["use"] == lit("usual"),
                ),
                A.complex(bar=A.field("value"), bar2=A.field("system")),
            )
        )
    )
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    # the generated expression should filter identifiers by use == 'usual'
    # and project value/system into the bar/bar2 struct fields
    assert str(specs["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")),
        ).alias("age")
    )

    result_df: DataFrame = automapper.transform(df=source_df)
    result_df.show(truncate=False)
def test_auto_mapper_complex_with_mappers(spark_session: SparkSession) -> None:
    """A.complex values may nest other A.complex mappers."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family=A.complex(given="foo")))
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert
    assert_compare_expressions(
        specs["dst2"],
        struct(
            expr("usual").alias("use"),
            struct(expr("foo").alias("given")).alias("family"),
        ).alias("dst2"),
    )
    result_df.printSchema()
    result_df.show()

    dst2_value = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert dst2_value[0] == "usual"
    assert dst2_value[1][0] == "foo"
def test_auto_mapper_array_single_item_with_mapper(
    spark_session: SparkSession,
) -> None:
    """A one-element AutoMapperList becomes an array with nulls filtered out."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList([A.complex(addr="address1")]))
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    assert_compare_expressions(
        specs["dst2"],
        when(
            array(struct(lit("address1").alias("addr"))).isNotNull(),
            filter(
                coalesce(array(struct(lit("address1").alias("addr"))), array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst2"),
    )

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][0][0]
        == "address1"
    )
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][0]
        == "address1"
    )
def test_auto_mapper_multiple_columns(spark_session: SparkSession) -> None:
    """Repeated columns() calls accumulate onto the same mapper."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act: one columns() call per destination column
    automapper = (
        AutoMapper(
            view="members",
            source_view="patients",
            keys=["member_id"],
            drop_key_columns=False,
        )
        .columns(dst1="src1")
        .columns(dst2=AutoMapperList(["address1"]))
        .columns(dst3=AutoMapperList(["address1", "address2"]))
        .columns(
            dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))])
        )
    )
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    first_row = result_df.where("member_id == 1")
    assert first_row.select("dst1").collect()[0][0] == "src1"
    assert first_row.select("dst2").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][1] == "address2"
    assert first_row.select("dst4").collect()[0][0][0][0] == "usual"
    assert first_row.select("dst4").collect()[0][0][0][1] == "Qureshi"
def test_auto_mapper_full_no_views(spark_session: SparkSession) -> None:
    """AutoMapper works directly on a DataFrame when no views are given."""
    # Arrange
    source_df = spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    )

    # example of a variable
    address_value: str = "address1"

    # Act
    automapper = AutoMapper(keys=["member_id"], drop_key_columns=False).columns(
        dst1="src1",
        dst2=AutoMapperList([address_value]),
        dst3=AutoMapperList([address_value, "address2"]),
    )
    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        automapper = automapper.columns(
            dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))])
        )

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    result_df: DataFrame = automapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    first_row = result_df.where("member_id == 1")
    assert first_row.select("dst1").collect()[0][0] == "src1"
    assert first_row.select("dst2").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][1] == "address2"
    assert first_row.select("dst4").collect()[0][0][0][0] == "usual"
    assert first_row.select("dst4").collect()[0][0][0][1] == "Qureshi"
def test_auto_mapper_split_by_delimiter_and_transform(
    spark_session: SparkSession,
) -> None:
    """split_by_delimiter feeds each piece through a transform mapper."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal|Bates", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        MyObject(
            my_column=A.transform(
                A.split_by_delimiter(A.column("last_name"), "|"),
                A.complex(bar=A.field("_"), bar2=A.field("_")),
            )
        )
    )
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    # NOTE(review): exact-expression check intentionally left disabled:
    # assert str(specs["my_column"]) == str(
    #     split(col("b.last_name"), "[|]", -1).alias("my_column")
    # )

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    first = result_df.where("member_id == 1").select("my_column").collect()[0][0]
    second = result_df.where("member_id == 2").select("my_column").collect()[0][0]
    assert first[0]["bar"] == "Qureshi"
    assert second[0]["bar"] == "Vidal"
    assert second[1]["bar"] == "Bates"
def test_auto_mapper_complex(spark_session: SparkSession) -> None:
    """A.complex with literal values produces a struct of literals."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family="imran"))
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert
    assert str(specs["dst2"]) == str(
        struct(lit("usual").alias("use"), lit("imran").alias("family")).alias("dst2")
    )
    result_df.printSchema()
    result_df.show()
    result_df.where("member_id == 1").select("dst2").show()
    result_df.where("member_id == 1").select("dst2").printSchema()

    dst2_value = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert dst2_value[0] == "usual"
    assert dst2_value[1] == "imran"
def test_auto_mapper_full(spark_session: SparkSession) -> None:
    """End-to-end mapping, including a reserved-word column name (class_)."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "First"),
            (2, "Vidal", "Michael", "Second"),
        ],
        ["member_id", "last_name", "first_name", "class"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    address_value: str = "address1"

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList([address_value]),
        dst3=AutoMapperList([address_value, "address2"]),
        class_=A.column("class"),
    )
    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        automapper = automapper.columns(
            dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))])
        )

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    automapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 6
    first_row = result_df.where("member_id == 1")
    assert first_row.select("dst1").collect()[0][0] == "src1"
    assert first_row.select("dst2").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][1] == "address2"
    assert first_row.select("dst4").collect()[0][0][0][0] == "usual"
    assert first_row.select("dst4").collect()[0][0][0][1] == "Qureshi"
    # the class_ keyword argument is written back without the underscore
    assert result_df.columns[4] == "class"
    assert first_row.select("class").collect()[0][0] == "First"
def test_automapper_full_checkpointing(spark_session: SparkSession) -> None:
    """Mapping with checkpointing enabled every two columns."""
    # Arrange
    clean_spark_session(session=spark_session)

    # recreate an empty checkpoint folder next to this test file
    data_dir: Path = Path(__file__).parent.joinpath("./")
    temp_folder = data_dir.joinpath("./temp")
    if temp_folder.is_dir():
        rmtree(temp_folder)
    temp_folder.mkdir()

    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    address_value: str = "address1"

    # Act: checkpoint after every second column into the temp folder
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
        checkpoint_after_columns=2,
        checkpoint_path=temp_folder,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList([address_value]),
        dst3=AutoMapperList([address_value, "address2"]),
    )
    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        automapper = automapper.columns(
            dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))])
        )

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    automapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    first_row = result_df.where("member_id == 1")
    assert first_row.select("dst1").collect()[0][0] == "src1"
    assert first_row.select("dst2").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][1] == "address2"
    assert first_row.select("dst4").collect()[0][0][0][0] == "usual"
    assert first_row.select("dst4").collect()[0][0][0][1] == "Qureshi"
def test_auto_mapper_columns(spark_session: SparkSession) -> None:
    """Checks both the generated column expressions and the transformed data."""
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")
    source_df: DataFrame = spark_session.table("patients")
    members_df = source_df.select("member_id")
    members_df.createOrReplaceTempView("members")

    # Act
    automapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList(["address1"]),
        dst3=AutoMapperList(["address1", "address2"]),
        dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))]),
    )
    assert isinstance(automapper, AutoMapper)

    specs: Dict[str, Column] = automapper.get_column_specs(source_df=source_df)
    for name, spec in specs.items():
        print(f"{name}: {spec}")

    # Assert: generated expressions
    assert len(specs) == 4
    assert str(specs["dst1"]) == str(lit("src1").alias("dst1"))
    assert str(specs["dst2"]) == str(
        filter(array(lit("address1")), lambda x: x.isNotNull()).alias("dst2")
    )
    assert str(specs["dst3"]) == str(
        filter(
            array(lit("address1"), lit("address2")), lambda x: x.isNotNull()
        ).alias("dst3")
    )
    assert str(specs["dst4"]) == str(
        filter(
            array(
                struct(lit("usual").alias("use"), col("b.last_name").alias("family"))
            ),
            lambda x: x.isNotNull(),
        ).alias("dst4")
    )

    result_df: DataFrame = automapper.transform(df=members_df)

    # Assert: transformed data
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    first_row = result_df.where("member_id == 1")
    assert first_row.select("dst1").collect()[0][0] == "src1"
    assert first_row.select("dst2").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][0] == "address1"
    assert first_row.select("dst3").collect()[0][0][1] == "address2"
    assert first_row.select("dst4").collect()[0][0][0][0] == "usual"
    assert first_row.select("dst4").collect()[0][0][0][1] == "Qureshi"