def test_auto_mapper_array_single_item(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1"]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(expr("address1")).isNotNull(),
            filter(coalesce(array(expr("address1")), array()), lambda x: x.isNotNull()),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
        == "address1"
    )
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
        == "address1"
    )

def test_auto_mapper_date_literal(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(birthDate=A.date("1970-01-01"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["birthDate"],
        coalesce(
            to_date(lit("1970-01-01"), format="y-M-d"),
            to_date(lit("1970-01-01"), format="yyyyMMdd"),
            to_date(lit("1970-01-01"), format="M/d/y"),
        ).alias("birthDate"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )
    assert (
        result_df.where("member_id == 2").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )

def test_automapper_if_regex(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.if_regex(
            column=A.column("my_age"),
            check="5*",
            value=A.number(A.column("my_age")),
            else_=A.number(A.text("100")),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(col("b.my_age").rlike("5*"), col("b.my_age").cast(LongType()))
        .otherwise(lit("100").cast(StringType()).cast(LongType()))
        .alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

def test_auto_mapper_date_column_typed(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df = source_df.withColumn(
        "date_of_birth", to_date("date_of_birth", format="yyyy-MM-dd")
    )
    assert dict(source_df.dtypes)["date_of_birth"] == "date"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["birthDate"], col("b.date_of_birth").alias("birthDate")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )
    assert (
        result_df.where("member_id == 2").select("birthDate").collect()[0][0]
        == date(1970, 2, 2)
    )

    assert dict(result_df.dtypes)["birthDate"] == "date"

def test_auto_mapper_lpad(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1234"),
            (2, "1234567"),
            (3, "123456789"),
        ],
        ["member_id", "empi"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.lpad(column=A.column("empi"), length=9, pad="0"))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)

    assert str(sql_expressions["my_column"]) == str(
        lpad(col=col("b.empi"), len=9, pad="0").alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "000001234"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "001234567"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 3").select("my_column").collect()[0][0]
        == "123456789"
    )

def test_auto_mapper_complex(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family="imran"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert_compare_expressions(
        sql_expressions["dst2"],
        struct(lit("usual").alias("use"), lit("imran").alias("family")).alias("dst2"),
    )

    result_df.printSchema()
    result_df.show()

    result_df.where("member_id == 1").select("dst2").show()
    result_df.where("member_id == 1").select("dst2").printSchema()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1] == "imran"

def test_automapper_map_no_default(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        keep_null_rows=True,
    ).columns(has_kids=A.map(A.column("has_kids"), {"Y": "Yes", "N": "No"}))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["has_kids"]) == str(
        when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit(None))
        .alias("___has_kids")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("has_kids").collect()[0][0] == "Yes"
    assert result_df.where("member_id == 2").select("has_kids").collect()[0][0] == "No"
    assert result_df.where("member_id == 3").select("has_kids").collect()[0][0] is None

def test_auto_mapper_datetime_column_default(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "18922"),
            (2, "Vidal", "Michael", "1609390500"),
        ],
        ["member_id", "last_name", "first_name", "ts"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(timestamp=A.unix_timestamp(A.column("ts")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["timestamp"],
        to_timestamp(
            from_unixtime(col("b.ts"), "yyyy-MM-dd HH:mm:ss"), "yyyy-MM-dd HH:mm:ss"
        ).alias("timestamp"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 2
    assert (
        result_df.where("member_id == 1").select("timestamp").collect()[0][0]
        == datetime(1970, 1, 1, 5, 15, 22)
    )
    assert (
        result_df.where("member_id == 2").select("timestamp").collect()[0][0]
        == datetime(2020, 12, 31, 4, 55, 0)
    )

    assert dict(result_df.dtypes)["timestamp"] == "timestamp"

def test_auto_mapper_split_by_delimiter_and_transform(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal|Bates", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        MyObject(
            my_column=A.transform(
                A.split_by_delimiter(A.column("last_name"), "|"),
                A.complex(bar=A.field("_"), bar2=A.field("_")),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["my_column"]) == str(
    #     split(col("b.last_name"), "[|]", -1).alias("my_column")
    # )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0][0]["bar"]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0][0]["bar"]
        == "Vidal"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0][1]["bar"]
        == "Bates"
    )

def test_auto_mapper_complex_with_defined_class(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).complex(MyClass(name=A.column("last_name"), age=A.number(A.column("my_age"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["name"]) == str(
        col("b.last_name").cast("string").alias("name")
    )
    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("long").alias("age")
    )

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

def test_automapper_map(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "Y"),
            (2, "Vidal", "Michael", "N"),
            (3, "Vidal", "Michael", "f"),
        ],
        ["member_id", "last_name", "first_name", "has_kids"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(has_kids=A.map(A.column("has_kids"), {"Y": "Yes", "N": "No"}, "unknown"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["has_kids"],
        when(col("b.has_kids").eqNullSafe(lit("Y")), lit("Yes"))
        .when(col("b.has_kids").eqNullSafe(lit("N")), lit("No"))
        .otherwise(lit("unknown"))
        .alias("___has_kids"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("has_kids").collect()[0][0] == "Yes"
    assert result_df.where("member_id == 2").select("has_kids").collect()[0][0] == "No"
    assert (
        result_df.where("member_id == 3").select("has_kids").collect()[0][0]
        == "unknown"
    )

def test_auto_mapper_array_multiple_items_with_null(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1", "address2", None]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["dst2"]) == str(
        filter(
            array(lit("address1"), lit("address2"), lit(None)), lambda x: x.isNotNull()
        ).alias("dst2")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
        == "address1"
    )
    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][1]
        == "address2"
    )
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
        == "address1"
    )
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][1]
        == "address2"
    )

def test_automapper_concat_array(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", drop_key_columns=False
    ).columns(age=A.column("identifier").concat(A.text("foo").to_array()))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        concat(col("b.identifier"), array(lit("foo").cast("string"))).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.where("id == 1730325416").select("age").collect()[0][0] == [
        "bar",
        "foo",
    ]
    assert result_df.where("id == 1467734301").select("age").collect()[0][0] == [
        "John",
        "foo",
    ]

def test_auto_mapper_amount(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "67.67"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.amount(A.column("my_age")))

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        col("b.my_age").cast("float").alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == approx(
        54.45
    )
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == approx(
        67.67
    )

    assert dict(result_df.dtypes)["age"] == "float"

def test_auto_mapper_array_typed(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "0"),
            (2, "Vidal", "Michael", "1"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df = source_df.withColumn("my_age", col("my_age").cast("boolean"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.array(A.column("my_age")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(array(col("b.my_age")).alias("age"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == ["0"]
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == ["1"]

    assert dict(result_df.dtypes)["age"] == "array<string>"

def test_auto_mapper_complex_with_mappers(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=A.complex(use="usual", family=A.complex(given="foo")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert str(sql_expressions["dst2"]) == str(
        struct(
            expr("usual").alias("use"),
            struct(expr("foo").alias("given")).alias("family"),
        ).alias("dst2")
    )

    result_df.printSchema()
    result_df.show()

    result = result_df.where("member_id == 1").select("dst2").collect()[0][0]
    assert result[0] == "usual"
    assert result[1][0] == "foo"

def test_automapper_if_not_null(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.if_not_null(
            A.column("my_age"), A.number(A.column("my_age")), A.number(A.text("100"))
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull(),
            lit("100").cast(StringType()).cast(IntegerType()),
        )
        .otherwise(col("b.my_age").cast(IntegerType()))
        .alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == 100

    assert dict(result_df.dtypes)["age"] == "int"

def test_auto_mapper_date_column(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "12/31/2020"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(birthDate=A.date(A.column("date_of_birth")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), format="y-M-d"),
            to_date(col("b.date_of_birth"), format="yyyyMMdd"),
            to_date(col("b.date_of_birth"), format="M/d/y"),
        ).alias("birthDate")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("birthDate").collect()[0][0]
        == date(1970, 1, 1)
    )
    assert (
        result_df.where("member_id == 2").select("birthDate").collect()[0][0]
        == date(2020, 12, 31)
    )

    assert dict(result_df.dtypes)["birthDate"] == "date"

def test_auto_mapper_substring_by_delimiter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.substring_by_delimiter(A.column("last_name"), "s", 1))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["my_column"]) == str(
        substring_index(col("b.last_name"), "s", 1).alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0] == "Qure"
    )
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0] == "Vidal"
    )

def test_auto_mapper_regex_replace(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1970-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.regex_replace(A.column("last_name"), "i", "f"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        regexp_replace(col("b.last_name"), "i", "f").alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0]
        == "Qureshf"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0]
        == "Vfdal"
    )

def test_auto_mapper_with_column_literal(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(lname="last_name")

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["lname"]) == str(lit("last_name").alias("lname"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("lname").collect()[0][0]
        == "last_name"
    )
    assert (
        result_df.where("member_id == 2").select("lname").collect()[0][0]
        == "last_name"
    )

def test_automapper_null_remover(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line")
                    .select(A.current().sanitize())
                    .remove_null_or_empty(),
                )
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(
    #     filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age")
    # )

    result_df: DataFrame = mapper.transform(df=source_df)

    print(result_df.select("address").collect()[0][0])
    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]

    result_df.show(truncate=False)

def test_automapper_select_one(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"

def test_auto_mapper_with_filter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        filter_by="left(last_name,2) == 'Vi'",
    ).columns(lname=A.column("last_name"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["lname"]) == str(col("b.last_name").alias("lname"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 1
    assert result_df.where("member_id == 2").select("lname").collect()[0][0] == "Vidal"

def test_automapper_transform(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.transform(
                A.column("identifier"),
                A.complex(bar=A.field("value"), bar2=A.field("system")),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            "b.identifier",
            lambda x: struct(
                col("x[value]").alias("bar"), col("x[system]").alias("bar2")
            ),
        ).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0][0][0] == "123"

def test_auto_mapper_regex_extract(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01"),
            (2, "Vidal", "Michael", "1980-02-02"),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(my_column=A.regex_extract(A.column("date_of_birth"), r"^(\d{4}).*", 1))

    # Assert
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)

    assert_compare_expressions(
        sql_expressions["my_column"],
        regexp_extract(col("b.date_of_birth"), r"^(\d{4}).*", 1).alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 1").select("my_column").collect()[0][0] == "1970"
    )
    # noinspection SpellCheckingInspection
    assert (
        result_df.where("member_id == 2").select("my_column").collect()[0][0] == "1980"
    )

def test_auto_mapper_fhir_id(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi- / &&"),
            (2, " Vidal. "),
        ],
        ["member_id", "last_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(patient=Patient(id_=FhirId(A.column("last_name"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show(truncate=False)

    assert (
        result_df.where("member_id == 1").selectExpr("patient.id").collect()[0][0]
        == "Qureshi------"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("patient.id").collect()[0][0]
        == "-Vidal.-"
    )

def test_automapper_flatten(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"

    source_df = spark_session.createDataFrame([([[1], [2, 3, 4], [3, 5]],)], ["column"])
    source_df.printSchema()
    source_df.createOrReplaceTempView(source_view_name)

    # Act
    mapper = AutoMapper(view=result_view_name, source_view=source_view_name).columns(
        column_flat=A.flatten(A.column("column"))
    )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # assert
    assert result_df.select("column_flat").collect()[0][0] == [1, 2, 3, 4, 3, 5]

def test_auto_mapper_join_using_delimiter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (123456789, "Gagan", "Chawla", ["MD", "PhD"]),
        ],
        ["npi", "first_name", "last_name", "suffix"],
    ).createOrReplaceTempView("practitioners")

    source_df: DataFrame = spark_session.table("practitioners")

    df = source_df.select("npi")
    df.createOrReplaceTempView("physicians")

    # Act
    mapper = AutoMapper(
        view="physicians", source_view="practitioners", keys=["npi"]
    ).columns(my_column=A.join_using_delimiter(A.column("suffix"), ", "))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        array_join(col("b.suffix"), ", ").alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("npi == 123456789").select("my_column").collect()[0][0]
        == "MD, PhD"
    )

def test_automapper_copy_unmapped_properties(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            ("Qureshi", "Imran", "Iqbal"),
            ("Vidal", "Michael", "Lweis"),
        ],
        ["last_name", "first_name", "middle_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        copy_all_unmapped_properties=True,
        copy_all_unmapped_properties_exclude=["first_name"],
    ).columns(last_name="last_name")

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 2, list(result_df.columns)
    assert result_df.columns == ["last_name", "middle_name"]