def test_automapper_field(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier").select_one(A.field("type.coding[0].code"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(col("b.identifier[0]").alias("age"))

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "PRN"

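# NOTE: clean_spark_session is a shared test helper used throughout this suite
# but not defined in this file. Below is a minimal sketch of an assumed
# implementation (drop temp views and clear cached tables so tests do not leak
# state into each other); the real helper may do more.
def clean_spark_session(session: SparkSession) -> None:
    """Drop all temporary views and clear the cache so each test starts clean."""
    for table in session.catalog.listTables("default"):
        if table.isTemporary:
            session.catalog.dropTempView(table.name)
    session.catalog.clearCache()
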
def test_automapper_flatten(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"

    source_df = spark_session.createDataFrame(
        [([[1], [2, 3, 4], [3, 5]],)], ["column"]
    )
    source_df.createOrReplaceTempView(source_view_name)

    # Act
    mapper = AutoMapper(view=result_view_name, source_view=source_view_name).columns(
        column_flat=A.flatten(A.column("column"))
    )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    assert result_df.select("column_flat").collect()[0][0] == [1, 2, 3, 4, 3, 5]

def test_automapper_flatten_with_null(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"

    schema = StructType(
        [
            StructField(
                "column", ArrayType(elementType=ArrayType(elementType=IntegerType()))
            )
        ]
    )
    source_df = spark_session.createDataFrame(
        [([[1], [2, 3, 4], [3, 5], None],)], schema=schema
    )
    source_df.printSchema()
    source_df.createOrReplaceTempView(source_view_name)

    # Act
    mapper = AutoMapper(view=result_view_name, source_view=source_view_name).columns(
        column_flat=A.flatten(A.column("column"))
    )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    assert result_df.select("column_flat").collect()[0][0] == [1, 2, 3, 4, 3, 5]

def test_can_load_xml_file_with_schema(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.xml')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    xml_schema = StructType(
        [
            StructField("_id", StringType(), True),
            StructField("author", StringType(), True),
            StructField("description", StringType(), True),
            StructField("genre", StringType(), True),
            StructField("price", DoubleType(), True),
            StructField("publish_date", StringType(), True),
            StructField("title", StringType(), True),
        ]
    )

    # Act
    FrameworkXmlLoader(
        view="my_view", filepath=test_file_path, row_tag="book", schema=xml_schema
    ).transform(df)

    # Assert
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()
    assert result.count() == 12
    assert len(result.columns) == 7

def test_can_load_non_standard_delimited_csv(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.psv')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    loader = FrameworkCsvLoader(
        view="my_view", filepath=test_file_path, delimiter="|"
    )
    loader.transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()

    # Assert
    assert loader.getDelimiter() == "|"
    assert_results(result)

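# NOTE: assert_results is shared by these loader tests but is not shown in this
# file. A hypothetical sketch, assuming test.csv and test.psv carry the same
# three data rows (test_can_keep_columns below expects "bar" in Column2 of the
# second row); the real helper may check more cells.
def assert_results(result: DataFrame) -> None:
    # Both the CSV and PSV fixtures are expected to contain three rows
    assert result.count() == 3
    # Spot-check a known cell: second row, Column2 (assumed to be index 1)
    assert result.collect()[1][1] == "bar"
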
def test_correctly_loads_csv_with_clean_flag_on(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('column_name_test.csv')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkCsvLoader(
        view="my_view",
        filepath=test_file_path,
        delimiter=",",
        clean_column_names=True,
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")

    # Assert
    assert_results(result)
    assert result.collect()[1][0] == "2"
    assert (
        result.columns[2]
        == "Ugly_column_with_chars_that_parquet_does_not_like_much_-"
    )

def test_can_convert_json_folder_to_jsonl(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test_files')}"
    temp_folder = data_dir.joinpath("temp")
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    makedirs(temp_folder)

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkJsonToJsonlConverter(
        file_path=test_file_path, output_folder=temp_folder
    ).transform(df)

    # Assert
    with open(temp_folder.joinpath("test.json"), "r+") as file:
        lines: List[str] = file.readlines()
        assert len(lines) == 2
        assert (
            lines[0]
            == '{"title":"A Philosophy of Software Design","authors":[{"given":["John"],"surname":"Ousterhout"}],"edition":null}\n'
        )
        assert (
            lines[1]
            == '{"title":"Essentials of Programming Languages","authors":[{"given":["Dan","P."],"surname":"Friedman"},{"given":["Mitchell"],"surname":"Wand"}],"edition":3}\n'
        )

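# NOTE: the converter above is assumed to take pretty-printed (multi-line) JSON
# documents and emit one JSON object per line. The actual fixture under
# test_files/ is not shown here; an assumed shape consistent with the
# assertions above would be:
#
# test_files/test.json (assumed):
# [
#   {
#     "title": "A Philosophy of Software Design",
#     "authors": [{"given": ["John"], "surname": "Ousterhout"}],
#     "edition": null
#   },
#   {
#     "title": "Essentials of Programming Languages",
#     "authors": [
#       {"given": ["Dan", "P."], "surname": "Friedman"},
#       {"given": ["Mitchell"], "surname": "Wand"}
#     ],
#     "edition": 3
#   }
# ]
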
def test_can_keep_columns(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.csv')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkCsvLoader(
        view="my_view", filepath=test_file_path, delimiter=","
    ).transform(df)
    FrameworkSelectColumnsTransformer(
        view="my_view", keep_columns=["Column2"]
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()

    # Assert
    assert len(result.columns) == 1
    assert result.count() == 3
    assert result.collect()[1][0] == "bar"

def test_can_run_unvalidated_framework_pipeline(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )
    spark_session.sql("DROP TABLE IF EXISTS default.flights")

    # Act
    parameters = {"flights_path": flights_path}
    with ProgressLogger() as progress_logger:
        pipeline: MyUnValidatedPipeline = MyUnValidatedPipeline(
            parameters=parameters, progress_logger=progress_logger
        )
        transformer = pipeline.fit(df)
        transformer.transform(df)

    # Assert
    result_df: DataFrame = spark_session.sql("SELECT * FROM flights2")
    result_df.show()
    assert result_df.count() > 0

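# NOTE: MyUnValidatedPipeline lives in a helper module that is not shown. Below
# is a minimal sketch of its assumed shape: a FrameworkPipeline that loads
# flights.csv into a "flights" view and then materializes "flights2", which the
# assertion above queries. The step choices and module paths here are
# assumptions, not the actual pipeline definition.
# (assumed imports; paths may differ across versions)
from spark_pipeline_framework.pipelines.framework_pipeline import FrameworkPipeline
from spark_pipeline_framework.transformers.framework_sql_transformer import (
    FrameworkSqlTransformer,
)


class MyUnValidatedPipeline(FrameworkPipeline):
    def __init__(
        self, parameters: Dict[str, Any], progress_logger: ProgressLogger
    ) -> None:
        super().__init__(parameters=parameters, progress_logger=progress_logger)
        self.transformers = self.create_steps(
            [
                # Load the CSV given by the "flights_path" parameter
                FrameworkCsvLoader(
                    view="flights", filepath=parameters["flights_path"]
                ),
                # Copy the loaded view into the "flights2" view the test reads
                FrameworkSqlTransformer(sql="SELECT * FROM flights", view="flights2"),
            ]
        )
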
def test_fail_fast_validated_framework_pipeline_writes_results(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"
    output_path: str = f"file://{data_dir.joinpath('temp').joinpath('validation.csv')}"
    if path.isdir(data_dir.joinpath("temp")):
        shutil.rmtree(data_dir.joinpath("temp"))

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )
    spark_session.sql("DROP TABLE IF EXISTS default.flights")

    # Act
    parameters = {
        "flights_path": flights_path,
        "validation_source_path": str(data_dir),
        "validation_output_path": output_path,
    }
    try:
        with ProgressLogger() as progress_logger:
            pipeline: MyFailFastValidatedPipeline = MyFailFastValidatedPipeline(
                parameters=parameters, progress_logger=progress_logger
            )
            transformer = pipeline.fit(df)
            transformer.transform(df)
    except AssertionError:
        # Assert: even though validation failed fast, its results should
        # still have been written to the output path
        validation_df = df.sql_ctx.read.csv(output_path, header=True)
        validation_df.show(truncate=False)
        assert validation_df.count() == 1

def test_automapper_filter_and_transform(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(
            age=A.transform(
                A.filter(
                    column=A.column("identifier"),
                    func=lambda x: x["use"] == lit("usual"),
                ),
                A.complex(bar=A.field("value"), bar2=A.field("system")),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")),
        ).alias("age")
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)

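# NOTE: MyObject is a user-defined complex type passed to .complex() above; its
# definition is not included in this file. A plausible sketch, assuming it
# extends spark_auto_mapper's AutoMapperDataTypeComplexBase and simply forwards
# its properties to the base class (import paths may differ across versions):
from spark_auto_mapper.data_types.complex.complex_base import (
    AutoMapperDataTypeComplexBase,
)
from spark_auto_mapper.data_types.data_type_base import AutoMapperDataTypeBase


class MyObject(AutoMapperDataTypeComplexBase):
    def __init__(self, age: AutoMapperDataTypeBase) -> None:
        # Each keyword becomes a struct field in the mapped output
        super().__init__(age=age)
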
def test_can_run_validated_framework_pipeline(
    spark_session: SparkSession,
) -> None:
    with pytest.raises(AssertionError):
        # Arrange
        clean_spark_session(spark_session)
        data_dir: Path = Path(__file__).parent.joinpath("./")
        flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"
        output_path: str = (
            f"file://{data_dir.joinpath('temp').joinpath('validation.csv')}"
        )
        if path.isdir(data_dir.joinpath("temp")):
            shutil.rmtree(data_dir.joinpath("temp"))

        schema = StructType([])
        df: DataFrame = spark_session.createDataFrame(
            spark_session.sparkContext.emptyRDD(), schema
        )
        spark_session.sql("DROP TABLE IF EXISTS default.flights")

        # Act
        parameters = {
            "flights_path": flights_path,
            "validation_source_path": str(data_dir),
            "validation_output_path": output_path,
        }
        with ProgressLogger() as progress_logger:
            pipeline: MyValidatedPipeline = MyValidatedPipeline(
                parameters=parameters, progress_logger=progress_logger
            )
            transformer = pipeline.fit(df)
            transformer.transform(df)

def test_auto_mapper_handles_duplicates(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Qureshi", "Imran"),
            (3, "Qureshi", "Imran2"),
            (4, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(dst1="src1", dst2=A.column("last_name"), dst3=A.column("first_name"))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 3

def test_validation_recurses_query_dir(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    query_dir: Path = Path(__file__).parent.joinpath("./queries")
    more_queries_dir: str = "more_queries"
    data_dir: Path = Path(__file__).parent.joinpath("./data")
    test_data_file: str = f"{data_dir.joinpath('test.csv')}"
    validation_query_file: str = "validate.sql"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    FrameworkCsvLoader(view="my_view", filepath=test_data_file).transform(df)

    FrameworkValidationTransformer(
        validation_source_path=str(query_dir),
        validation_queries=[validation_query_file, more_queries_dir],
    ).transform(df)

    df_validation = df.sql_ctx.table("pipeline_validation")
    df_validation.show(truncate=False)
    assert 3 == df_validation.count(), "Expected 3 total rows in pipeline_validation"
    assert (
        1 == df_validation.filter("is_failed == 1").count()
    ), "Expected one failing row in the validation table"

def test_correctly_loads_json_with_clean_flag_off(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('column_name_test.json')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkJsonLoader(
        view="books", filepath=test_file_path, clean_column_names=False
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM books")

    # Assert
    assert result.count() == 2
    assert result.collect()[1]["title"] == "Essentials of Programming Languages"
    assert len(result.collect()[1]["authors"]) == 2
    assert result.collect()[1]["authors"][0]["surname"] == "Friedman"
    assert (
        result.collect()[1]["Ugly column,with;chars{that}parquet(does)not like=much_-"]
        == 3
    )

def test_can_load_simple_json(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.json')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkJsonLoader(view="books", filepath=test_file_path).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM books")
    result.show()

    # Assert
    assert result.count() == 2
    assert result.collect()[1]["title"] == "Essentials of Programming Languages"
    assert len(result.collect()[1]["authors"]) == 2
    assert result.collect()[1]["authors"][0]["surname"] == "Friedman"
    assert result.collect()[1]["edition"] == 3

def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            ("Qureshi", "Imran"),
            ("Vidal", "Michael"),
        ],
        ["last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]),
    )

    company_name: str = "Microsoft"
    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]
            ),
            dst5=AutoMapperList(
                [A.complex(use="usual", first=A.column("first_name"))]
            ),
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    qureshi = result_df.where("dst4[0].family == 'Qureshi'")
    assert qureshi.select("dst1").collect()[0][0] == "src1"
    assert qureshi.select("dst2").collect()[0][0][0] == "address1"
    assert qureshi.select("dst3").collect()[0][0][0] == "address1"
    assert qureshi.select("dst3").collect()[0][0][1] == "address2"
    assert qureshi.select("dst4").collect()[0][0][0][0] == "usual"
    assert qureshi.select("dst4").collect()[0][0][0][1] == "Qureshi"
    assert qureshi.select("dst5").collect()[0][0][0][1] == "Imran"

def test_automapper_if_not_null_or_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", ""),
            (3, "Vidal3", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.if_not_null_or_empty(
            A.column("my_age"), A.column("my_age"), A.text("100")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull() | col("b.my_age").eqNullSafe(""),
            lit("100").cast(StringType()),
        )
        .otherwise(col("b.my_age"))
        .alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == "100"
    assert result_df.where("member_id == 3").select("age").collect()[0][0] == "100"
    assert dict(result_df.dtypes)["age"] == "string"

def test_can_load_fixed_width(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.txt')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkFixedWidthLoader(
        view="my_view",
        filepath=test_file_path,
        columns=[
            ColumnSpec(column_name="id", start_pos=1, length=3, data_type=StringType()),
            ColumnSpec(
                column_name="some_date", start_pos=4, length=8, data_type=StringType()
            ),
            ColumnSpec(
                column_name="some_string",
                start_pos=12,
                length=3,
                data_type=StringType(),
            ),
            ColumnSpec(
                column_name="some_integer",
                start_pos=15,
                length=4,
                data_type=IntegerType(),
            ),
        ],
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()

    # Assert
    assert result.count() == 2
    assert result.collect()[0][0] == "001"
    assert result.collect()[1][0] == "002"
    assert result.collect()[0][1] == "01292017"
    assert result.collect()[1][1] == "01302017"
    assert result.collect()[0][2] == "you"
    assert result.collect()[1][2] == "me"
    assert result.collect()[0][3] == 1234
    assert result.collect()[1][3] == 5678

def test_automapper_null_if_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", ""),
            (3, "Vidal3", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.column("my_age").to_null_if_empty())

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        when(col("b.my_age").eqNullSafe(""), lit(None))
        .otherwise(col("b.my_age"))
        .alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select("age").collect()[0][0] is None
    assert result_df.where("member_id == 3").select("age").collect()[0][0] is None
    assert dict(result_df.dtypes)["age"] == "string"

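# NOTE: assert_compare_expressions comes from a shared test utility that is not
# part of this file. A minimal sketch, assuming it compares the string forms of
# two Spark Column expressions (Column objects have no structural equality):
def assert_compare_expressions(source: Column, other: Column) -> None:
    # Comparing str(Column) compares the underlying expression trees as text
    assert str(source) == str(other), f"{source} != {other}"
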
def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession,
) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    environ["LOGLEVEL"] = "DEBUG"
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"], lambda x: x["reference"] == lit("bar").cast("string")
            ),
        ).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert result_df.select("age").collect()[1][0][0]["array2"][0]["reference"] == "bar"

def test_automapper_concat_array(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", drop_key_columns=False
    ).columns(age=A.column("identifier").concat(A.text("foo").to_array()))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        concat(col("b.identifier"), array(lit("foo").cast("string"))).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)

    assert result_df.where("id == 1730325416").select("age").collect()[0][0] == [
        "bar",
        "foo",
    ]
    assert result_df.where("id == 1467734301").select("age").collect()[0][0] == [
        "John",
        "foo",
    ]

def test_web_crawler(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )
    response: Dict[str, Any] = {}

    # Act
    response = FrameworkWebCrawler(
        spider_class=SpiderTestClass, name="test_crawler"
    ).transform(df, response)

    # Assert
    print(response)
    assert response

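# NOTE: SpiderTestClass is supplied by the test suite and not defined above. A
# hypothetical sketch, assuming FrameworkWebCrawler accepts an ordinary Scrapy
# spider class; the spider name and URL below are illustrative only.
import scrapy


class SpiderTestClass(scrapy.Spider):
    name = "test_spider"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # Yield the page title so the crawler's response dict is non-empty
        yield {"title": response.css("title::text").get()}
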
def test_auto_mapper_schema_pruning_with_defined_class(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
    ).complex(MyClass(name=A.column("last_name"), age=A.number(A.column("my_age"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    assert_compare_expressions(
        sql_expressions["name"], col("b.last_name").cast("string").alias("name")
    )
    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("long").alias("age")
    )

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("name").collect()[0][0] == "Qureshi"
    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")

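# NOTE: MyClass is the "defined class" this test's name refers to; it is not
# defined in this file. A sketch under the assumption that it extends
# AutoMapperDataTypeComplexBase and declares an explicit schema, which is what
# drives the cast("string") / cast("long") expressions asserted above. The
# get_schema signature and import paths vary across spark_auto_mapper versions.
from typing import Optional

from pyspark.sql.types import LongType
from spark_auto_mapper.data_types.number import AutoMapperNumberDataType
from spark_auto_mapper.data_types.text_like_base import AutoMapperTextLikeBase


class MyClass(AutoMapperDataTypeComplexBase):
    def __init__(
        self, name: AutoMapperTextLikeBase, age: AutoMapperNumberDataType
    ) -> None:
        super().__init__(name=name, age=age)

    def get_schema(self, include_extension: bool) -> Optional[StructType]:
        # An explicit schema lets AutoMapper cast columns and prune fields
        # that are not declared here
        return StructType(
            [
                StructField("name", StringType(), False),
                StructField("age", LongType(), True),
            ]
        )
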
def test_automapper_select_one(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"

def test_automapper_null_remover(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line")
                    .select(A.current().sanitize())
                    .remove_null_or_empty(),
                )
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(
    #     filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age")
    # )

    result_df: DataFrame = mapper.transform(df=source_df)
    print(result_df.select("address").collect()[0][0])

    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]

    result_df.show(truncate=False)

def test_can_load_xml_file(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.xml')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkXmlLoader(
        view="my_view", filepath=test_file_path, row_tag="book"
    ).transform(df)

    # Assert
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()
    assert result.count() == 12
    assert len(result.columns) == 7

def test_can_load_multiline_csv(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('multiline_row.csv')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkCsvLoader(
        view="my_view", filepath=test_file_path, delimiter=",", multiline=True
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")

    # Assert
    assert 1 == result.count()

def test_can_load_simple_json_with_schema(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('schema_test.json')}"
    test_file_path_2: str = f"{data_dir.joinpath('schema_test_2.json')}"

    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkJsonLoader(view="books", filepath=test_file_path).transform(df)
    FrameworkJsonLoader(
        view="books_schema", filepath=test_file_path_2, use_schema_from_view="books"
    ).transform(df)

    result: DataFrame = spark_session.sql("SELECT * FROM books")
    result_2: DataFrame = spark_session.sql("SELECT * FROM books_schema")

    # Assert
    assert result.schema == result_2.schema

def test_file_downloader(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    schema = StructType([])
    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )
    response: Dict[str, Any] = {}
    download_url: str = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip"
    download_to_path: str = f"file://{os.path.join(Path(__file__).parent, 'data')}"

    # Act
    response = FrameworkFileDownloader(
        download_urls=[download_url],
        download_to_path=download_to_path,
        extract_zips=True,
    ).transform(df, response)

    # Assert
    assert response