def test_framework_fill_na_transformer(spark_session: SparkSession) -> None:
    # create a dataframe with the test data
    data_dir: Path = Path(__file__).parent.joinpath("./")
    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)

    # ensure we have all the rows, including the ones with missing values
    result_df: DataFrame = spark_session.table(view)
    result_df = result_df.withColumn(
        "Minimum Age", result_df["Minimum Age"].cast("float")
    )
    result_df.createOrReplaceTempView(view)
    assert 7 == result_df.count()

    # fill the missing values: a numeric default for Minimum Age and a text default for Maximum Age
    FrameworkFillNaTransformer(
        view=view,
        column_mapping={"Minimum Age": 1.0, "Maximum Age": "No Limit"},
    ).transform(df)

    # assert no rows were dropped and the missing values were filled
    result_df = spark_session.table(view)
    assert 7 == result_df.count()
    assert "No Limit" == result_df.select("Maximum Age").collect()[1]["Maximum Age"]
    assert 24.0 == result_df.agg({"Minimum Age": "sum"}).collect()[0][0]
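# A minimal sketch, not part of the original test, of the equivalent plain-PySpark per-column
# fill. DataFrame.fillna accepts a dict mapping column names to replacement values and only
# touches columns whose type matches the value. Whether FrameworkFillNaTransformer wraps
# fillna internally is an assumption; the helper name is hypothetical, and the view and column
# names are simply the ones used in the test above.
def _fill_na_with_plain_pyspark(spark_session: SparkSession) -> DataFrame:
    df = spark_session.table("primary_care_protocol")
    # fill a numeric default for Minimum Age and a text default for Maximum Age
    return df.fillna({"Minimum Age": 1.0, "Maximum Age": "No Limit"})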
def test_json_splitter(spark_session: SparkSession) -> None:
    # Arrange
    data_dir: Path = Path(__file__).parent.joinpath("./")
    temp_folder = data_dir.joinpath("./temp")
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    makedirs(temp_folder)

    df: DataFrame = create_empty_dataframe(spark_session=spark_session)
    insurance_feed_path: Path = data_dir.joinpath("large_json_file.json")

    # Act
    with ProgressLogger() as progress_logger:
        FrameworkJsonSplitter(
            file_path=insurance_feed_path,
            output_folder=temp_folder,
            max_size_per_file_in_mb=0.1,
            progress_logger=progress_logger,
        ).transform(df)

    # Assert
    files: List[str] = glob(str(temp_folder.joinpath("*.json")))
    assert len(files) == 14

    # read one of the output files and make sure it is valid json
    sample_file: Path = temp_folder.joinpath("large_json_file_6.json")
    with open(sample_file, "r") as file:
        obj = json.loads(file.read())
    assert obj[0]["name"] == "Sellers Mcguire"
    print(obj)
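# A small follow-on sketch, not part of the original test, showing how every file produced by
# the splitter could be checked for valid JSON rather than just one. The helper name is
# hypothetical; it uses only the stdlib glob/json calls already used above, and assumes each
# output file holds a JSON array of records, as implied by obj[0]["name"] in the test.
def _assert_all_output_files_are_valid_json(temp_folder: Path) -> None:
    for file_name in glob(str(temp_folder.joinpath("*.json"))):
        with open(file_name, "r") as file:
            # json.loads raises an error if the file is not valid JSON
            parsed = json.loads(file.read())
        assert isinstance(parsed, list)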
def test_framework_drop_duplicates_transformer(
    spark_session: SparkSession,
) -> None:
    # create a dataframe with the test data
    data_dir: Path = Path(__file__).parent.joinpath("./")
    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)

    # ensure we have all the rows, including the duplicates we want to drop
    result_df: DataFrame = spark_session.table(view)
    assert 3 == result_df.count()

    # drop the rows that have a duplicate NPI
    FrameworkDropDuplicatesTransformer(columns=["NPI"], view=view).transform(df)

    # assert we get only one row per NPI
    result_df = spark_session.table(view)
    assert 2 == result_df.count()
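# A minimal sketch, not the transformer's actual implementation, of the equivalent plain-PySpark
# de-duplication: DataFrame.dropDuplicates(["NPI"]) keeps one arbitrary row per NPI value.
# The helper name is hypothetical and the view name is the one used in the test above.
def _drop_duplicate_npis_with_plain_pyspark(spark_session: SparkSession) -> DataFrame:
    df = spark_session.table("primary_care_protocol")
    return df.dropDuplicates(["NPI"])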
def test_framework_drop_rows_with_null_transformer(
    spark_session: SparkSession,
) -> None:
    # create a dataframe with the test data
    data_dir: Path = Path(__file__).parent.joinpath("./")
    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    view: str = "primary_care_protocol"
    FrameworkCsvLoader(
        view=view,
        filepath=data_dir.joinpath("primary_care_protocol.csv"),
        clean_column_names=False,
    ).transform(df)

    # ensure we have all the rows, even the ones we want to drop
    result_df: DataFrame = spark_session.table(view)
    assert 7 == result_df.count()

    # drop the rows with null NPI or null Last Name
    FrameworkDropRowsWithNullTransformer(
        columns_to_check=["NPI", "Last Name"], view=view
    ).transform(df)

    # assert we get only the rows with a populated NPI and Last Name
    result_df = spark_session.table(view)
    assert 1 == result_df.count()

    # ensure that no rows are dropped when there are no null values left
    FrameworkDropRowsWithNullTransformer(
        columns_to_check=["NPI", "Last Name"], view=view
    ).transform(result_df)
    assert 1 == result_df.count()
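# A minimal sketch, not the transformer's actual implementation, of the equivalent plain-PySpark
# null filtering: DataFrame.dropna(subset=[...]) removes rows where any of the listed columns is
# null. The helper name is hypothetical; the view and column names are the ones used above.
def _drop_rows_with_null_with_plain_pyspark(spark_session: SparkSession) -> DataFrame:
    df = spark_session.table("primary_care_protocol")
    return df.dropna(subset=["NPI", "Last Name"])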
def test_can_run_python_transformer(spark_session: SparkSession) -> None:
    # Arrange

    # Act
    with ProgressLogger() as progress_logger:
        result: Transformer = FeaturesCarriersPythonV1(
            parameters={"foo": "bar"}, progress_logger=progress_logger
        ).transformers[0]

    # Assert
    assert isinstance(result, FeatureCarrierPythonTransformer)

    # make sure we can call transform on it
    df = create_empty_dataframe(spark_session=spark_session)
    result.transform(df)
def test_can_find_python_transformer(spark_session: SparkSession) -> None:
    # Arrange
    data_dir: Path = Path(__file__).parent.joinpath("./")
    # load_all_modules_from_dir(str(data_dir.joinpath("../../library/features/carriers_python/v1")))

    # Act
    with ProgressLogger() as progress_logger:
        result: Transformer = get_python_transformer_from_location(
            location=str(data_dir.joinpath("library/features/carriers_python/v1")),
            import_module_name=".calculate",
            parameters={"foo": "bar"},
            progress_logger=progress_logger,
        )

    # Assert
    assert isinstance(result, FeatureCarrierPythonTransformer)

    # make sure we can call transform on it
    df = create_empty_dataframe(spark_session=spark_session)
    result.transform(df)
from os import path, makedirs
from pathlib import Path
from shutil import rmtree

from pyspark.sql import SparkSession, DataFrame

from spark_pipeline_framework.progress_logger.progress_logger import ProgressLogger
from spark_pipeline_framework.utilities.spark_data_frame_helpers import (
    create_empty_dataframe,
)


def ${TestName}(spark_session: SparkSession) -> None:
    # Arrange
    data_dir: Path = Path(__file__).parent.joinpath('./')
    temp_folder = data_dir.joinpath('./temp')
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    makedirs(temp_folder)

    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    # Act
    with ProgressLogger() as progress_logger:
        ${Transformer}(progress_logger=progress_logger).transform(df)

    # Assert
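# A hypothetical filled-in copy of the template above, for illustration only: every name here is
# made up and is not part of the library. ${Transformer} is replaced by a trivial no-op
# pyspark.ml Transformer and ${TestName} by a concrete test function name.
from typing import Optional

from pyspark.ml import Transformer


class NoOpTransformer(Transformer):
    # stand-in for ${Transformer}: returns the dataframe unchanged
    def __init__(self, progress_logger: Optional[ProgressLogger] = None) -> None:
        super().__init__()
        self.progress_logger = progress_logger

    def _transform(self, df: DataFrame) -> DataFrame:
        return df


def test_no_op_transformer(spark_session: SparkSession) -> None:
    # Arrange
    df: DataFrame = create_empty_dataframe(spark_session=spark_session)

    # Act
    with ProgressLogger() as progress_logger:
        result_df: DataFrame = NoOpTransformer(
            progress_logger=progress_logger
        ).transform(df)

    # Assert: the no-op transformer leaves the (empty) dataframe unchanged
    assert result_df.count() == df.count()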