def test_should_maintain_all_data_it_reads() -> None:
    given_ingest_folder, given_transform_folder = __create_ingest_and_transform_folders()
    given_dataframe = SPARK.read.parquet(given_ingest_folder)
    distance_transformer.run(SPARK, given_ingest_folder, given_transform_folder)

    actual_dataframe = SPARK.read.parquet(given_transform_folder)
    actual_columns = set(actual_dataframe.columns)
    actual_schema = set(actual_dataframe.schema)
    expected_columns = set(given_dataframe.columns)
    expected_schema = set(given_dataframe.schema)

    # The transformer adds a 'distance' column, so the input columns and
    # schema must be a subset of the output, not equal to it.
    assert expected_columns.issubset(actual_columns)
    assert expected_schema.issubset(actual_schema)
def test_should_add_distance_column_with_calculated_distance() -> None:
    given_ingest_folder, given_transform_folder = __create_ingest_and_transform_folders()
    distance_transformer.run(SPARK, given_ingest_folder, given_transform_folder)

    actual_dataframe = SPARK.read.parquet(given_transform_folder)
    expected_dataframe = SPARK.createDataFrame(
        [
            SAMPLE_DATA[0] + [1.07],
            SAMPLE_DATA[1] + [0.92],
            SAMPLE_DATA[2] + [1.99],
        ],
        BASE_COLUMNS + ['distance'],
    )

    expected_distance_schema = StructField('distance', DoubleType(), nullable=True)
    actual_distance_schema = actual_dataframe.schema['distance']

    assert expected_distance_schema == actual_distance_schema
    assert expected_dataframe.collect() == actual_dataframe.collect()
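# The tests above depend on module-level fixtures (SPARK, BASE_COLUMNS,
# SAMPLE_DATA, __create_ingest_and_transform_folders) that are not shown in
# this excerpt. Below is a minimal sketch of what they could look like at the
# top of the test module; the column names and sample rows are assumptions
# for illustration, not the project's real fixture data.
import tempfile
import uuid

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, DoubleType

from data_transformations.citibike import distance_transformer

SPARK = SparkSession.builder.appName("test_distance_transformer").getOrCreate()

BASE_COLUMNS = [
    # Hypothetical citibike trip columns; the real fixture may differ.
    'tripduration',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude',
]

SAMPLE_DATA = [
    # Three hypothetical trips; for the second test to pass, the expected
    # distances (1.07, 0.92, 1.99) must match what the transformer computes
    # for these rows.
    [328, 40.7115786, -74.0111301, 40.7074066, -74.0089312],
    [1496, 40.7177325, -74.0431050, 40.7221715, -74.0445400],
    [1067, 40.7341745, -73.9908059, 40.7433971, -74.0060901],
]


def __create_ingest_and_transform_folders() -> tuple:
    # Write SAMPLE_DATA to a temporary parquet "ingest" folder and hand back
    # a fresh, not-yet-existing path for the transformer's output.
    base_path = tempfile.mkdtemp()
    ingest_folder = f"{base_path}/ingest"
    transform_folder = f"{base_path}/transform_{uuid.uuid4().hex}"
    SPARK.createDataFrame(SAMPLE_DATA, BASE_COLUMNS).write.parquet(ingest_folder)
    return ingest_folder, transform_folder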
import logging
import sys

from pyspark.sql import SparkSession

from data_transformations.citibike import distance_transformer

LOG_FILENAME = 'project.log'
APP_NAME = "Citibike Pipeline: Distance Calculation"

if __name__ == '__main__':
    logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO)

    arguments = sys.argv
    # Compare with != rather than `is not`: identity checks on ints are
    # unreliable and the original comparison was an anti-pattern.
    if len(arguments) != 3:
        logging.warning("Dataset file path and output path not specified!")
        sys.exit(1)

    # With exactly 3 argv entries, the positional arguments are at indices
    # 1 and 2 (index 0 is the script name); the original [2]/[3] indexing
    # would raise an IndexError.
    dataset_path = arguments[1]
    output_path = arguments[2]

    spark = SparkSession.builder.appName(APP_NAME).getOrCreate()
    logging.info("Application Initialized: %s", spark.sparkContext.appName)
    distance_transformer.run(spark, dataset_path, output_path)
    logging.info("Application Done: %s", spark.sparkContext.appName)

    spark.stop()
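# distance_transformer.run itself is not shown in this excerpt. Below is a
# minimal sketch of one possible implementation, assuming the ingest parquet
# carries start/end station coordinates under the hypothetical column names
# used here; the real transformer may derive the distance differently.
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F


def run(spark: SparkSession, ingest_path: str, transform_path: str) -> None:
    dataframe = spark.read.parquet(ingest_path)
    transformed = _add_distance_column(dataframe)
    transformed.write.parquet(transform_path)


def _add_distance_column(dataframe: DataFrame) -> DataFrame:
    # Great-circle (haversine) distance in miles between the start and end
    # stations, rounded to two decimals to match values like 1.07. Rounding
    # keeps the column a nullable DoubleType, as the schema test expects.
    earth_radius_miles = 3959.0
    lat1 = F.radians(F.col('start_station_latitude'))
    lon1 = F.radians(F.col('start_station_longitude'))
    lat2 = F.radians(F.col('end_station_latitude'))
    lon2 = F.radians(F.col('end_station_longitude'))
    a = (
        F.pow(F.sin((lat2 - lat1) / 2), 2)
        + F.cos(lat1) * F.cos(lat2) * F.pow(F.sin((lon2 - lon1) / 2), 2)
    )
    distance = 2 * earth_radius_miles * F.asin(F.sqrt(a))
    return dataframe.withColumn('distance', F.round(distance, 2))

# With a transformer like this in place, the job script above can be submitted
# via spark-submit, passing the dataset path and output path as its two
# positional arguments.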