def test_save_and_load_on_nested_list_params(self):
    temp_path = tempfile.mkdtemp()
    splitsArray = [
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ]
    bucketizer = Bucketizer(
        splitsArray=splitsArray,
        inputCols=["values", "values"],
        outputCols=["b1", "b2"])
    savePath = temp_path + "/bk"
    bucketizer.write().overwrite().save(savePath)
    loadedBucketizer = Bucketizer.load(savePath)
    assert loadedBucketizer.getSplitsArray() == splitsArray
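The test above only checks that the nested splitsArray parameter survives a save/load round trip. A minimal usage sketch, separate from the test, of what such a multi-column Bucketizer actually does (column and app names here are illustrative):

# Sketch: each inner list of splitsArray defines the bucket boundaries for one
# input/output column pair; here both outputs bucketize the same "values" column.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.appName("bucketizer_sketch").getOrCreate()

splitsArray = [
    [-float("inf"), 0.5, 1.4, float("inf")],
    [-float("inf"), 0.1, 1.2, float("inf")],
]
bucketizer = Bucketizer(
    splitsArray=splitsArray, inputCols=["values", "values"], outputCols=["b1", "b2"])

df = spark.createDataFrame([(0.1,), (0.6,), (1.5,)], ["values"])
# b1 is assigned from the first splits list, b2 from the second
bucketizer.transform(df).show()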
def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as ISO strings to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date-based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                     "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date-based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(
            FlightDate=iso8601.parse_date(x['FlightDate']),
            Origin=x['Origin'],
            Distance=x['Distance'],
            DayOfMonth=x['DayOfMonth'],
            DayOfYear=x['DayOfYear'],
            UUID=x['UUID'],
            DepDelay=x['DepDelay'],
            DayOfWeek=x['DayOfWeek'],
            FlightNum=x['FlightNum'],
            Dest=x['Dest'],
            Timestamp=iso8601.parse_date(x['Timestamp']),
            Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                       "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                         "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
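The streaming jobs above consume JSON prediction requests from the flight_delay_classification_request topic. A hypothetical producer sketch, not part of the original scripts and assuming the kafka-python package, that publishes one request in the shape the stream expects (field values are illustrative):

import datetime
import json
import uuid

from kafka import KafkaProducer

# Connect to the same broker the streaming job reads from
producer = KafkaProducer(bootstrap_servers="localhost:9092")

request = {
    "Carrier": "DL", "DayOfMonth": 25, "DayOfWeek": 4, "DayOfYear": 359,
    "DepDelay": 10.0, "Dest": "LAX", "Distance": 2475.0,
    "FlightDate": "2015-12-25", "FlightNum": "200", "Origin": "JFK",
    "Timestamp": datetime.datetime.utcnow().isoformat(),
    "UUID": str(uuid.uuid4()),
}

# Messages are sent as UTF-8 encoded JSON bytes
producer.send("flight_delay_classification_request", json.dumps(request).encode("utf-8"))
producer.flush()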
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process Prediction Requests in Streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(
            FlightDate=iso8601.parse_date(x['FlightDate']),
            Origin=x['Origin'],
            Distance=x['Distance'],
            DayOfMonth=x['DayOfMonth'],
            DayOfYear=x['DayOfYear'],
            UUID=x['UUID'],
            DepDelay=x['DepDelay'],
            DayOfWeek=x['DayOfWeek'],
            FlightNum=x['FlightNum'],
            Dest=x['Dest'],
            Timestamp=iso8601.parse_date(x['Timestamp']),
            Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])

        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
def main(base_path):
    spark = SparkSession.builder.config("spark.default.parallelism", 1) \
        .appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Messages look like:
    #
    # {
    #   "Carrier": "DL",
    #   "DayOfMonth": 25,
    #   "DayOfWeek": 4,
    #   "DayOfYear": 359,
    #   "DepDelay": 10.0,
    #   "Dest": "LAX",
    #   "Distance": 2475.0,
    #   "FlightDate": "2015-12-25",
    #   "FlightNum": null,
    #   "Origin": "JFK",
    #   "Timestamp": "2019-10-31T00:19:47.633280",
    #   "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385"
    # }
    #

    #
    # Process Prediction Requests from Kafka
    #
    message_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", BROKERS) \
        .option("subscribe", PREDICTION_TOPIC) \
        .load()

    # Parse the JSON message value into typed columns
    schema = T.StructType([
        T.StructField("Carrier", T.StringType()),
        T.StructField("DayOfMonth", T.IntegerType()),
        T.StructField("DayOfWeek", T.IntegerType()),
        T.StructField("DayOfYear", T.IntegerType()),
        T.StructField("DepDelay", T.FloatType()),
        T.StructField("Dest", T.StringType()),
        T.StructField("Distance", T.FloatType()),
        T.StructField("FlightDate", T.StringType()),
        T.StructField("FlightNum", T.StringType()),
        T.StructField("Origin", T.StringType()),
        T.StructField("Timestamp", T.TimestampType()),
        T.StructField("UUID", T.StringType()),
    ])

    prediction_requests_df = message_df.select(
        F.from_json(F.col("value").cast("string"), schema).alias("data")) \
        .select("data.*")

    #
    # Add a Route variable to replace FlightNum
    #
    prediction_requests_with_route = prediction_requests_df.withColumn(
        'Route',
        F.concat(prediction_requests_df.Origin, F.lit('-'), prediction_requests_df.Dest))

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay, Distance and the index columns
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the individual index columns
    index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Store the results to MongoDB
    class MongoWriter:

        def open(self, partition_id, epoch_id):
            print(f"Opened partition id: {partition_id}, epoch: {epoch_id}")

            self.mongo_client = pymongo.MongoClient()
            print(f"Opened MongoClient: {self.mongo_client}")

            return True

        def process(self, row):
            print(f"Processing row: {row}")

            as_dict = row.asDict()
            print(f"Inserting row.asDict(): {as_dict}")

            id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one(as_dict)
            print(f"Inserted row, got ID: {id.inserted_id}")

            self.mongo_client.close()

            return True

        def close(self, error):
            print("Closed with error: %s" % str(error))

            return True

    query = final_predictions.writeStream.foreach(MongoWriter()).start()

    query.awaitTermination()
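The MongoWriter above inserts one document per prediction into the flight_delay_classification_response collection. A quick verification sketch, assuming pymongo and a MongoDB instance on localhost, for inspecting what was written:

import pymongo

# Connect to the same database the foreach writer inserts into
client = pymongo.MongoClient("mongodb://localhost:27017")
response = client.agile_data_science.flight_delay_classification_response.find_one()
print(response)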
def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load the departure delay bucketizer
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(base_path)
    departure_bucketizer = Bucketizer.load(departure_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml import PipelineModel
    string_vectorizer_pipeline_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "FlightNum", "DepDelayBucket"]:
        string_pipeline_model_path = "{}/models/string_indexer_pipeline_model_{}.bin".format(
            base_path, column)
        string_pipeline_model = PipelineModel.load(string_pipeline_model_path)
        string_vectorizer_pipeline_models[column] = string_pipeline_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the final assembler
    final_assembler_path = "{}/models/final_vector_assembler.bin".format(base_path)
    final_assembler = VectorAssembler.load(final_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as ISO strings to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date-based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    # Bucketize the departure and arrival delays for classification
    ml_bucketized_features = departure_bucketizer.transform(prediction_requests)

    # Check the buckets
    ml_bucketized_features.select("DepDelay", "DepDelayBucket").show()

    # Vectorize string fields with the corresponding pipeline for that column
    # Turn category fields into categoric feature vectors, then drop intermediate fields
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "FlightNum", "DepDelayBucket"]:
        string_pipeline_path = "{}/models/string_indexer_pipeline_{}.bin".format(
            base_path, column)
        string_pipeline_model = string_vectorizer_pipeline_models[column]
        ml_bucketized_features = string_pipeline_model.transform(ml_bucketized_features)
        ml_bucketized_features = ml_bucketized_features.drop(column + "_index")

    # Vectorize numeric columns
    ml_bucketized_features = vector_assembler.transform(ml_bucketized_features)

    # Drop the original numeric columns
    numeric_columns = ["DepDelay", "Distance"]

    # Combine various features into one feature vector, 'features'
    final_vectorized_features = final_assembler.transform(ml_bucketized_features)
    final_vectorized_features.show()

    # Drop the individual vector columns
    feature_columns = ["Carrier_vec", "DayOfMonth_vec", "DayOfWeek_vec",
                       "DayOfYear_vec", "Origin_vec", "Dest_vec", "FlightNum_vec",
                       "DepDelayBucket_vec", "NumericFeatures_vec"]
    for column in feature_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date-based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)