def main():
    # A SparkSession (not just the SparkContext) is needed so spark.read.json() works below
    spark = SparkSession.builder.appName('google-play-store-streamer').getOrCreate()
    sc = spark.sparkContext
    ssc = StreamingContext(sc, 3)

    # Load the trained random forest model
    model = RandomForestClassificationModel.load(MODEL_PATH)

    def parseStream(rdd):
        if not rdd.isEmpty():
            df = spark.read.json(rdd)
            df.show()

            # Vectorize the data: every column except the label becomes a feature
            feature_cols = df.columns
            feature_cols.remove('Installs indexed')
            assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol="features",
                                        handleInvalid="error")
            pipeline = Pipeline(stages=[assembler])
            outputModel = pipeline.fit(df)
            output = outputModel.transform(df)
            final_data = output.select("features", "Installs indexed")

            # Predict and evaluate
            predictions = model.transform(final_data)
            evaluator = MulticlassClassificationEvaluator(
                labelCol="Installs indexed",
                predictionCol="prediction",
                metricName="accuracy")
            accuracy = evaluator.evaluate(predictions)
            randomForestError = 1.0 - accuracy
            print("Random forest test error = %g" % randomForestError)

    stream_data = ssc.textFileStream('StreamData/')
    stream_data.foreachRDD(lambda rdd: parseStream(rdd))

    ssc.start()
    ssc.awaitTermination()
def sendRecord(df):
    from pyspark.ml.classification import RandomForestClassificationModel
    sameModel = RandomForestClassificationModel.load("randomForest.model")

    # Cast the raw columns to the numeric types the model expects
    df = df.withColumn("amount", df["amount"].cast(FloatType()))
    df = df.withColumn("newbalanceDest", df["newbalanceDest"].cast(FloatType()))
    df = df.withColumn("newbalanceOrig", df["newbalanceOrig"].cast(FloatType()))
    df = df.withColumn("oldbalanceDest", df["oldbalanceDest"].cast(FloatType()))
    df = df.withColumn("oldbalanceOrg", df["oldbalanceOrg"].cast(FloatType()))
    df = df.withColumn("isFlaggedFraud", df["isFlaggedFraud"].cast(IntegerType()))
    df = df.withColumn("step", df["step"].cast(IntegerType()))
    df = df.withColumn("Type", df["Type"].cast(IntegerType()))

    assembler = VectorAssembler(inputCols=[
        "Type", "amount", "newbalanceDest", "newbalanceOrig",
        "oldbalanceDest", "oldbalanceOrg", "step"
    ], outputCol="features")
    output = assembler.transform(df).select("features")

    predictions = sameModel.transform(output)
    pr = predictions.select("prediction").rdd
    if pr.collect() == [Row(prediction=1.0)]:
        print("FRAUD!!!!")
    else:
        print("Not Fraud")
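# A more compact variant of the casting above: loop over a column-to-type map instead of
# repeating withColumn. This is only a sketch assuming the same column names as
# sendRecord(); it is not part of the original script.
from pyspark.sql.types import FloatType, IntegerType

def cast_columns(df):
    types = {
        "amount": FloatType(), "newbalanceDest": FloatType(),
        "newbalanceOrig": FloatType(), "oldbalanceDest": FloatType(),
        "oldbalanceOrg": FloatType(), "isFlaggedFraud": IntegerType(),
        "step": IntegerType(), "Type": IntegerType(),
    }
    # Apply each cast in turn and return the converted DataFrame
    for name, dtype in types.items():
        df = df.withColumn(name, df[name].cast(dtype))
    return df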
def main(iso_date, base_path): APP_NAME = "make_predictions.py" # If there is no SparkSession, create the environment try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # Load each and every model in the pipeline # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string indexers into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Run the requests through the transformations from training # # Get today and tomorrow's dates as iso strings to scope query today_dt = iso8601.parse_date(iso_date) rounded_today = today_dt.date() iso_today = rounded_today.isoformat() # Build the day's input path: a date based primary key directory structure today_input_path = "{}/data/prediction_tasks_daily.json/{}".format( base_path, iso_today) from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), ]) prediction_requests = spark.read.json(today_input_path, schema=schema) prediction_requests.show() # # Add a Route variable to replace FlightNum # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests.withColumn( 'Route', concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest)) prediction_requests_with_route.show(6) # Index string fields with the corresponding indexer for that column for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # Vectorize numeric columns: DepDelay and Distance final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # Drop the indexes for the nominal fields index_columns = [ 
"Carrier_index", "DayOfMonth_index", "DayOfWeek_index", "DayOfYear_index", "Origin_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Build the day's output path: a date based primary key directory structure today_output_path = "{}/data/prediction_results_daily.json/{}".format( base_path, iso_today) # Save the output to its daily bucket final_predictions.repartition(1).write.mode("overwrite").json( today_output_path)
predictions = rForestModel.transform(pTestDF) # %% evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1") evaluator.evaluate(predictions) # %% lr = LogisticRegression(featuresCol='features', labelCol='class') lrModel = lr.fit(pTrainDF) predictionsLR = lrModel.transform(pTestDF) evaluator.evaluate(predictionsLR) # %% naiveBayes = NaiveBayes(featuresCol='features', labelCol='class') naiveModel = naiveBayes.fit(pTrainDF) predictionsNaive = naiveModel.transform(pTestDF) evaluator.evaluate(predictionsNaive) # %% pipelineModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V') rForestModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest') #%% pipelineModel = PipelineModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V') rForestModel = RandomForestClassificationModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest') # %%
def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column
        )
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path
    )
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today's and tomorrow's dates as ISO strings to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date-based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today
    )

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])

    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest)
    )
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route
        )

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                     "DayOfYear_index", "Origin_index", "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values").drop(
        "rawPrediction").drop("probability")

    # Inspect the output
    final_predictions.show()

    # Build the day's output path: a date-based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today
    )

    # Save the output to its daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)
def label_failure_modes(cls, site, did, rd_item, df, model_dir, sc): ''' : param site: site, e.g. 'fab15', 'fab10' : did: design id, e.g. 'Z32D' : rd_item: rd bin in string format, e.g. 'rdC' ''' start_time = time.time() #Convert to Pandas dataframe # df = df.toPandas() # if 'FBD_REGION' in df.columns: # df['FBD_REGION'] = df['FBD_REGION'].apply(lambda x : cls.label_zone(x)) labelled_failure_modes = [] df = df.withColumn("row_id", F.monotonically_increasing_id()) model_features_list, model_name_list, model_dir_list = cls.__read_model_name( site, did, rd_item, model_dir) print(model_dir_list) if len(model_name_list) > 0: for name, features, dirname in zip(model_name_list, model_features_list, model_dir_list): features_missing = [e for e in features if e not in df.columns] if len(features_missing) > 0: print('Features %s missing for model %s' % (','.join(features_missing), name)) else: print(dirname) print(features) try: model = RandomForestClassificationModel.load( str(dirname)) # model = LinearSVCModel.load(model_dir) assembler = VectorAssembler(inputCols=features, outputCol="features") # Set maxCategories so features with > 4 distinct values are treated as continuous. newData = assembler.transform(df) df_i = model.transform(newData) #df_i = cls.pred_rf_model_spark(dirname, feature, name, df) df_i = df_i.withColumnRenamed("prediction", name) df = df.join(df_i.select("row_id", name), ("row_id")) labelled_failure_modes.append(name) print('Labelling done for: ', name) except: print('Labelling failed for: ', name) if len(labelled_failure_modes) > 0: df = df.withColumn( 'total', sum(df[col] for col in labelled_failure_modes)) df_labelled = df.filter(df.total > 0) df_unlabelled = df.filter(df.total == 0) else: df_labelled = [] df_unlabelled = df else: df_labelled = [] df_unlabelled = df print('No models found for: %s, %s, %s' % (site, did, rd_item)) print('Labelling time = ', time.time() - start_time) start_time = time.time() if df_labelled != []: df_labelled = df_labelled.toPandas() else: df_labelled = pd.DataFrame() df_unlabelled = df_unlabelled.toPandas() print('Pandas df conversion time = ', time.time() - start_time) return df_labelled, df_unlabelled, labelled_failure_modes
print(wine.limit(20)) # In[ ]: from pyspark.ml.feature import VectorAssembler # select the columns to be used as the features (all except `quality`) featureColumns = [c for c in wine.columns if c != 'quality'] # create and configure the assembler assembler = VectorAssembler(inputCols=featureColumns, outputCol="features") # transform the original data dataDF = assembler.transform(wine) dataDF.printSchema() # calculate the average wine quality avgQuality = wine.groupBy().avg('quality').first()[0] print(avgQuality) from pyspark.ml.classification import RandomForestClassificationModel rfObjectFileLoaded = sc._jsc.objectFile( "hdfs://ec2-3-88-182-126.compute-1.amazonaws.com:9000/home/ubuntu/sparkfolder/spark-2.4.7-bin-hadoop2.7/output/rf.model" ) rfModelLoaded_JavaObject = rfObjectFileLoaded.first() rfModelLoaded = RandomForestClassificationModel(rfModelLoaded_JavaObject) loadedPredictionsDF = rfModelLoaded.transform(wine) # evaluate the model again to see if we get the same performance print("Loaded model RMSE = %g" % evaluator.evaluate(loadedPredictionsDF))
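# Note: the objectFile / Java-object wrapping above is only needed because the model was
# persisted as a serialized Java object. A model saved through the ML writer API
# (rfModel.save(path)) can be reloaded directly. Minimal sketch; the path below is a
# placeholder for such an ML-saved model, not the objectFile path used above.
from pyspark.ml.classification import RandomForestClassificationModel
reloadedModel = RandomForestClassificationModel.load("hdfs:///path/to/rf_ml_model")
reloadedPredictionsDF = reloadedModel.transform(dataDF)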
# Calculate and print the recall score for the decision tree algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
dtcWeightedRecall = evaluator.evaluate(dtcPredictions)
print("Decision Tree weightedRecall = %g" % (dtcWeightedRecall))

# Train a random forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rfm = rf.fit(trainingData)

# Save the trained random forest model to an S3 bucket for future use
rfm.save('s3://expedia-hotel-recommendations-workflow/rfm_model')

# Load the pre-trained random forest model to illustrate how the model will be imported for future use
rfModel = RandomForestClassificationModel.load("s3://expedia-hotel-recommendations-workflow/rfm_model")

# Make predictions with the random forest model
rfPredictions = rfModel.transform(testData)

# Calculate and print the accuracy score for the random forest algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rfAccuracy = evaluator.evaluate(rfPredictions)
print("Random Forest accuracy = %g" % (rfAccuracy))

# Calculate and print the F1 score for the random forest algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
rfF1 = evaluator.evaluate(rfPredictions)
print("Random Forest f1 = %g" % (rfF1))
spark = SparkSession.builder.master("local").appName( "wineClasssification").getOrCreate() ######################### Reading Dataset######################## testDf = spark.read.csv('TestDataset.csv', header='true', inferSchema='true', sep=';') #testDf = spark.read.csv('hdfs://ip-172-31-19-75.ec2.internal:8020/TestDataset.csv',header='true', inferSchema='true', sep=';') feature = [c for c in testDf.columns if c != 'quality'] assembler_test = VectorAssembler(inputCols=feature, outputCol="features") test_trans = assembler_test.transform(testDf) #test_trans.printSchema() ######################### Loading Model ############################ model = RandomForestClassificationModel.load("wine_train_model") ######################### Predicting ########################## predictions = model.transform(test_trans) ##Value inside show this is just for printing number of value #predictions.select("quality", "features").show(1000) ######################### Printing Accuracy ########################## eval = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="accuracy") accuracy = eval.evaluate(predictions) print("accuracy test Error = %g" % (1.0 - accuracy)) from pyspark.mllib.evaluation import MulticlassMetrics transformed_data = model.transform(test_trans)
def loadRFModel(df):
    assembler = VectorAssembler(inputCols=['cid', 'GPA'], outputCol='features')
    output = assembler.transform(df)
    model = RandomForestClassificationModel.load("rf_model")
    ret = model.transform(output).select('cid', 'prediction')
    return ret.head(7)
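# Minimal usage sketch for loadRFModel(); the SparkSession, the sample rows, and the
# presence of a saved model at "rf_model" are assumptions for illustration only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
students = spark.createDataFrame([(1, 3.4), (2, 2.9), (3, 3.8)], ["cid", "GPA"])
first_rows = loadRFModel(students)  # returns up to 7 Row(cid=..., prediction=...) objects
print(first_rows)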
#helper functions – helper.py file from pyspark.sql import functions as F import pickle from pyspark.ml import PipelineModel from pyspark.ml.classification import RandomForestClassificationModel from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType, DoubleType # read model objects saved from the training process path_to_read_objects = '/deploy' #pyspark objects char_labels = PipelineModel.load(path_to_read_objects + '/char_label_model.h5') assembleModel = PipelineModel.load(path_to_read_objects + '/assembleModel.h5') clf_model = RandomForestClassificationModel.load(path_to_read_objects + '/clf_model.h5') #python objects with open(path_to_read_objects + '/file.pkl', 'rb') as handle: features_list, char_vars, num_vars = pickle.load(handle) #make necessary transformations def rename_columns(df, char_vars): mapping = dict(zip([i + '_index' for i in char_vars], char_vars)) df = df.select([F.col(c).alias(mapping.get(c, c)) for c in df.columns]) return df # score the new data def score_new_df(scoredf): X = scoredf.select(features_list)
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(
            ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models used to make predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process prediction requests in streaming
    #
    stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], {
        "metadata.broker.list": BROKERS,
        "group.id": "0",
    })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']),
                      Origin=x['Origin'],
                      Distance=x['Distance'],
                      DayOfMonth=x['DayOfMonth'],
                      DayOfYear=x['DayOfYear'],
                      UUID=x['UUID'],
                      DepDelay=x['DepDelay'],
                      DayOfWeek=x['DayOfWeek'],
                      FlightNum=x['FlightNum'],
                      Dest=x['Dest'],
                      Timestamp=iso8601.parse_date(x['Timestamp']),
                      Carrier=x['Carrier']))
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])
        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column
        # Turn category fields into categorical feature vectors, then drop the intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns: DepDelay, Distance and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = [
            "Carrier_index", "Origin_index", "Dest_index", "Route_index"
        ]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to MongoDB
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response"
            )

    # Do the classification and store to MongoDB
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()
def main(base_path): APP_NAME = "make_predictions_streaming.py" # Process data every 10 seconds PERIOD = 10 BROKERS = 'localhost:9092' PREDICTION_TOPIC = 'flight_delay_classification_request' try: sc and ssc except NameError as e: import findspark # Add the streaming package and initialize findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"]) findspark.init() import pyspark import pyspark.sql import pyspark.streaming conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) ssc = StreamingContext(sc, PERIOD) spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column ) string_indexer_model = StringIndexerModel.load(string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path ) rfc = RandomForestClassificationModel.load( random_forest_model_path ) # # Process Prediction Requests in Streaming # stream = KafkaUtils.createDirectStream( ssc, [PREDICTION_TOPIC], { "metadata.broker.list": BROKERS, "group.id": "0", } ) object_stream = stream.map(lambda x: json.loads(x[1])) object_stream.pprint() row_stream = object_stream.map( lambda x: Row( FlightDate=iso8601.parse_date(x['FlightDate']), Origin=x['Origin'], Distance=x['Distance'], DayOfMonth=x['DayOfMonth'], DayOfYear=x['DayOfYear'], UUID=x['UUID'], DepDelay=x['DepDelay'], DayOfWeek=x['DayOfWeek'], FlightNum=x['FlightNum'], Dest=x['Dest'], Timestamp=iso8601.parse_date(x['Timestamp']), Carrier=x['Carrier'] ) ) row_stream.pprint() # # Create a dataframe from the RDD-based object stream # def classify_prediction_requests(rdd): from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField prediction_request_schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), StructField("UUID", StringType(), True), ]) prediction_requests_df = spark.createDataFrame(rdd, schema=prediction_request_schema) prediction_requests_df.show() # # Add a Route variable to replace 
FlightNum # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', concat( prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest ) ) prediction_requests_with_route.show(6) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform(prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform(prediction_requests_with_route) # Inspect the vectors final_vectorized_features.show() # Drop the individual index columns index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop("rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Store to Mongo if final_predictions.count() > 0: final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB( "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response" ) # Do the classification and store to Mongo row_stream.foreachRDD(classify_prediction_requests) ssc.start() ssc.awaitTermination()
def main(base_path): spark = SparkSession.builder.config("spark.default.parallelism", 1).appName(APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Messages look like: # # { # "Carrier": "DL", # "DayOfMonth": 25, # "DayOfWeek": 4, # "DayOfYear": 359, # "DepDelay": 10.0, # "Dest": "LAX", # "Distance": 2475.0, # "FlightDate": "2015-12-25", # "FlightNum": null, # "Origin": "JFK", # "Timestamp": "2019-10-31T00:19:47.633280", # "UUID": "af74b096-ecc7-4493-a79a-ebcdff699385" # } # # Process Prediction Requests from Kafka # message_df = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", BROKERS) \ .option("subscribe", PREDICTION_TOPIC) \ .load() # Create a DataFrame out of the one-hot encoded RDD schema = T.StructType([ T.StructField("Carrier", T.StringType()), T.StructField("DayOfMonth", T.IntegerType()), T.StructField("DayOfWeek", T.IntegerType()), T.StructField("DayOfYear", T.IntegerType()), T.StructField("DepDelay", T.FloatType()), T.StructField("Dest", T.StringType()), T.StructField("Distance", T.FloatType()), T.StructField("FlightDate", T.StringType()), T.StructField("FlightNum", T.StringType()), T.StructField("Origin", T.StringType()), T.StructField("Timestamp", T.TimestampType()), T.StructField("UUID", T.StringType()), ]) prediction_requests_df = message_df.select( F.from_json(F.col("value").cast("string"), schema).alias("data")).select("data.*") # # Add a Route variable to replace FlightNum # prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', F.concat(prediction_requests_df.Origin, F.lit('-'), prediction_requests_df.Dest)) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in ["Carrier", "Origin", "Dest", "Route"]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # Drop the individual index columns index_columns = [ "Carrier_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Make the 
prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Store the results to MongoDB class MongoWriter: def open(self, partition_id, epoch_id): print(f"Opened partition id: {partition_id}, epoch: {epoch_id}") self.mongo_client = pymongo.MongoClient() print(f"Opened MongoClient: {self.mongo_client}") return True def process(self, row): print(f"Processing row: {row}") as_dict = row.asDict() print(f"Inserting row.asDict(): {as_dict}") id = self.mongo_client.agile_data_science.flight_delay_classification_response.insert_one( as_dict) print(f"Inserted row, got ID: {id.inserted_id}") self.mongo_client.close() return True def close(self, error): print("Closed with error: %s" % str(error)) return True query = final_predictions.writeStream.foreach(MongoWriter()).start() query.awaitTermination()
# 67673, 67688, 67689, 67690, 67691, 67701, 67708, 67709, 67718, 67719, 67728, 67961, 67962, 67977, 67978, # 67980, 67981, 67984, 67989, 67990, 67995, 67996, 67997, 67999, 68007, 68009, 68010, 68011, 68017, 68018, # 68019, 68251, 68267, 68268, 68269, 68270, 68274, 68279, 68284, 68285, 68286, 68287, 68288, 68301, 68307, # 68308, 68556, 68557, 68558, 68559, 68560, 68562, 68563, 68569, 68573, 68574, 68575, 68576, 68591, 68597, # 68598, 68846, 68847, 68848, 68849, 68850, 68853, 68858, 68859, 68861, 68866, 68870, 68881, 68885, 68887, # 68888, 68889, 69136, 69137, 69138, 69139, 69140, 69141, 69142, 69144, 69148, 69150, 69151, 69154, 69156, # 69169, 69170, 69171, 69172, 69177, 69425, 69426, 69427, 69429, 69432, 69433, 69434, 69438, 69442, 69443, # 69446, 69455, 69462, 69467, 69715, 69716, 69718, 69722, 69724, 69728, 69731, 69732, 69736, 69742, 69743, # 69744, 69745, 69751, 69752, 69753, 69755, 69757, 70005, 70006, 70008, 70012, 70018, 70030, 70031, 70032, # 70033, 70034, 70042, 70043, 70044, 70045, 70046, 70048, 70049, 70295, 70296, 70297, 70298, 70299, 70300, # 70302, 70303, 70309, 70317, 70318, 70319, 70320, 70324, 70333, 70334, 70336, 70566, 70585, 70586, 70587, # 70588, 70590, 70591, 70592, 70593, 70606, 70607, 70610, 70623, 70624, 70625, 70626, 70855, 70856, 70875, # 70877, 70878, 70879, 70880, 70881, 70882, 70883, 70887, 70901, 70910, 70915, 70916, 71144, 71145, 71146, # 71165, 71166, 71168, 71169, 71170, 71171, 71172, 71202, 71206, 71435, 71436, 71455, 71467, 71485, 71724, # 71744, 71745, 71746, 71747, 71757, 71758, 71771, 71772, 71775, 72012, 72013, 72014, 72015, 72036, 72037, # 72038, 72039, 72060, 72061, 72062, 72325, 72326, 72327, 72328, 72329, 72334, 72348, 72349, 72350, 72591, # 72616, 72617, 72618, 72624, 72625, 72884, 72907, 72908, 72909, 72913, 72916, 72917, 73182, 73194, 73195, # 73197, 73203, 73205, 73472, 73485, 73486, 73487, 73489, 73491, 73492, 73494, 73775, 73776, 73781, 73782, # 73783, 73784, 74061, 74062, 74065, 74066, 74070, 74071, 74072, 74073, 74091, 74351, 74352, 74353, 74354, # 74355, 74356, 74359, 74361, 74362, 74381, 74641, 74642, 74643, 74644, 74645, 74646, 74649, 74650, 74651, # 74652, 74922, 74936, 74940, 75226, 75229, 75230, 75520, 75817, 76384, 76385, 76391, 76397, 76402, 76687, # 76691, 76692, 76962, 77251, 77252, 77255, 77256, 77540, 77541, 77542, 77828, 77829, 77830, 77831, 77832, # 78118, 78119, 78122, 78409, 78410, 78411, 78412, 79862, 80152] mlist = [] # ilist = random.sample(ilist, 10) index = 40664 model = RandomForestClassificationModel.load( 'hdfs://master:9000//fcd/split/serialModel/model_{}'.format(index)) mlist.append(model) sc.stop()
maxDepth=10)
model = classifier.fit(train_data)

# Transform the test data using the model to get predictions
predicted_test_data = model.transform(test_data)

# Evaluate the model performance
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='gender', predictionCol='prediction', metricName='f1')
print("F1 score: {}".format(evaluator_f1.evaluate(predicted_test_data)))

evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol='gender', predictionCol='prediction', metricName='accuracy')
print("Accuracy: {}".format(evaluator_accuracy.evaluate(predicted_test_data)))

# Predict some new records
# In a real case, use VectorAssembler to transform the df to get a features column
data_to_predict = final_data.select("features").limit(10)
model.transform(data_to_predict).show()

# Save the model
model.save("hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

# Read the saved model
model_reloaded = RandomForestClassificationModel.load(
    "hdfs://devenv/user/spark/web_logs_analysis/gender_model/")

# Predict some new records with the reloaded model
data_to_predict = final_data.select("features").limit(10)
model_reloaded.transform(data_to_predict).show()
featuresCol='X') LR_model = LR.fit(X_train_large) LR_model.save(LR_model_path) # Random Forest RF = RandomForestClassifier(numTrees=100, maxDepth=15, labelCol="score", featuresCol="X") RF_model = RF.fit(X_train_large) RF_model.save(RF_model_path) # Loading all trained models NB_Model = NaiveBayesModel.load(NB_model_path) LR_Model = LogisticRegressionModel.load(LR_model_path) RF_Model = RandomForestClassificationModel.load(RF_model_path) voteClassifier = VoteClassifier(NB_Model, LR_Model, RF_Model) evaluate(voteClassifier.transform_vote(X_test_large), confusion=False, predictionCol='prediction_vote') evaluate(voteClassifier.transform_vote(X_test_imbd), confusion=False, predictionCol='prediction_vote') voteClassifier.transform_vote(X_test_imbd).show() # Accuracy: (TP+TN)/N # Positive Predicitve Value: TP/(TP+FP) # Negative Predicitve Value: TN/(TN+FN) import matplotlib.pyplot as plt
import sys #Create and connect to spark session, read data given in docker command spark = SparkSession.builder.master('local[*]').appName( 'Predict_model').getOrCreate() test_set = spark.read.csv(sys.argv[-1], header=True, inferSchema=True, sep=';') # Create feature vector assembler = VectorAssembler(inputCols=[ test_set.columns[0], test_set.columns[1], test_set.columns[2], test_set.columns[3], test_set.columns[4], test_set.columns[5], test_set.columns[6], test_set.columns[7], test_set.columns[8], test_set.columns[9], test_set.columns[10] ], outputCol='features') test_assembled = assembler.transform(test_set) test_assembled = test_assembled.select(test_assembled.columns[-1], test_assembled.columns[-2]) # Load trained classification model rfp = RandomForestClassificationModel.load('RF_model') #Predict classes of new data predictions = rfp.transform(test_assembled) #Evaluate model performance multi_evaluator = MulticlassClassificationEvaluator( labelCol=test_assembled.columns[-1], metricName='f1') print('F-1 Score of the classification model:', multi_evaluator.evaluate(predictions))
def load_model():
    rf = RandomForestClassificationModel.load("s3://wineapp-parth/rf_model.model/")
    return rf
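# Minimal usage sketch for load_model(); test_data and its "features" vector column are
# assumed to be prepared elsewhere (e.g. with a VectorAssembler) and are not part of the
# original snippet.
rf = load_model()
predictions = rf.transform(test_data)  # test_data must contain a "features" column
predictions.select("prediction").show(5)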
spark_session = SparkSession.builder.master("local").appName( "wineClasssification").getOrCreate() print("\nProgram has started : \n") ##-------------------------------------- code to read dataset ------------------------## testDataframe = spark_session.read.csv('TestDataset.csv', header='true', inferSchema='true', sep=';') feature = [c for c in testDataframe.columns if (c not in 'quality')] assembler_test = VectorAssembler(inputCols=feature, outputCol="features") test_trans = assembler_test.transform(testDataframe) ##-------------------------------------- code to load model ------------------------## model = RandomForestClassificationModel.load("model") ##-------------------------------------- code to predict ------------------------## predictions = model.transform(test_trans) ##-------------------------------------- code to print accuracy ------------------------## accuracy = MulticlassClassificationEvaluator( labelCol="quality", predictionCol="prediction", metricName="accuracy").evaluate(predictions) print("Testing- Accuracy Error = %g" % (1.0 - accuracy)) transformed_data = model.transform(test_trans) print( MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="accuracy").getMetricName(),
from pyspark.sql import SparkSession from pyspark.sql import functions as F from pyspark.sql.functions import col from pyspark.sql.functions import lit from pyspark.sql.functions import udf from pyspark.sql.types import * from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassificationModel from pyspark.ml import PipelineModel import time #Load in onehotencoder and rf model spark = SparkSession.builder.getOrCreate() print("In load_model:") loadtime = time.time() rf_pipeline = PipelineModel.load("/home/ubuntu/pipeline") rf_model = RandomForestClassificationModel.load("/home/ubuntu/model_rf") print("after load_model: %s seconds" % (time.time() - loadtime)) def change_type(df): df = df.withColumn('credit_score', col('credit_score').cast(IntegerType())) df = df.withColumn('original_dti', col('original_dti').cast(IntegerType())) df = df.withColumn('original_upb', col('original_upb').cast(IntegerType())) df = df.withColumn('original_ltv', col('original_ltv').cast(IntegerType())) df = df.withColumn('original_interest_rate', col('original_interest_rate').cast(DoubleType())) df = df.withColumn('number_of_units', col('number_of_units').cast(IntegerType())) df = df.withColumn('mip', col('mip').cast(IntegerType())) return df
sensorImportancesPD = pd.DataFrame.from_records(list(sensorImportances.items()), columns=['Sensor','Importance (%)'])\ .sort_values('Importance (%)') sb.set_color_codes("pastel") sb.barplot(x="Importance (%)", y="Sensor", data=sensorImportancesPD, label="Total", color="b") # #### Model Saving/Loading # We can save models and pipelines for re-use later model.bestModel.write().overwrite().save(path='rf_sensor_maintenance.mdl') !rm -rf rf_sensor_maintenance.mdl !hdfs dfs -get models/rf_sensor_maintenance.mdl newModel = RandomForestClassificationModel.load('rf_sensor_maintenance.mdl') predictions = newModel.transform(li.transform(va)) accuracy = evaluator.evaluate(predictions) print("Test Error = %g" % (1.0 - accuracy)) # Let's see how much maintenance we could have saved if we used this model def f(actual, predicted, cost): if actual==predicted: if actual=='Corrective': return 0 elif actual=='Preventive': return cost elif actual=='Healthy': return 30000 else: return cost
from pyspark.sql.functions import when from pyspark.ml.feature import VectorAssembler from pyspark.ml.feature import StandardScaler from pyspark.ml.classification import RandomForestClassificationModel spark = SparkSession.builder.master("local").appName( "wineClasssification").getOrCreate() testDf = spark.read.format('csv').options( header='true', inferSchema='true', delimiter=';').csv("s3://cs643/TrainingDataset.csv") feature = [c for c in testDf.columns if c != 'quality'] assembler_test = VectorAssembler(inputCols=feature, outputCol="features") test_trans = assembler_test.transform(testDf) model = RandomForestClassificationModel.load( "s3://cs643/wine_train_model.model") predictions = model.transform(test_trans) eval = MulticlassClassificationEvaluator(labelCol='""""quality"""""', predictionCol="prediction", metricName="accuracy") accuracy = eval.evaluate(predictions) print("accuracy test Error = %g" % (1.0 - accuracy)) from pyspark.mllib.evaluation import MulticlassMetrics transformed_data = model.transform(test_trans) print(eval.getMetricName(), 'accuracy:', eval.evaluate(transformed_data)) eval1 = MulticlassClassificationEvaluator(labelCol='""""quality"""""', predictionCol="prediction",
sc.setLogLevel("ERROR") app = Flask(__name__) schema = StructType([ StructField("sepal_length", FloatType()), StructField("sepal_width", FloatType()), StructField("petal_length", FloatType()), StructField("petal_width", FloatType()), StructField("class", StringType()) ]) predict_schema = StructType(schema.fields[:-1]) pipelineModel = PipelineModel.load("api/sparksaves/pipelineModel") rfModel = RandomForestClassificationModel.load("api/sparksaves/rfModel") spark = SparkSession.builder.getOrCreate() @app.route('/get_prediction', methods=['POST']) def calc_prob(): """Calculate probability for species.""" input_features = [[ float(request.json["sepal_length"]), float(request.json["sepal_width"]), float(request.json["petal_length"]), float(request.json["petal_width"]) ]] predict_df = spark.createDataFrame(data=input_features,
print(rf_accuracy) rf_precision=MulticlassClassificationEvaluator(labelCol='affairs',metricName='weightedPrecision').evaluate(rf_predictions) print('The precision rate on test data is {0:.0%}'.format(rf_precision)) rf_precision rf_auc=BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions) print(rf_auc) # Feature importance rf_classifier.featureImportances df.schema["features"].metadata["ml_attr"]["attrs"] # Save the model rf_classifier.save("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model") from pyspark.ml.classification import RandomForestClassificationModel rf=RandomForestClassificationModel.load("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model") test_df.show(5) model_preditions=rf.transform(test_df) model_preditions.show() single_df = spark.createDataFrame([[5.0,33.0,5.0,1.0,5.0,0.0]], ['rate_marriage', 'age', 'yrs_married', 'children', 'religious', 'affairs']) single_df = df_assembler.transform(single_df) single_df = single_df.select(['features','affairs']) model_predition=rf.transform(single_df) model_predition.show()
#remove punctuation pp_udf = udf(preprocess, ArrayType(StringType())) words = ads_free.withColumn('Words', pp_udf(ads_free.Text)) #remove stop words remover = StopWordsRemover(inputCol="Words", outputCol="filtered") removed = remover.transform(words) params_path = '../tmp/{}' #Load trained hashing frequency and transform hf_path = params_path.format('hf') hashingTF = HashingTF.load(hf_path) featureized = hashingTF.transform(removed) #Load trained hashing frequency and transform idf_path = params_path.format('idfmodel') idfmodel = IDFModel.load(idf_path) result = idfmodel.transform(featureized) #load rf model and predict rf_path = params_path.format('rf') rf = RandomForestClassificationModel.load(rf_path) prediction = rf.transform(result) path_to_save = '../tmp/twitterstream_test_prediction.json' prediction.write.json(path_to_save) #test whether json is written test = spark.read.json(path_to_save)
from pyspark.sql import SparkSession from pyspark.sql.functions import * from pyspark.ml.feature import VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer from pyspark.ml.classification import RandomForestClassificationModel if __name__ == "__main__": spark = SparkSession \ .builder \ .getOrCreate() # Load full data logs = spark.read.parquet("hdfs://devenv/user/spark/spark_mllib_101/ec_web_logs_analysis/data/") # Age group prediction # Load age group model age_group_model = RandomForestClassificationModel.load( "hdfs://devenv/user/spark/spark_mllib_101/ec_web_logs_analysis/model_age_group_prediction/") # +---------+-----------------+ # |age_group|age_group_indexed| # +---------+-----------------+ # | under 20| 2.0| # | over 50| 3.0| # | 21-35| 0.0| # | 36-50| 1.0| # +---------+-----------------+ # Prepare features and preprocessing data_prep = logs.select("device_id", "product_category_id", "device_type", "connect_type", "age_group") data_prep = VectorAssembler(inputCols=["product_category_id", "device_type", "connect_type"], outputCol="features").transform(data_prep)
#reading the saved countvector model cv = CountVectorizerModel.load(args.model_path + '/countvector_model') #transforming test data to count vector testing_data = cv.transform(testing_data) #saving the transformed data as parquet file testing_data.write.parquet(args.model_path + '/testingdata.parquet') print( '********************* after cv transformation *****************') print( '********************* after cv transformation *****************') print( '********************* after cv transformation *****************') #reading the saved random forest model rfModel = RandomForestClassificationModel.load(args.model_path + '/rfmodel') #getting the predictions predictions = predict(rfModel, testing_data) #saving the predictions as parquet file predictions.write.parquet(args.model_path + '/predictions.parquet') print('********************* after predicitons *****************') print('********************* after predicitons *****************') print('********************* Done *****************') else: print("Enter correct mode (train or test)")
def main(base_path): APP_NAME = "make_predictions_streaming.py" # Process data every 10 seconds PERIOD = 10 BROKERS = 'localhost:9092' PREDICTION_TOPIC = 'flight_delay_classification_request' try: sc and ssc except NameError as e: import findspark # Add the streaming package and initialize findspark.add_packages( ["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"]) findspark.init() import pyspark import pyspark.sql import pyspark.streaming conf = SparkConf().set("spark.default.parallelism", 1) sc = SparkContext( appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf) ssc = StreamingContext(sc, PERIOD) spark = pyspark.sql.SparkSession(sc).builder.appName( APP_NAME).getOrCreate() # # Load all models to be used in making predictions # # Load the arrival delay bucketizer from pyspark.ml.feature import Bucketizer arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format( base_path) arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path) # Load all the string field vectorizer pipelines into a dict from pyspark.ml.feature import StringIndexerModel string_indexer_models = {} for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format( base_path, column) string_indexer_model = StringIndexerModel.load( string_indexer_model_path) string_indexer_models[column] = string_indexer_model # Load the numeric vector assembler from pyspark.ml.feature import VectorAssembler vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format( base_path) vector_assembler = VectorAssembler.load(vector_assembler_path) # Load the classifier model from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format( base_path) rfc = RandomForestClassificationModel.load(random_forest_model_path) # # Process Prediction Requests in Streaming # stream = KafkaUtils.createDirectStream(ssc, [PREDICTION_TOPIC], { "metadata.broker.list": BROKERS, "group.id": "0", }) object_stream = stream.map(lambda x: json.loads(x[1])) object_stream.pprint() row_stream = object_stream.map( lambda x: Row(FlightDate=iso8601.parse_date(x['FlightDate']), Origin=x['Origin'], Distance=x['Distance'], DayOfMonth=x['DayOfMonth'], DayOfYear=x['DayOfYear'], UUID=x['UUID'], DepDelay=x['DepDelay'], DayOfWeek=x['DayOfWeek'], FlightNum=x['FlightNum'], Dest=x['Dest'], Timestamp=iso8601.parse_date(x['Timestamp']), Carrier=x['Carrier'])) row_stream.pprint() # # Create a dataframe from the RDD-based object stream # def classify_prediction_requests(rdd): from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField prediction_request_schema = StructType([ StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Timestamp", TimestampType(), True), StructField("UUID", StringType(), True), ]) prediction_requests_df = spark.createDataFrame( rdd, schema=prediction_request_schema) 
prediction_requests_df.show() # # Add a Route variable to replace FlightNum # from pyspark.sql.functions import lit, concat prediction_requests_with_route = prediction_requests_df.withColumn( 'Route', concat(prediction_requests_df.Origin, lit('-'), prediction_requests_df.Dest)) prediction_requests_with_route.show(6) # Vectorize string fields with the corresponding pipeline for that column # Turn category fields into categoric feature vectors, then drop intermediate fields for column in [ "Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear", "Origin", "Dest", "Route" ]: string_indexer_model = string_indexer_models[column] prediction_requests_with_route = string_indexer_model.transform( prediction_requests_with_route) # Vectorize numeric columns: DepDelay, Distance and index columns final_vectorized_features = vector_assembler.transform( prediction_requests_with_route) # Inspect the vectors final_vectorized_features.show() # Drop the individual index columns index_columns = [ "Carrier_index", "DayOfMonth_index", "DayOfWeek_index", "DayOfYear_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Store to Mongo if final_predictions.count() > 0: final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB( "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response" ) # Do the classification and store to Mongo row_stream.foreachRDD(classify_prediction_requests) ssc.start() ssc.awaitTermination()
#load model if algoName == "LogisticRegression": from pyspark.ml.classification import LogisticRegressionModel model = LogisticRegressionModel.load(modelPath) elif algoName == "LinearRegression": from pyspark.ml.regression import LinearRegressionModel model = LinearRegressionModel.load(modelPath) elif algoName == "DecisionTreeClassification": from pyspark.ml.classification import DecisionTreeClassificationModel model = DecisionTreeClassificationModel.load(modelPath) elif algoName == "DecisionTreeRegression": from pyspark.ml.regression import DecisionTreeRegressionModel model = DecisionTreeRegressionModel.load(modelPath) elif algoName == "RandomForestClassification": from pyspark.ml.classification import RandomForestClassificationModel model = RandomForestClassificationModel.load(modelPath) elif algoName == "RandomForestRegression": from pyspark.ml.regression import RandomForestRegressionModel model = RandomForestRegressionModel.load(modelPath) elif algoName == "GBTClassification": from pyspark.ml.classification import GBTClassificationModel model = GBTClassificationModel.load(modelPath) elif algoName == "GBTRegression": from pyspark.ml.regression import GBTRegressionModel model = GBTRegressionModel.load(modelPath) #predict prediction = model.transform(data).select("prediction") #save prediction.write.format("csv").save(outputPath)
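# A table-driven alternative to the if/elif chain above: map each algorithm name to its
# model class and dispatch through the dict. This is only a sketch of an equivalent
# refactor covering the same algorithm names as the original chain.
from pyspark.ml import classification as C, regression as R

MODEL_CLASSES = {
    "LogisticRegression": C.LogisticRegressionModel,
    "LinearRegression": R.LinearRegressionModel,
    "DecisionTreeClassification": C.DecisionTreeClassificationModel,
    "DecisionTreeRegression": R.DecisionTreeRegressionModel,
    "RandomForestClassification": C.RandomForestClassificationModel,
    "RandomForestRegression": R.RandomForestRegressionModel,
    "GBTClassification": C.GBTClassificationModel,
    "GBTRegression": R.GBTRegressionModel,
}

def load_any_model(algoName, modelPath):
    # Look up the model class for the requested algorithm and load it from disk
    try:
        return MODEL_CLASSES[algoName].load(modelPath)
    except KeyError:
        raise ValueError("Unknown algorithm: %s" % algoName)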
def get_model(model_version, spid, model_date):
    model_version_location = _get_model_version_folder(model_version)
    model_path = os.path.join(
        model_version_location,
        'ctr_model_spid%d_%s' % (spid, model_date)).replace('\\', '/')
    return RandomForestClassificationModel.load(model_path)
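# Minimal usage sketch for get_model(); the model_version, spid and model_date values and
# feature_df are illustrative placeholders, not taken from the original code.
model = get_model(model_version="v2", spid=42, model_date="20200101")
scored = model.transform(feature_df)  # feature_df must already contain the features column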