def load_Random_Model(dataset):
    print("Accuracy of best RFC Model with CrossValidation:")
    evaluator = BinaryClassificationEvaluator()
    best_RFModel = RandomForestClassificationModel.load("model/RFM1/")
    predictions = best_RFModel.transform(dataset)
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy = %g" % accuracy)

def RandomForest(data):
    path = 'modelo_RandomForest/modelRandomForest'
    randomModel = RandomForestClassificationModel.load(path)
    predictions = randomModel.transform(data)
    print("RANDOM FOREST")
    predictions.select('Email', 'Identificador', 'Burnout_Antes',
                       'prediction', 'probability').show(truncate=False)

def predict(test_path, model_name, output_path):
    if model_name is None:
        model_name = 'model'
    if output_path is None:
        output_path = os.path.join(dirname(os.getcwd()), 'predict.csv')
    model_path = os.path.join(dirname(os.getcwd()), 'models', model_name)
    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('Logistic App') \
        .getOrCreate()
    # todo: delete the next line
    spark.sparkContext.setLogLevel('OFF')
    model = RandomForestClassificationModel.load(path=model_path)
    raw_data = spark.read.csv(test_path, header=True)
    dataset = mature_data(raw_data)
    prediction_df = model.transform(dataset).select(
        col('id'), col('prediction').cast('int'))
    prediction_df = prediction_df.toPandas()
    prediction_df.to_csv(output_path, index=False)

def read_model(self):
    if "LogisticRegression" in self.best_model_path:
        classifier = LogisticRegressionModel.load(self.best_model_path)
    elif "DecisionTree" in self.best_model_path:
        classifier = DecisionTreeClassificationModel.load(self.best_model_path)
    elif "RandomForest" in self.best_model_path:
        classifier = RandomForestClassificationModel.load(self.best_model_path)
    elif "LinearSVC" in self.best_model_path:
        classifier = LinearSVCModel.load(self.best_model_path)

    if "VGG16" in self.best_model_path:
        featurizer_name = "VGG16"
    elif "VGG19" in self.best_model_path:
        featurizer_name = "VGG19"
    elif "InceptionV3" in self.best_model_path:
        featurizer_name = "InceptionV3"
    elif "Xception" in self.best_model_path:
        featurizer_name = "Xception"
    elif "ResNet50" in self.best_model_path:
        featurizer_name = "ResNet50"
    return featurizer_name, classifier

def _load_models(self):
    hf_path = self.params_path.format('hf')
    idf_path = self.params_path.format('idfmodel')
    rf_path = self.params_path.format('rf')
    self.hashingTF = HashingTF.load(hf_path)
    self.idfmodel = IDFModel.load(idf_path)
    self.rf = RandomForestClassificationModel.load(rf_path)

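# A possible companion method (a sketch, not part of the original class): assuming the
# transformers loaded in _load_models() are applied in the usual text-classification
# order (hashed term frequencies, then IDF weighting, then the random forest), scoring
# a tokenized DataFrame would look like this. The method name and the expected input
# columns are assumptions.
def _score(self, tokenized_df):
    featurized = self.hashingTF.transform(tokenized_df)
    rescaled = self.idfmodel.transform(featurized)
    return self.rf.transform(rescaled)
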
def scoring_post_model(pargs, params):
    """Score the input data using the saved model."""
    # Load parameters
    label_class_type = configs['binary_or_multiclass']
    saved_model_path = data_paths[configs['saved_model_path']].format(
        run_mode=run['run_mode'], run_id=run['run_id'])
    scoring_filter_column = configs['scoring_filter_column']
    scoring_filter_date = datetime.datetime.strptime(
        str(configs['scoring_filter_date']), '%Y%m%d')
    feature_list_path = data_paths[configs['feature_list_path']]
    output_scored_data = data_paths[configs['scored_data_path']]  # scored data output

    if run['use_sample']:
        abo_dna_data = sqlContext.read.parquet(
            data_paths['abo_dna_sample'].format(run_mode=run['run_mode'],
                                                run_id=run['run_id']))
    else:
        abo_dna_data = sqlContext.read.parquet(
            data_paths['abo_dna_full_file'].format(run_mode=run['run_mode'],
                                                   run_id=run['run_id']))

    trained_model = None
    if label_class_type == "binary":
        trained_model = GBTClassificationModel.load(saved_model_path)
    else:
        trained_model = RandomForestClassificationModel.load(saved_model_path)

    # Select which subset of abo dna data we want to use for scoring
    abo_dna_data_scoring = abo_dna_data.filter(
        F.col(scoring_filter_column) >= scoring_filter_date)
    onehot_pipeline = PipelineModel.load(
        data_paths['migration_onehot_model'].format(run_mode=run['run_mode'],
                                                    run_id=run['run_id']))
    scoring_data, onehot_pipeline, final_feature_list = preprocess_migration_model_data(
        abo_dna_data_scoring, False, onehot_pipeline, None, None)

    # Validate that the same input columns are present as in training
    # (except the label-based columns)
    if final_feature_list != list(
            sqlContext.read.parquet(
                feature_list_path.format(run_mode=run['run_mode'],
                                         run_id=run['run_id'])).columns):
        raise ValueError("Mismatch in training input and test input.")

    # Produce scoring
    scored_data = trained_model.transform(scoring_data)
    scored_data.write.parquet(output_scored_data.format(
        run_mode=run['run_mode'], run_id=run['run_id']), mode='overwrite')

def init():
    print("Begin function...", flush=True)
    global SPARK
    SPARK = SparkSession.builder.appName("DriftTest").getOrCreate()
    print("Spark variable:", SPARK, flush=True)
    global MODEL
    MODEL = RandomForestClassificationModel.load(
        "/hadoop/demo/titanic-spark/titanic")

def pred_rf_model_spark(cls, model_dir, feature_col, df_new):
    print('model loading start')
    # model = GBTClassificationModel.load(model_dir)
    model = RandomForestClassificationModel.load(str(model_dir))
    # model = LinearSVCModel.load(model_dir)
    assembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    # Set maxCategories so features with > 4 distinct values are treated as continuous.
    newData = assembler.transform(df_new)
    predictions = model.transform(newData)
    return predictions

def RandomForest(data):
    path = 'modelo_RandomForest/modelRandomForest'
    randomModel = RandomForestClassificationModel.load(path)
    predictions = randomModel.transform(data)
    prediccion = predictions.select(
        'prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    print(prediccion[0])
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'
    return prediccionLabel, prediccion[1][0] * 100

def main2(spark, output):
    # STEP 3: use the classifier to predict latent factor vectors and update the ALS model
    model = RandomForestClassificationModel.load(
        'hdfs:/user/yh2857/short_rf.model')
    lfsdf = spark.read.parquet(
        'hdfs:/user/yh2857/model_frac_1/rank10_reg0.1_alpha0.01/itemFactors')
    idx = lfsdf.rdd.map(lambda row: row[0])
    features = lfsdf.rdd.map(lambda row: row[1])
    lfsdf = idx.zip(features.map(lambda x: Vectors.dense(x))).toDF(
        schema=['id', 'features'])
    # print(lfsdf.count(), lfsdf.select('id').distinct().count(), lfsdf)

    with open('kmean_centers.txt', 'rb') as f:
        centers = pickle.load(f)
    new_centers = []
    for i, c in enumerate(centers):
        # Use each center's own vector (the original appended centers[0] every iteration)
        new_centers.append([i, Vectors.dense(c.tolist())])
    centerdf = spark.createDataFrame(
        pd.DataFrame(data=new_centers)).withColumnRenamed(
            '0', 'center_idx').withColumnRenamed('1', 'center_features')
    # print(centerdf.count(), centerdf)

    df_path = 'hdfs:/user/yh2857/coldstart_processed_short.parquet'
    new_df = spark.read.parquet(df_path).withColumnRenamed('prediction', 'label')
    train, test = new_df.randomSplit([0.8, 0.2], 24)
    print(test.count(), test.select('item_index').distinct().count(), test)

    predictions = model.transform(test)
    # predictions.select("prediction").distinct().show()
    predicted = predictions.join(centerdf,
                                 predictions.prediction == centerdf.center_idx,
                                 'left')
    # predicted.show()
    original_lfs = lfsdf.join(predicted, lfsdf.id == predicted.item_index,
                              "leftanti")
    predicted = predicted.select('item_index', 'center_features') \
                         .withColumnRenamed("center_features", 'features')
    print(original_lfs)
    print(predicted)
    updated_lfs = original_lfs.withColumnRenamed('id', 'item_index').union(predicted)
    # updated_lfs.show()

    output_file = 'hdfs:/user/yh2857/rank10_reg0.1_alpha0.1/itemFactors'
    updated_lfs.write.mode('overwrite').parquet(output_file)

def fit(self, train):
    from pyspark.ml.feature import MinMaxScaler as minmax
    cols = [x for x in train.columns if x not in ['datetime', 'label']]
    train = train.fillna(0)
    train = train.withColumn('label', when(rand() > 0.5, 1).otherwise(0))
    print(train.show(n=5))

    assembler = VectorAssembler().setInputCols(cols).setOutputCol("features")
    print('assembler')
    train = assembler.transform(train)
    train = train.fillna(0)
    train = train.drop(*cols)

    rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                                predictionCol='predictions', numTrees=10)
    print('assembler')
    # print(train.show(n=5))
    # train = assembler.transform(train)

    # Chain indexers and forest in a Pipeline
    train.show(n=5)
    # pipeline = Pipeline(stages=[rf])
    print('Train model. This also runs the indexers.')
    model = rf.fit(train)

    # Save and load model
    model.write().overwrite().save('myRandomForestClassificationModel')
    sameModel = RandomForestClassificationModel.load('myRandomForestClassificationModel')

    print("make predictions")
    # Make predictions.
    predictions = model.transform(train)
    # Select example rows to display.
    predictions.select("predictions", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="predictions", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

def main():
    spark = SparkSession \
        .builder \
        .appName("RandomForest") \
        .config("spark.executor.heartbeatInterval", "60s") \
        .getOrCreate()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)
    sc.setLogLevel("INFO")

    # Load the test data
    df_test = spark.read.parquet(sys.argv[1])
    df_test, df_discard = df_test.randomSplit([0.2, 0.8])

    # Load the model
    rf_model = RandomForestClassificationModel.load(sys.argv[2])

    # Make the predictions
    predictions = rf_model.transform(df_test)
    # predictionsRDD = predictions.rdd
    # predictionsRDD.saveAsTextFile(sys.argv[3] + "output.text")

    evaluator_acc = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol="label", metricName="accuracy")
    accuracy = evaluator_acc.evaluate(predictions)
    print("accuracy *******************")
    print(accuracy)

    evaluator_pre = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol="label", metricName="weightedPrecision")
    print("precision *******************")
    print(evaluator_pre.evaluate(predictions))

    print("recall **********************")
    print(MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol="label",
        metricName="weightedRecall").evaluate(predictions))

def check_input(data) -> int:
    spark = SparkSession.builder.appName('abc').enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    rdd = sc.parallelize([data])
    df = spark.read.json(rdd)
    df_assembler = VectorAssembler(
        inputCols=['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
        outputCol="features")
    df = df_assembler.transform(df)
    model_df = df.select('features')
    rf = RandomForestClassificationModel.load("/home/admin/Downloads/RF_model")
    model_predictions = rf.transform(model_df)
    model_predictions = model_predictions.toPandas()['prediction'].values.tolist()
    return model_predictions[0]

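# Hedged usage example for check_input (not from the original source): the payload is a
# single JSON record carrying the numeric columns B..I that the VectorAssembler expects;
# the field values below are made up for illustration.
if __name__ == "__main__":
    sample_record = '{"B": 1.0, "C": 0.0, "D": 2.5, "E": 3.1, "F": 0.7, "G": 1.2, "H": 0.0, "I": 4.4}'
    print(check_input(sample_record))
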
def random_forest_classifier():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)

    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
    model = rf.fit(td)
    # model.featureImportances
    # # SparseVector(1, {0: 1.0})
    # allclose(model.treeWeights, [1.0, 1.0, 1.0])
    # # True

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    result = model.transform(test0).head()
    # result.prediction
    # # 0.0
    # numpy.argmax(result.probability)
    # # 0
    # numpy.argmax(result.rawPrediction)
    # # 0
    # test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    # model.transform(test1).head().prediction
    # # 1.0
    # model.trees
    # # [DecisionTreeClassificationModel (uid=...) of depth..., DecisionTreeClassificationModel...]

    temp_path = "."
    rfc_path = temp_path + "/rfc"
    rf.write().overwrite().save(rfc_path)
    rf2 = RandomForestClassifier.load(rfc_path)
    # rf2.getNumTrees()
    # # 3
    model_path = temp_path + "/rfc_model"
    model.write().overwrite().save(model_path)
    model2 = RandomForestClassificationModel.load(model_path)

def init(path="./"):
    global indexModel, ohPipelineModel, scaler, mlModel, info, spark

    # start spark session
    spark = pyspark.sql.SparkSession.builder.appName('scoring').getOrCreate()

    # load the models
    stringIndexModelFile = path + 'stringIndexModel'
    oneHotEncoderModelFile = path + 'oneHotEncoderModel'
    featureScaleModelFile = path + 'featureScaleModel'
    scaler = StandardScalerModel.load(featureScaleModelFile)
    ohPipelineModel = PipelineModel.load(oneHotEncoderModelFile)
    indexModel = PipelineModel.load(stringIndexModelFile)

    mlModelFile = path + 'mlModel'
    mlModel = RandomForestClassificationModel.load(mlModelFile)

    infoFile = path + 'info'
    info = None
    # load info
    with open(infoFile, 'rb') as handle:
        info = pickle.load(handle)

def predict(self, test):
    cols = [x for x in test.columns if x not in ['datetime', 'label']]
    test = test.fillna(0)
    print(test.printSchema())
    print('Test Columns : ' + str(len(test.columns)))
    print('Test Rows : ' + str(test.count()))

    assembler = VectorAssembler().setInputCols(cols).setOutputCol("features")
    print('assembler')
    test = assembler.transform(test)
    test = test.fillna(0)
    test = test.drop(*cols)

    rf = RandomForestClassificationModel.load('myRandomForestClassificationModel')
    preds = rf.transform(test)
    print(preds.printSchema())
    return preds

def main():
    sc = SparkContext(appName="PysparkStreaming").getOrCreate()
    spark = SparkSession.builder.appName('google-play-store-streamer').getOrCreate()
    ssc = StreamingContext(sc, 3)

    # Load Model
    model = RandomForestClassificationModel.load(MODEL_PATH)

    def parseStream(rdd):
        if not rdd.isEmpty():
            # Read each micro-batch into a DataFrame (the original called sc.read.json,
            # but only a SparkSession exposes .read)
            df = spark.read.json(rdd)
            df.show()

            # Vectorize data
            feature_cols = df.columns
            feature_cols.remove('Installs indexed')
            assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol="features",
                                        handleInvalid="error")
            pipeline = Pipeline(stages=[assembler])
            outputModel = pipeline.fit(df)
            output = outputModel.transform(df)
            final_data = output.select("features", "Installs indexed")

            # Predict
            predictions = model.transform(final_data)
            evaluator = MulticlassClassificationEvaluator(
                labelCol="Installs indexed", predictionCol="prediction",
                metricName="accuracy")
            accuracy = evaluator.evaluate(predictions)
            print("Random forest test Error = %g" % (1.0 - accuracy))
            randomForestError = (1.0 - accuracy)
            print(randomForestError)

    stream_data = ssc.textFileStream('StreamData/')
    stream_data.foreachRDD(lambda rdd: parseStream(rdd))
    ssc.start()
    ssc.awaitTermination()

def sendRecord(df):
    from pyspark.ml.classification import RandomForestClassificationModel
    sameModel = RandomForestClassificationModel.load("randomForest.model")

    df = df.withColumn("amount", df["amount"].cast(FloatType()))
    df = df.withColumn("newbalanceDest", df["newbalanceDest"].cast(FloatType()))
    df = df.withColumn("newbalanceOrig", df["newbalanceOrig"].cast(FloatType()))
    df = df.withColumn("oldbalanceDest", df["oldbalanceDest"].cast(FloatType()))
    df = df.withColumn("oldbalanceOrg", df["oldbalanceOrg"].cast(FloatType()))
    df = df.withColumn("isFlaggedFraud", df["isFlaggedFraud"].cast(IntegerType()))
    df = df.withColumn("step", df["step"].cast(IntegerType()))
    df = df.withColumn("Type", df["Type"].cast(IntegerType()))

    assembler = VectorAssembler(inputCols=[
        "Type", "amount", "newbalanceDest", "newbalanceOrig",
        "oldbalanceDest", "oldbalanceOrg", "step"
    ], outputCol="features")
    output = assembler.transform(df).select("features")

    predictions = sameModel.transform(output)
    pr = predictions.select("prediction").rdd
    if pr.collect() == [Row(prediction=1.0)]:
        print("FRAUD!!!!")
    else:
        print("Not Fraud")

    # (fragment: this block runs in the test branch of a train/test mode switch)
    # reading the saved countvector model
    cv = CountVectorizerModel.load(args.model_path + '/countvector_model')
    # transforming test data to count vectors
    testing_data = cv.transform(testing_data)
    # saving the transformed data as a parquet file
    testing_data.write.parquet(args.model_path + '/testingdata.parquet')
    print('********************* after cv transformation *****************')
    print('********************* after cv transformation *****************')
    print('********************* after cv transformation *****************')

    # reading the saved random forest model
    rfModel = RandomForestClassificationModel.load(args.model_path + '/rfmodel')
    # getting the predictions
    predictions = predict(rfModel, testing_data)
    # saving the predictions as a parquet file
    predictions.write.parquet(args.model_path + '/predictions.parquet')
    print('********************* after predictions *****************')
    print('********************* after predictions *****************')
    print('********************* Done *****************')
else:
    print("Enter correct mode (train or test)")

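# The predict() helper called in the block above is not defined in this fragment. A minimal
# sketch of what it plausibly does (an assumption, not the original implementation): apply
# the loaded random forest to the count-vectorized test data.
def predict(model, data):
    return model.transform(data)
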
predictions = rForestModel.transform(pTestDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")
evaluator.evaluate(predictions)

# %%
lr = LogisticRegression(featuresCol='features', labelCol='class')
lrModel = lr.fit(pTrainDF)
predictionsLR = lrModel.transform(pTestDF)
evaluator.evaluate(predictionsLR)

# %%
naiveBayes = NaiveBayes(featuresCol='features', labelCol='class')
naiveModel = naiveBayes.fit(pTrainDF)
predictionsNaive = naiveModel.transform(pTestDF)
evaluator.evaluate(predictionsNaive)

# %%
pipelineModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V')
rForestModel.save('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest')

# %%
pipelineModel = PipelineModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/pipelineW2V')
rForestModel = RandomForestClassificationModel.load('D:/College_Stuff/3rd_Sem/CMPE256/Project/Models/rForest')

# %%

# remove punctuation
pp_udf = udf(preprocess, ArrayType(StringType()))
words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

# remove stop words
remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
removed = remover.transform(words)

params_path = '../tmp/{}'

# load trained hashing frequency model and transform
hf_path = params_path.format('hf')
hashingTF = HashingTF.load(hf_path)
featureized = hashingTF.transform(removed)

# load trained IDF model and transform
idf_path = params_path.format('idfmodel')
idfmodel = IDFModel.load(idf_path)
result = idfmodel.transform(featureized)

# load rf model and predict
rf_path = params_path.format('rf')
rf = RandomForestClassificationModel.load(rf_path)
prediction = rf.transform(result)

path_to_save = '../tmp/twitterstream_test_prediction.json'
prediction.write.json(path_to_save)

# test whether the json was written
test = spark.read.json(path_to_save)

def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load every model used by the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today's and tomorrow's dates as ISO strings to scope the query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date-based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])
    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the indexer corresponding to each column
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize the numeric columns: DepDelay, Distance
    final_vectorized_features = vector_assembler.transform(prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = ["Carrier_index", "DayOfMonth_index", "DayOfWeek_index",
                     "DayOfYear_index", "Origin_index", "Origin_index",
                     "Dest_index", "Route_index"]
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Make the prediction
    predictions = rfc.transform(final_vectorized_features)

    # Drop the features vector and prediction metadata to give the original fields
    predictions = predictions.drop("Features_vec")
    final_predictions = predictions.drop("indices").drop("values") \
                                   .drop("rawPrediction").drop("probability")

    # Inspect the results
    final_predictions.show()

    # Build the day's output path: a date-based primary key directory structure
    today_output_path = "{}/data/prediction_results_daily.json/{}".format(
        base_path, iso_today)

    # Save the results to the daily bucket
    final_predictions.repartition(1).write.mode("overwrite").json(today_output_path)

sc.setLogLevel("ERROR")

app = Flask(__name__)

schema = StructType([
    StructField("sepal_length", FloatType()),
    StructField("sepal_width", FloatType()),
    StructField("petal_length", FloatType()),
    StructField("petal_width", FloatType()),
    StructField("class", StringType())
])
predict_schema = StructType(schema.fields[:-1])

pipelineModel = PipelineModel.load("api/sparksaves/pipelineModel")
rfModel = RandomForestClassificationModel.load("api/sparksaves/rfModel")
spark = SparkSession.builder.getOrCreate()


@app.route('/get_prediction', methods=['POST'])
def calc_prob():
    """Calculate probability for species."""
    input_features = [[
        float(request.json["sepal_length"]),
        float(request.json["sepal_width"]),
        float(request.json["petal_length"]),
        float(request.json["petal_width"])
    ]]
    predict_df = spark.createDataFrame(data=input_features,
                                       schema=predict_schema)
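    # The rest of the route is cut off in the original. A hedged sketch of a typical
    # continuation (assumptions: pipelineModel assembles the feature vector, the Spark ML
    # default "probability" column holds the class probabilities, and Flask's dict-to-JSON
    # response handling is available):
    transformed_df = pipelineModel.transform(predict_df)
    result_row = rfModel.transform(transformed_df).collect()[0]
    return {"probability": result_row["probability"].toArray().tolist()}
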
def label_failure_modes(cls, site, did, rd_item, df, model_dir, sc):
    '''
    :param site: site, e.g. 'fab15', 'fab10'
    :param did: design id, e.g. 'Z32D'
    :param rd_item: rd bin in string format, e.g. 'rdC'
    '''
    start_time = time.time()
    # Convert to Pandas dataframe
    # df = df.toPandas()
    # if 'FBD_REGION' in df.columns:
    #     df['FBD_REGION'] = df['FBD_REGION'].apply(lambda x: cls.label_zone(x))
    labelled_failure_modes = []
    df = df.withColumn("row_id", F.monotonically_increasing_id())
    model_features_list, model_name_list, model_dir_list = cls.__read_model_name(
        site, did, rd_item, model_dir)
    print(model_dir_list)

    if len(model_name_list) > 0:
        for name, features, dirname in zip(model_name_list,
                                           model_features_list,
                                           model_dir_list):
            features_missing = [e for e in features if e not in df.columns]
            if len(features_missing) > 0:
                print('Features %s missing for model %s' %
                      (','.join(features_missing), name))
            else:
                print(dirname)
                print(features)
                try:
                    model = RandomForestClassificationModel.load(str(dirname))
                    # model = LinearSVCModel.load(model_dir)
                    assembler = VectorAssembler(inputCols=features,
                                                outputCol="features")
                    # Set maxCategories so features with > 4 distinct values are treated as continuous.
                    newData = assembler.transform(df)
                    df_i = model.transform(newData)
                    # df_i = cls.pred_rf_model_spark(dirname, feature, name, df)
                    df_i = df_i.withColumnRenamed("prediction", name)
                    df = df.join(df_i.select("row_id", name), ("row_id"))
                    labelled_failure_modes.append(name)
                    print('Labelling done for: ', name)
                except:
                    print('Labelling failed for: ', name)

        if len(labelled_failure_modes) > 0:
            df = df.withColumn('total',
                               sum(df[col] for col in labelled_failure_modes))
            df_labelled = df.filter(df.total > 0)
            df_unlabelled = df.filter(df.total == 0)
        else:
            df_labelled = []
            df_unlabelled = df
    else:
        df_labelled = []
        df_unlabelled = df
        print('No models found for: %s, %s, %s' % (site, did, rd_item))

    print('Labelling time = ', time.time() - start_time)
    start_time = time.time()
    if df_labelled != []:
        df_labelled = df_labelled.toPandas()
    else:
        df_labelled = pd.DataFrame()
    df_unlabelled = df_unlabelled.toPandas()
    print('Pandas df conversion time = ', time.time() - start_time)
    return df_labelled, df_unlabelled, labelled_failure_modes

sensorImportancesPD = pd.DataFrame.from_records(
    list(sensorImportances.items()),
    columns=['Sensor', 'Importance (%)']).sort_values('Importance (%)')

sb.set_color_codes("pastel")
sb.barplot(x="Importance (%)", y="Sensor", data=sensorImportancesPD,
           label="Total", color="b")

# #### Model Saving/Loading
# We can save models and pipelines for re-use later
model.bestModel.write().overwrite().save(path='rf_sensor_maintenance.mdl')

!rm -rf rf_sensor_maintenance.mdl
!hdfs dfs -get models/rf_sensor_maintenance.mdl

newModel = RandomForestClassificationModel.load('rf_sensor_maintenance.mdl')
predictions = newModel.transform(li.transform(va))
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Let's see how much maintenance we could have saved if we used this model
def f(actual, predicted, cost):
    if actual == predicted:
        if actual == 'Corrective':
            return 0
        elif actual == 'Preventive':
            return cost
        elif actual == 'Healthy':
            return 30000
    else:
        return cost

spark_session = SparkSession.builder.master("local").appName(
    "wineClasssification").getOrCreate()
print("\nProgram has started : \n")

## ------------------------------ code to read dataset ------------------------ ##
testDataframe = spark_session.read.csv('TestDataset.csv', header='true',
                                       inferSchema='true', sep=';')
feature = [c for c in testDataframe.columns if (c not in 'quality')]
assembler_test = VectorAssembler(inputCols=feature, outputCol="features")
test_trans = assembler_test.transform(testDataframe)

## ------------------------------ code to load model -------------------------- ##
model = RandomForestClassificationModel.load("model")

## ------------------------------ code to predict ----------------------------- ##
predictions = model.transform(test_trans)

## ------------------------------ code to print accuracy ---------------------- ##
accuracy = MulticlassClassificationEvaluator(
    labelCol="quality", predictionCol="prediction",
    metricName="accuracy").evaluate(predictions)
print("Testing- Accuracy Error = %g" % (1.0 - accuracy))

transformed_data = model.transform(test_trans)
print(
    MulticlassClassificationEvaluator(labelCol="quality",
                                      predictionCol="prediction",
                                      metricName="accuracy").getMetricName(),
    # the original fragment is truncated here; presumably the metric value follows
    accuracy)

# Calculate and print Recall score for the Decision Tree algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
dtcWeightedRecall = evaluator.evaluate(dtcPredictions)
print("Decision Tree weightedRecall = %g" % (dtcWeightedRecall))

# Train a RandomForest algorithm
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rfm = rf.fit(trainingData)

# Save the trained Random Forest model to an S3 bucket for future use
rfm.save('s3://expedia-hotel-recommendations-workflow/rfm_model')

# Load the pre-trained Random Forest model to illustrate how the model will be imported for future use
rfModel = RandomForestClassificationModel.load(
    "s3://expedia-hotel-recommendations-workflow/rfm_model")

# Make predictions with the Random Forest model
rfPredictions = rfModel.transform(testData)

# Calculate and print Accuracy score for the Random Forest algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
rfAccuracy = evaluator.evaluate(rfPredictions)
print("Random Forest accuracy = %g" % (rfAccuracy))

# Calculate and print F1 score for the Random Forest algorithm
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
rfF1 = evaluator.evaluate(rfPredictions)
print("Random Forest f1 = %g" % (rfF1))

def main(iso_date, base_path):
    APP_NAME = "make_predictions.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load each and every model in the pipeline
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string indexers into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Run the requests through the transformations from training
    #

    # Get today and tomorrow's dates as iso strings to scope query
    today_dt = iso8601.parse_date(iso_date)
    rounded_today = today_dt.date()
    iso_today = rounded_today.isoformat()

    # Build the day's input path: a date based primary key directory structure
    today_input_path = "{}/data/prediction_tasks_daily.json/{}".format(
        base_path, iso_today)

    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Timestamp", TimestampType(), True),
    ])
    prediction_requests = spark.read.json(today_input_path, schema=schema)
    prediction_requests.show()

    #
    # Add a Route variable to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    prediction_requests_with_route = prediction_requests.withColumn(
        'Route',
        concat(prediction_requests.Origin, lit('-'), prediction_requests.Dest))
    prediction_requests_with_route.show(6)

    # Index string fields with the corresponding indexer for that column
    for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
                   "Origin", "Dest", "Route"]:
        string_indexer_model = string_indexer_models[column]
        prediction_requests_with_route = string_indexer_model.transform(
            prediction_requests_with_route)

    # Vectorize numeric columns: DepDelay and Distance
    final_vectorized_features = vector_assembler.transform(
        prediction_requests_with_route)

    # Drop the indexes for the nominal fields
    index_columns = [
"Carrier_index", "DayOfMonth_index", "DayOfWeek_index", "DayOfYear_index", "Origin_index", "Origin_index", "Dest_index", "Route_index" ] for column in index_columns: final_vectorized_features = final_vectorized_features.drop(column) # Inspect the finalized features final_vectorized_features.show() # Make the prediction predictions = rfc.transform(final_vectorized_features) # Drop the features vector and prediction metadata to give the original fields predictions = predictions.drop("Features_vec") final_predictions = predictions.drop("indices").drop("values").drop( "rawPrediction").drop("probability") # Inspect the output final_predictions.show() # Build the day's output path: a date based primary key directory structure today_output_path = "{}/data/prediction_results_daily.json/{}".format( base_path, iso_today) # Save the output to its daily bucket final_predictions.repartition(1).write.mode("overwrite").json( today_output_path)
def main(base_path):
    APP_NAME = "make_predictions_streaming.py"

    # Process data every 10 seconds
    PERIOD = 10
    BROKERS = 'localhost:9092'
    PREDICTION_TOPIC = 'flight_delay_classification_request'

    try:
        sc and ssc
    except NameError as e:
        import findspark

        # Add the streaming package and initialize
        findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
        findspark.init()

        import pyspark
        import pyspark.sql
        import pyspark.streaming

        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(
            appName="Agile Data Science: PySpark Streaming 'Hello, World!'",
            conf=conf)
        ssc = StreamingContext(sc, PERIOD)
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Load all models to be used in making predictions
    #

    # Load the arrival delay bucketizer
    from pyspark.ml.feature import Bucketizer
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)

    # Load all the string field vectorizer pipelines into a dict
    from pyspark.ml.feature import StringIndexerModel
    string_indexer_models = {}
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
        string_indexer_models[column] = string_indexer_model

    # Load the numeric vector assembler
    from pyspark.ml.feature import VectorAssembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler = VectorAssembler.load(vector_assembler_path)

    # Load the classifier model
    from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
    random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    rfc = RandomForestClassificationModel.load(random_forest_model_path)

    #
    # Process prediction requests in streaming
    #
    stream = KafkaUtils.createDirectStream(
        ssc,
        [PREDICTION_TOPIC],
        {
            "metadata.broker.list": BROKERS,
            "group.id": "0",
        })

    object_stream = stream.map(lambda x: json.loads(x[1]))
    object_stream.pprint()

    row_stream = object_stream.map(
        lambda x: Row(
            FlightDate=iso8601.parse_date(x['FlightDate']),
            Origin=x['Origin'],
            Distance=x['Distance'],
            DayOfMonth=x['DayOfMonth'],
            DayOfYear=x['DayOfYear'],
            UUID=x['UUID'],
            DepDelay=x['DepDelay'],
            DayOfWeek=x['DayOfWeek'],
            FlightNum=x['FlightNum'],
            Dest=x['Dest'],
            Timestamp=iso8601.parse_date(x['Timestamp']),
            Carrier=x['Carrier']
        )
    )
    row_stream.pprint()

    #
    # Create a dataframe from the RDD-based object stream
    #
    def classify_prediction_requests(rdd):
        from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
        from pyspark.sql.types import StructType, StructField

        prediction_request_schema = StructType([
            StructField("Carrier", StringType(), True),
            StructField("DayOfMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DayOfYear", IntegerType(), True),
            StructField("DepDelay", DoubleType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", DoubleType(), True),
            StructField("FlightDate", DateType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("Origin", StringType(), True),
            StructField("Timestamp", TimestampType(), True),
            StructField("UUID", StringType(), True),
        ])
        prediction_requests_df = spark.createDataFrame(
            rdd, schema=prediction_request_schema)
        prediction_requests_df.show()

        #
        # Add a Route variable to replace FlightNum
        #
        from pyspark.sql.functions import lit, concat
        prediction_requests_with_route = prediction_requests_df.withColumn(
            'Route',
            concat(prediction_requests_df.Origin, lit('-'),
                   prediction_requests_df.Dest))
        prediction_requests_with_route.show(6)

        # Vectorize string fields with the corresponding pipeline for that column:
        # turn category fields into categoric feature vectors, then drop intermediate fields
        for column in ["Carrier", "Origin", "Dest", "Route"]:
            string_indexer_model = string_indexer_models[column]
            prediction_requests_with_route = string_indexer_model.transform(
                prediction_requests_with_route)

        # Vectorize numeric columns (DepDelay, Distance) and the index columns
        final_vectorized_features = vector_assembler.transform(
            prediction_requests_with_route)

        # Inspect the vectors
        final_vectorized_features.show()

        # Drop the individual index columns
        index_columns = ["Carrier_index", "Origin_index", "Dest_index", "Route_index"]
        for column in index_columns:
            final_vectorized_features = final_vectorized_features.drop(column)

        # Inspect the finalized features
        final_vectorized_features.show()

        # Make the prediction
        predictions = rfc.transform(final_vectorized_features)

        # Drop the features vector and prediction metadata to give the original fields
        predictions = predictions.drop("Features_vec")
        final_predictions = predictions.drop("indices").drop("values").drop(
            "rawPrediction").drop("probability")

        # Inspect the output
        final_predictions.show()

        # Store to Mongo
        if final_predictions.count() > 0:
            final_predictions.rdd.map(lambda x: x.asDict()).saveToMongoDB(
                "mongodb://localhost:27017/agile_data_science.flight_delay_classification_response")

    # Do the classification and store to Mongo
    row_stream.foreachRDD(classify_prediction_requests)

    ssc.start()
    ssc.awaitTermination()

# load model
if algoName == "LogisticRegression":
    from pyspark.ml.classification import LogisticRegressionModel
    model = LogisticRegressionModel.load(modelPath)
elif algoName == "LinearRegression":
    from pyspark.ml.regression import LinearRegressionModel
    model = LinearRegressionModel.load(modelPath)
elif algoName == "DecisionTreeClassification":
    from pyspark.ml.classification import DecisionTreeClassificationModel
    model = DecisionTreeClassificationModel.load(modelPath)
elif algoName == "DecisionTreeRegression":
    from pyspark.ml.regression import DecisionTreeRegressionModel
    model = DecisionTreeRegressionModel.load(modelPath)
elif algoName == "RandomForestClassification":
    from pyspark.ml.classification import RandomForestClassificationModel
    model = RandomForestClassificationModel.load(modelPath)
elif algoName == "RandomForestRegression":
    from pyspark.ml.regression import RandomForestRegressionModel
    model = RandomForestRegressionModel.load(modelPath)
elif algoName == "GBTClassification":
    from pyspark.ml.classification import GBTClassificationModel
    model = GBTClassificationModel.load(modelPath)
elif algoName == "GBTRegression":
    from pyspark.ml.regression import GBTRegressionModel
    model = GBTRegressionModel.load(modelPath)

# predict
prediction = model.transform(data).select("prediction")

# save
prediction.write.format("csv").save(outputPath)

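# An equivalent, more compact dispatch (a sketch, not from the original fragment): map each
# algorithm name to its model class and call .load() once. The class names mirror the
# branches above; algoName and modelPath are the same variables the fragment assumes.
from pyspark.ml.classification import (LogisticRegressionModel, DecisionTreeClassificationModel,
                                       RandomForestClassificationModel, GBTClassificationModel)
from pyspark.ml.regression import (LinearRegressionModel, DecisionTreeRegressionModel,
                                   RandomForestRegressionModel, GBTRegressionModel)

MODEL_CLASSES = {
    "LogisticRegression": LogisticRegressionModel,
    "LinearRegression": LinearRegressionModel,
    "DecisionTreeClassification": DecisionTreeClassificationModel,
    "DecisionTreeRegression": DecisionTreeRegressionModel,
    "RandomForestClassification": RandomForestClassificationModel,
    "RandomForestRegression": RandomForestRegressionModel,
    "GBTClassification": GBTClassificationModel,
    "GBTRegression": GBTRegressionModel,
}
# model = MODEL_CLASSES[algoName].load(modelPath)
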
import sys

# Create and connect to a Spark session, read the data passed in the docker command
spark = SparkSession.builder.master('local[*]').appName('Predict_model').getOrCreate()
test_set = spark.read.csv(sys.argv[-1], header=True, inferSchema=True, sep=';')

# Create the feature vector
assembler = VectorAssembler(inputCols=[
    test_set.columns[0], test_set.columns[1], test_set.columns[2],
    test_set.columns[3], test_set.columns[4], test_set.columns[5],
    test_set.columns[6], test_set.columns[7], test_set.columns[8],
    test_set.columns[9], test_set.columns[10]
], outputCol='features')
test_assembled = assembler.transform(test_set)
test_assembled = test_assembled.select(test_assembled.columns[-1],
                                       test_assembled.columns[-2])

# Load the trained classification model
rfp = RandomForestClassificationModel.load('RF_model')

# Predict classes of the new data
predictions = rfp.transform(test_assembled)

# Evaluate model performance
multi_evaluator = MulticlassClassificationEvaluator(
    labelCol=test_assembled.columns[-1], metricName='f1')
print('F-1 Score of the classification model:',
      multi_evaluator.evaluate(predictions))