def train(data_path="/home/admin/Downloads/diabetes.csv",
          model_path="/home/admin/Downloads/RF_model",
          seed=None):
    """Train a random-forest classifier on the diabetes CSV and persist it.

    Reads the CSV with an explicit all-double schema, assembles the eight
    predictor columns into a single ``features`` vector, splits the rows
    75/25 into train and test sets, fits a 50-tree random forest on the
    ``Outcome`` label, prints the test-set accuracy and saves the fitted
    model.

    Args:
        data_path: Location of the CSV file (with a header row). Defaults to
            the path that was previously hard-coded.
        model_path: Destination for the fitted model. Defaults to the path
            that was previously hard-coded. Anything already stored there is
            overwritten.
        seed: Optional seed for the train/test split. ``None`` (the default)
            keeps the original unseeded, non-reproducible split.
    """
    feature_cols = [
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
    ]
    label_col = 'Outcome'

    # Every column, label included, is read as a double.
    schema = StructType(
        [StructField(name, DoubleType()) for name in feature_cols + [label_col]]
    )
    df = spark.read.schema(schema).csv(data_path, header=True)

    df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    df = df_assembler.transform(df)
    model_df = df.select(['features', label_col])

    train_df, test_df = model_df.randomSplit([0.75, 0.25], seed=seed)

    rf_classifier = RandomForestClassifier(
        labelCol=label_col, numTrees=50).fit(train_df)
    rf_predictions = rf_classifier.transform(test_df)

    rf_accuracy = MulticlassClassificationEvaluator(
        labelCol=label_col, metricName='accuracy').evaluate(rf_predictions)
    print(rf_accuracy)

    # Persist the model in Spark ML's own on-disk format (not a pickle file).
    # A plain save() raises if the path already exists, which broke every
    # re-run of train(); overwrite explicitly instead.
    rf_classifier.write().overwrite().save(model_path)
# Build the Random Forest model, train it and run the prediction.
# A timestamp is printed before and after fitting so the training duration
# can be read off the output.
now = datetime.datetime.now()
print(*now.timetuple()[:6])  # year, month, day, hour, minute, second

rf = RandomForestClassifier(
    labelCol='attack_cat_index',
    featuresCol='features',
    seed=1234,
    maxBins=136,
    maxDepth=25,
    featureSubsetStrategy='all',
)
rf = rf.fit(train)  # from here on `rf` names the fitted model

now = datetime.datetime.now()
print(*now.timetuple()[:6])

result = rf.transform(test)

# Evaluate the prediction: accuracy, weighted precision and F1, each with
# its own evaluator instance.
evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_cat_index", metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Accuracy = {}".format(accuracy))

evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_cat_index", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(result)
print("weightedPrecision = {}".format(weightedPrecision))

evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_cat_index", metricName="f1")
f1 = evaluator.evaluate(result)
# Report the Decision Tree metrics; `evaluator1`, `evaluator2`, `pred` and
# `metrics` come from the preceding cell.
print('AUC ROC of Decision Tree model is %f' % evaluator1.evaluate(pred))
print('F1 score of Decision Tree model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()


# <a id="context323"></a>
# #### 3.2.3. Random Forest

# In[15]:

from pyspark.ml.classification import RandomForestClassifier

# Fit on the training data; numTrees is the hyperparameter being set.
forest_estimator = RandomForestClassifier(numTrees=100)
rfModel = forest_estimator.fit(trainData)

# Predict on the test data and show label next to prediction.
pred = rfModel.transform(testData)
label_and_prediction = pred.select('label', 'prediction')
label_and_prediction.show()

evaluator1 = BinaryClassificationEvaluator(
    labelCol='label', metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(
    labelCol='label', metricName="f1")
# NOTE(review): the pairs are passed as (label, prediction); presumably the
# transpose below re-orients the matrix to compensate -- confirm against the
# MulticlassMetrics documentation.
metrics = MulticlassMetrics(label_and_prediction.rdd.map(tuple))

print('AUC ROC of Random Forest model is %f' % evaluator1.evaluate(pred))
print('F1 score of Random Forest model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()


# <a id="context4"></a>
# ## 4. Summary
# index y
# Split into training and test sets
#labelIndexer = StringIndexer(inputCol = "affairs", outputCol = "indexedLabel").fit(df)
#data = labelIndexer.transform(df)
Data = feature_model.transform(data)
print("所有的特征名称:{0}".format(Data.columns))

train_data, test_data = Data.randomSplit([0.7, 0.3], seed=1994)
n_train = train_data.count()
n_test = test_data.count()
print("训练样本数:%d\n测试样本数:%d" % (n_train, n_test))

# Random forest
forest_estimator = RandomForestClassifier(
    numTrees=100,
    featuresCol='features',
    labelCol="labels",
    seed=7,
)
rf = forest_estimator.fit(train_data)
Predictions = rf.transform(test_data)

#f1 = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='f1',metricLabel=1).evaluate(lrPredictions)
#accuracy = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='accuracy',metricLabel=1).evaluate(lrPredictions)
#weightedPrecision = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedPrecision',metricLabel=1).evaluate(lrPredictions)
#weightedRecall = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedRecall',metricLabel=1).evaluate(lrPredictions)

# Classification report, computed on a pandas copy of the predictions
report = Predictions.select(
    "prediction", "labels", "features", "probability").toPandas()
report_text = classification_report(
    y_true=report['labels'], y_pred=report['prediction'])
print(report_text)

# Evaluate model performance with a confusion matrix [[TP,FN],[TN,FP]]
# TP: rows where both the prediction and the label equal 1.
TP = Predictions.filter(
    (Predictions['prediction'] == 1) & (Predictions['labels'] == 1)
).count()
from pyspark.ml.classification import RandomForestClassifier

# Fit a 50-tree random forest on the `affairs` label and score the test set.
forest_estimator = RandomForestClassifier(labelCol='affairs', numTrees=50)
rf_classifier = forest_estimator.fit(train_df)
rf_predictions = rf_classifier.transform(test_df)

# Inspect the predictions: full rows, counts per predicted class, and the
# first ten probability/label/prediction triples without truncation.
rf_predictions.show()
rf_predictions.groupBy('prediction').count().show()
rf_predictions.select(['probability', 'affairs', 'prediction']).show(10, truncate=False)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='affairs', metricName='accuracy')
rf_accuracy = accuracy_evaluator.evaluate(rf_predictions)
print('The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy))
print(rf_accuracy)

# Weighted precision
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol='affairs', metricName='weightedPrecision')
rf_precision = precision_evaluator.evaluate(rf_predictions)
print('The precision rate on test data is {0:.0%}'.format(rf_precision))
rf_precision

# Binary evaluator with its default metric
rf_auc = BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
print(rf_auc)

# Feature importance, followed by the attribute metadata of the assembled
# `features` column so importances can be matched to input columns.
rf_classifier.featureImportances
df.schema["features"].metadata["ml_attr"]["attrs"]

# Save the model
rf_classifier.save("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")

from pyspark.ml.classification import RandomForestClassificationModel
def predict(model: RandomForestClassifier, testing_data: DataFrame) -> DataFrame:
    """Apply a pre-trained model to a testing dataset and return the result.

    Args:
        model: The pre-trained model; only its ``transform`` method is used.
            NOTE(review): the annotation names ``RandomForestClassifier``
            (the estimator class) although the argument is described as
            already trained -- confirm whether the fitted-model type was
            intended.
        testing_data: The dataset to run predictions on.

    Returns:
        Whatever ``model.transform(testing_data)`` produces.
    """
    return model.transform(testing_data)
print(kars_train.count(), kars_test.count())

# Create a Random Forest classifier and learn from the training data
#tree = DecisionTreeClassifier(labelCol="origin_idx")
#tree = tree.fit(kars_train)
forest = RandomForestClassifier(labelCol="origin_idx", numTrees=5).fit(kars_train)

# List the individual trees of the fitted forest.
print("\nforest.trees:")
for member_tree in forest.trees:
    print(" ", member_tree)
print()

# Make predictions on testing data
prediction = forest.transform(kars_test)
prediction.show(9, truncate=False)

print("\nforest.featureImportances:", forest.featureImportances, '\n')

# Confusion matrix: row counts per (actual label, predicted label) pair
confusion_matrix = prediction.groupBy("origin_idx", "prediction").count()
confusion_matrix.show()

# Accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="origin_idx",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

spark.stop()
# `pipeline` is a Pipeline object that combines all the transformations
# defined above. Fit it on the dataframe, transform that same dataframe and
# cache the result.
fitted_pipeline = pipeline.fit(mushrooms)
mushrooms_trans = fitted_pipeline.transform(mushrooms).cache()

# Train-test split
mushrooms_train, mushrooms_val = mushrooms_trans.randomSplit([0.7, 0.3], seed=2017)

forest_estimator = RandomForestClassifier(
    labelCol='poisonous',
    featuresCol='features',
    numTrees=200,
)
model = forest_estimator.fit(mushrooms_train)

pred = model.transform(mushrooms_val)

# Select the columns relevant for evaluation
results = pred.select(['probability', 'prediction', 'poisonous'])

# `results` looks like this:
# +--------------------+----------+---------+
# |         probability|prediction|poisonous|
# +--------------------+----------+---------+
# |[0.97024593961675...|       0.0|      0.0|
# |[0.96303265951929...|       0.0|      0.0|
# |[0.95909221894651...|       0.0|      0.0|
# |[0.95958294573868...|       0.0|      0.0|
# |[0.95580449199223...|       0.0|      0.0|
# +--------------------+----------+---------+

results_collect = results.collect()
#----------------- Decision and Random Forest -----------------

# Final assembly: the normalised numeric vector plus one encoded vector per
# categorical column go into a single `features` column.
inputCols = ['norm_cols'] + [cname + "classVec" for cname in categorical_cols]
final_assembler = VectorAssembler(inputCols=inputCols, outputCol='features')
stages += [final_assembler]

pipeline = Pipeline(stages=stages)

# Fit the preprocessing pipeline ONCE, on the training data only, and reuse
# the fitted model for both splits. Fitting a second pipeline on the test set
# (as was done before) lets the test features be indexed, encoded or scaled
# differently from the features the classifiers were trained on, and leaks
# test-set statistics into preprocessing.
# NOTE(review): if a fitted stage rejects values unseen during training, the
# test transform can now fail -- check the `handleInvalid` setting of the
# stages built earlier in the file.
pipeline_model = pipeline.fit(train)
train_final = pipeline_model.transform(train)
test_final = pipeline_model.transform(test)

# Decision tree
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label').fit(train_final)
res_dt = dt.transform(test_final)

# Random forest with 20 trees
rf = RandomForestClassifier(featuresCol='features', labelCol='label',
                            numTrees=20).fit(train_final)
res_rf = rf.transform(test_final)

# Write prediction/label pairs for each model under the output prefix given
# as the second command-line argument. `res_lr` is produced earlier in the
# file.
res_lr.select('prediction', 'label').write.csv(sys.argv[2] + "lr", header=True)
res_dt.select('prediction', 'label').write.csv(sys.argv[2] + "dt", header=True)
res_rf.select('prediction', 'label').write.csv(sys.argv[2] + "rf", header=True)

spark.stop()
# Apply the already-built `impage` transformer to the test data, then drop
# the raw `Age` column.
dftest = impage.transform(dftest).drop('Age')


# In[22]:

# Apply `sipclass` and then `ohe` to the test data, dropping each stage's
# source column (`Pclass`, then `idxPclass`) afterwards.
dftest = sipclass.transform(dftest).drop('Pclass')
dftest = ohe.transform(dftest).drop('idxPclass')


# In[23]:

# Apply `va`, then drop the listed individual columns.
dftest = va.transform(dftest).drop('SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass')
dftest.show()


# In[24]:

# predict using random forest classifier on test data
predictions = rfc.transform(dftest)
predictions.show()


# In[25]:

# evaluate prediction results
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="Survived",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)