def train(data_path="/home/admin/Downloads/diabetes.csv",
          model_path="/home/admin/Downloads/RF_model"):
    """Train and persist a random-forest classifier on the Pima diabetes CSV.

    Reads the dataset with an explicit all-double schema, assembles the eight
    predictor columns into a single ``features`` vector, fits a 50-tree random
    forest on a 75/25 random split, prints the test-set accuracy, and saves
    the fitted model in Spark's native format.

    Parameters (backward-compatible additions; defaults reproduce the original
    hard-coded behavior):
        data_path: path of the input CSV (header row expected).
        model_path: directory where the fitted model is written.

    Returns:
        The test-set accuracy as a float.

    NOTE(review): relies on a module-level ``spark`` session and on
    StructType/StructField/DoubleType/VectorAssembler/RandomForestClassifier/
    MulticlassClassificationEvaluator being imported elsewhere in this file.
    """
    feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    # Same field order as before: the eight predictors followed by the label.
    schema = StructType(
        [StructField(name, DoubleType()) for name in feature_cols + ['Outcome']]
    )
    df = spark.read.schema(schema).csv(data_path, header=True)

    # Collapse the predictor columns into the single vector column Spark ML expects.
    df_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    df = df_assembler.transform(df)
    model_df = df.select(['features', 'Outcome'])

    # Unseeded split, as in the original: results vary run to run.
    train_df, test_df = model_df.randomSplit([0.75, 0.25])
    rf_classifier = RandomForestClassifier(labelCol='Outcome', numTrees=50).fit(train_df)

    rf_predictions = rf_classifier.transform(test_df)
    rf_accuracy = MulticlassClassificationEvaluator(
        labelCol='Outcome', metricName='accuracy').evaluate(rf_predictions)
    print(rf_accuracy)

    # Persist the fitted model in Spark's native format (the original comment
    # claimed "pickle", which this is not).
    rf_classifier.save(model_path)
    return rf_accuracy
import pyspark.sql.functions as func
import pyspark

# Local 4-core Spark context + session for the wine-quality training job.
conf = SparkConf().setAppName("Wine Quality Prediction").setMaster("local[4]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Read data from csv (semicolon-delimited, header row, schema inferred).
data = spark.read.format('csv').options(
    header='true', inferSchema='true', delimiter=';'
).csv("s3://pa2smit/TrainingDataset.csv")
print("\nPrinting Training Schema\n")
data.printSchema()
data.count()

# Every column except the (oddly quoted) label column is a feature.
featureColumns = [col for col in data.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='values')
transformData = assembler.transform(data)

rf = RandomForestClassifier(featuresCol='values', labelCol='""""quality"""""',
                            numTrees=100, maxBins=484, maxDepth=25,
                            minInstancesPerNode=5, seed=34)
rfModel = rf.fit(transformData)

# f1 is measured on the training data itself (no held-out split here).
evaluator = MulticlassClassificationEvaluator(labelCol='""""quality"""""',
                                              predictionCol="prediction",
                                              metricName="f1")
rfTrainingPredictions = rfModel.transform(transformData)
print("\nModel Training Completed ...\n")
print("\nRandom Forest f1 of traning data = %g\n"
      % evaluator.evaluate(rfTrainingPredictions))

# BUG FIX: persist the *fitted* model, not the untrained estimator -- saving
# ``rf`` would write only hyper-parameters and lose the trained forest.
rfModel.save("s3://pa2smit/wine_model.model")
# Overall accuracy computed earlier in the script.
print(rf_accuracy)

# Weighted precision on the held-out test split.
rf_precision = MulticlassClassificationEvaluator(
    labelCol='affairs', metricName='weightedPrecision'
).evaluate(rf_predictions)
print('The precision rate on test data is {0:.0%}'.format(rf_precision))
rf_precision  # bare expression -- visible output only in a REPL/notebook

# Area under the ROC curve for the binary label.
rf_auc = BinaryClassificationEvaluator(labelCol='affairs').evaluate(rf_predictions)
print(rf_auc)

# Feature importance + assembled-column metadata (bare expressions; REPL-only output).
rf_classifier.featureImportances
df.schema["features"].metadata["ml_attr"]["attrs"]

# Persist the trained forest, then reload it to demonstrate round-tripping.
rf_classifier.save("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")
from pyspark.ml.classification import RandomForestClassificationModel
rf = RandomForestClassificationModel.load("C:\\Users\\Hernan\\Data Science\\SPARK\\machine-learning-with-pyspark\\chapter_6_Random_Forests\\RF_model")

# Score the full test set with the reloaded model.
test_df.show(5)
batch_predictions = rf.transform(test_df)
batch_predictions.show()

# Score one hand-built observation through the same assembler used for training.
single_row = spark.createDataFrame(
    [[5.0, 33.0, 5.0, 1.0, 5.0, 0.0]],
    ['rate_marriage', 'age', 'yrs_married', 'children', 'religious', 'affairs'],
)
single_row = df_assembler.transform(single_row)
single_row = single_row.select(['features', 'affairs'])
single_prediction = rf.transform(single_row)
single_prediction.show()
# NOTE(review): ``config`` is not defined in this chunk -- presumably a
# SparkConf built earlier in the file; confirm before running standalone.
sc = SparkContext(conf=config)
myspark = SparkSession.builder.getOrCreate()

# Read the training data and print its schema.
print("\nProgram Starting...\n")
defTrain = myspark.read.format('csv').options(
    header='true', inferSchema='true', delimiter=';'
).csv("s3://winedataset/TrainingDataset.csv")
print("\nTraining Schema\n")
defTrain.printSchema()
defTrain.count()

# All columns except the (oddly quoted) label are features.
featureData = [col for col in defTrain.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureData, outputCol='features')
dataDF = assembler.transform(defTrain)
print("\n\nPrinting Training Schema with Features Table\n\n")
dataDF.printSchema()

# Random Forest on the training dataset, wrapped in a pipeline with the assembler.
rf = RandomForestClassifier(featuresCol='features', labelCol='""""quality"""""',
                            numTrees=100, maxBins=484, maxDepth=25,
                            minInstancesPerNode=5, seed=34)
rfPipeline = Pipeline(stages=[assembler, rf])
# NOTE(review): ``trainingDF`` is not defined in this chunk either -- a sibling
# chunk produces it via defTrain.randomSplit; verify which split is intended.
rfPipelineModel = rfPipeline.fit(trainingDF)

# RMSE evaluator treats the integer quality score as a regression target.
evaluator = RegressionEvaluator(labelCol='""""quality"""""',
                                predictionCol="prediction",
                                metricName="rmse")
rfTrainingPredictions = rfPipelineModel.transform(defTrain)

# BUG FIX: persist the fitted pipeline model; the original saved the
# untrained ``rf`` estimator, which writes only hyper-parameters.
rfPipelineModel.save("s3://myprogrambucket/rfwine_model.model")
# Fit the IDF stage on the featurized (term-frequency) data and transform it.
model = idf.fit(featurized)
result = model.transform(featurized)

#save idf and idf model
idf_path = PROJECT_HOME + 'tmp/idf'
#idf.save(idf_path)
idfmodel_path = PROJECT_HOME + 'tmp/idfmodel'
#model.save(idfmodel_path)
#load via following
#loadedModel = IDFModel.load(idfmodel_path)

#fit single rf model
rf = RandomForestClassifier(numTrees=100, labelCol="label", seed=42)
rf_model = rf.fit(result)
# NOTE(review): ``rf.save`` persists only the estimator (hyper-parameters);
# the trained model itself is saved separately as ``rf_model`` below.
rf_path = PROJECT_HOME + 'tmp/rf'
rf.save(rf_path)
rfmodel_path = PROJECT_HOME + 'tmp/rfmodel'
rf_model.save(rfmodel_path)

# Everything below is disabled via a triple-quoted string whose closing quotes
# lie beyond this chunk: an 80/20 split plus a CrossValidator grid search over
# the same RandomForestClassifier (the CrossValidator call is truncated here).
"""
#Prepare Train Test Split
train, test = result.randomSplit([0.8, 0.2], seed=42)
# Configure an ML pipeline, which consists of tree stages: hashingTF, idf and RandomForestClassifier.
rf = RandomForestClassifier(labelCol="label", seed=42)
pipeline = Pipeline(stages=[rf])
#grid search
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [100]).addGrid(rf.maxDepth, [5]).build()
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
# Random split of the data into training/test partitions (70/30).
splitValue = 0.7
trainingDF, testDF = defTrain.randomSplit([splitValue, 1 - splitValue])
print("\nSplitted Data into Training and Testing Dataset\n")

# Random Forest on the training dataset (assembler + classifier pipeline).
rf = RandomForestClassifier(featuresCol='features', labelCol='""""quality"""""',
                            numTrees=100, maxBins=484, maxDepth=25,
                            minInstancesPerNode=5, seed=34)
rfPipeline = Pipeline(stages=[assembler, rf])
rfPipelineModel = rfPipeline.fit(trainingDF)

# RMSE evaluator treats the integer quality score as a regression target.
evaluator = RegressionEvaluator(labelCol='""""quality"""""',
                                predictionCol="prediction",
                                metricName="rmse")
rfTrainingPredictions = rfPipelineModel.transform(defTrain)
rfTestPredictions = rfPipelineModel.transform(testDF)
print(
    "\nCompleted Model Training...\n\nRandom Forest RMSE on traning data = %g\n"
    % evaluator.evaluate(rfTrainingPredictions))
print("\nRandom Forest RMSE on test data = %g\n"
      % evaluator.evaluate(rfTestPredictions))

# BUG FIX: the original called ``rf.save("rfPipelineModel")``, persisting the
# untrained estimator (hyper-parameters only). Save the fitted pipeline model.
rfPipelineModel.save("rfPipelineModel")
from pyspark.ml.classification import RandomForestClassifier
# Binary evaluator expects raw-prediction and label columns; the multiclass
# evaluator expects prediction and label columns.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fit a 50-tree forest on the training split ('故障' = fault label).
# Other tunables include maxDepth; the fitted object is a
# RandomForestClassificationModel.
rf_classifier = RandomForestClassifier(labelCol='故障', numTrees=50).fit(
    train_df
)
rf_predictions = rf_classifier.transform(test_df)

# featureImportances: relative contribution of each input feature.
print('{}{}'.format(
    '评估每个属性的重要性:', rf_classifier.featureImportances))
rf_predictions.select(['probability', '故障', 'prediction']).show(10, False)
print("------查阅pyspark api,没有发现有训练准确率的字段,所以还需要计算预测的准确率------")

# The pyspark API exposes no training-accuracy field, so compute test
# accuracy explicitly with the multiclass evaluator.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol='故障', metricName='accuracy')
rf_accuracy = acc_evaluator.evaluate(rf_predictions)
print('MulticlassClassificationEvaluator 随机深林测试的准确性: {0:.0%}'.format(
    rf_accuracy))

# Area under the ROC curve from the binary evaluator.
auc_evaluator = BinaryClassificationEvaluator(labelCol='故障')
rf_auc = auc_evaluator.evaluate(rf_predictions)
print('BinaryClassificationEvaluator 随机深林测试的准确性: {0:.0%}'.format(rf_auc))

print('-----------保持模型,用于下次使用----------------')
# Persist the fitted model (Spark native format) for later reuse.
rf_classifier.save("RF_model")