import json
import logging
import pathlib

import pyspark.sql
from pyspark.ml import classification, evaluation, feature, pipeline


def main(path_data, path_parameters, dir_models):
    logger = logging.getLogger(__name__)
    spark = (
        pyspark.sql.SparkSession
        .builder
        .appName("Python Spark Random Forest model training")
        .enableHiveSupport()
        .getOrCreate()
    )

    logger.info("Reading parquet data and splitting into test and train datasets")
    data_df = spark.read.parquet(path_data)
    splits = data_df.randomSplit([0.7, 0.3])
    training_df = splits[0]
    validation_df = splits[1]

    logger.info("Constructing pipeline for prediction model")
    with open(path_parameters) as json_file:
        parameters = json.load(json_file)
    feature_columns = parameters['feature_columns']
    rf_params = parameters['rf_params']
    assembler = feature.VectorAssembler(
        inputCols=feature_columns, outputCol="features")
    rf = classification.RandomForestClassifier(labelCol="churn", **rf_params)
    rf_pipeline = pipeline.Pipeline(stages=[assembler, rf])

    logger.info("Training prediction model")
    pipeline_model = rf_pipeline.fit(training_df)

    logger.info("Calculating model metrics")
    train_predictions_df = pipeline_model.transform(training_df)
    validation_predictions_df = pipeline_model.transform(validation_df)
    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")
    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")
    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall", labelCol="churn", predictionCol="prediction")
    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")
    auroc_evaluator = evaluation.BinaryClassificationEvaluator(
        metricName="areaUnderROC", labelCol="churn")

    logger.info("Saving model and metrics data")
    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df),
    }
    validation_metrics = {
        "accuracy": accuracy_evaluator.evaluate(validation_predictions_df),
        "precision": precision_evaluator.evaluate(validation_predictions_df),
        "recall": recall_evaluator.evaluate(validation_predictions_df),
        "f1": f1_evaluator.evaluate(validation_predictions_df),
        "auroc": auroc_evaluator.evaluate(validation_predictions_df),
    }
    rf_model = pipeline_model.stages[-1]
    model_params = rf_model.extractParamMap()
    model_description = {
        "name": "Random Forest",
        "params": {param.name: value for param, value in model_params.items()},
    }

    dir_model = pathlib.Path(dir_models)
    dir_model.mkdir(parents=True, exist_ok=True)
    path_pipeline_model = dir_model.joinpath("pipeline_model")
    path_train_metrics = dir_model.joinpath("metrics_train.json")
    path_validation_metrics = dir_model.joinpath("metrics_validation.json")
    path_model_description = dir_model.joinpath("model_description.json")
    pipeline_model.save(str(path_pipeline_model))
    with open(path_train_metrics, "w") as f:
        json.dump(train_metrics, f)
    with open(path_validation_metrics, "w") as f:
        json.dump(validation_metrics, f)
    with open(path_model_description, "w") as f:
        json.dump(model_description, f)
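# A minimal sketch of a command-line wrapper for the main() entrypoint above.
# The positional arguments mirror main()'s parameters; the wrapper itself is
# an assumption for illustration, not part of the original module.
import argparse

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(
        description="Train the churn Random Forest pipeline")
    parser.add_argument("path_data", help="path to the input parquet dataset")
    parser.add_argument("path_parameters",
                        help="JSON file with feature_columns and rf_params")
    parser.add_argument("dir_models",
                        help="directory where model artifacts are written")
    args = parser.parse_args()
    main(args.path_data, args.path_parameters, args.dir_models)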
from pyspark.ml import evaluation
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Apply the scaler fitted on the training data to the test data
df_test = scaler.transform(df_test)
df_test = df_test.drop('features').withColumnRenamed('scaledFeatures', 'features')

# Train a decision tree with 10-fold cross validation
algo = DecisionTreeClassifier()
grid = ParamGridBuilder().build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=algo, estimatorParamMaps=grid,
                    numFolds=10, evaluator=evaluator)
cv_model = cv.fit(df_train)

pred = cv_model.transform(df_test)
n_total = pred.count()
n_died = pred.filter(pred.prediction == 0).count()
print("Out of {}, {} predicted died ({:.2%})".format(n_total, n_died, n_died / n_total))

# Export the predictions in Kaggle submission format
pred_csv = pred.toPandas()
pred_csv = pred_csv.filter(['PassengerId', 'prediction'])
pred_csv = pred_csv.rename(columns={'prediction': 'Survived'})
pred_csv.to_csv(r'lr.csv', index=False, header=True)

# Hold out 20% of the training data to estimate accuracy
data_train, data_test = df_train.randomSplit([0.8, 0.2])

# Train a plain decision tree on the held-out split
algo_t = DecisionTreeClassifier()
model_t = algo_t.fit(data_train)
pat = model_t.transform(data_test)
n_total = pat.count()
n_died = pat.filter(pat.prediction == 0).count()
print("Out of {}, {} predicted died ({:.2%})".format(n_total, n_died, n_died / n_total))

evaluator = evaluation.MulticlassClassificationEvaluator(metricName='accuracy')
print("Accuracy DecisionTreeClassifier: {}".format(evaluator.evaluate(pat)))
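# The block above assumes a `scaler` already fitted on the training data.
# A minimal sketch of that setup, assuming a StandardScaler and the column
# names used above (both the scaler choice and the names are assumptions):
from pyspark.ml.feature import StandardScaler

std_scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler = std_scaler.fit(df_train)

# The training data gets the same drop/rename treatment, so the classifier
# reads the scaled values from the 'features' column.
df_train = scaler.transform(df_train)
df_train = df_train.drop('features').withColumnRenamed('scaledFeatures', 'features')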
import pyspark.ml as sm
import pyspark.ml.classification as smc
import pyspark.ml.evaluation as sme
import pyspark.ml.tuning as smt

# Use a classifier to generate the final predictions
classifier = smc.GBTClassifier(labelCol='label',
                               featuresCol=reducer.getOutputCol(),
                               predictionCol='predictedLabel')

# Combine all steps in a pipeline
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])

# Create an evaluator which will quantify model performance.
# Note: the binary evaluator below is left unused; it would also need
# rawPredictionCol to point at the raw-score column, not 'predictedLabel'.
# evaluator = sme.BinaryClassificationEvaluator(
#     labelCol='label',
#     rawPredictionCol='rawPrediction',
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                predictionCol='predictedLabel',
                                                metricName='f1')

# Set up a parameter grid for cross validation
param_grid = (smt.ParamGridBuilder()
              .addGrid(reducer.k, [10, 20, 50, 75])
              .addGrid(classifier.maxDepth, [2, 5, 10])
              .addGrid(classifier.subsamplingRate, [0.1, 0.2, 0.3])
              .build())

# Bring everything together
validator = smt.CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=eval_f1,
                               numFolds=3)
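# A short sketch of running the cross validation above and inspecting the
# winning configuration; `df_train` is an assumed training DataFrame with a
# 'label' column and the scaler's input column.
cv_model = validator.fit(df_train)

best_pipeline = cv_model.bestModel
best_gbt = best_pipeline.stages[-1]
print("best maxDepth:", best_gbt.getMaxDepth())
print("best subsamplingRate:", best_gbt.getSubsamplingRate())

# Mean F1 across folds for every grid point
for params, f1 in zip(param_grid, cv_model.avgMetrics):
    print({p.name: v for p, v in params.items()}, f1)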
print(type(test_model))

# In[55]:

display(test_model.select('prediction').distinct())

# We observe that our model does not give any 2 star ratings. Let's check how accurate it is.

# ### 8: Performance Evaluation

# In[58]:

import pyspark.ml.evaluation as ev

# Create an Evaluator that scores the model on the prediction column
# against the label we need to predict
evaluator = ev.MulticlassClassificationEvaluator(predictionCol='prediction',
                                                 labelCol='review_stars')

# Extract the needed evaluation criteria from the object
print(evaluator.evaluate(test_model, {evaluator.metricName: 'accuracy'}))
# areaUnderPR is a binary metric, so it is not available on this evaluator:
# print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

# We observe an accuracy of 51 percent on this baseline model

# In[60]:

get_ipython().run_line_magic('fs', 'ls reviewAnalysisDf.tsv')
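# A hypothetical follow-up cell: a label-vs-prediction contingency table to
# see which star ratings the baseline model collapses together (column names
# follow the cells above).
(test_model
 .groupBy('review_stars')
 .pivot('prediction')
 .count()
 .orderBy('review_stars')
 .show())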
# `params` is assumed to be the parsed parameters JSON (cf. the training
# script above); make_param_sets is not shown here, but a plausible sketch
# of it follows below.
hiperparam_sets = make_param_sets(params["hiperparameter_grid"])
assembler = feature.VectorAssembler(inputCols=feature_columns, outputCol="features")

count = 0
for hiperparams in hiperparam_sets:
    count = count + 1
    rf_params = hiperparams
    classific = classification.RandomForestClassifier(labelCol="churn", **rf_params)
    dt_pipe_md = pipeline.Pipeline(stages=[assembler, classific])
    dt_pipe_md_model = dt_pipe_md.fit(training_df)

    train_predictions_df = dt_pipe_md_model.transform(training_df)
    validation_predictions_df = dt_pipe_md_model.transform(validation_df)
    test_prediction_df = dt_pipe_md_model.transform(test_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")
    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")
    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall", labelCol="churn", predictionCol="prediction")
    auroc_evaluator = evaluation.BinaryClassificationEvaluator(
        metricName="areaUnderROC", labelCol="churn")
    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")

    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df),
    }
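# The make_param_sets helper referenced above is not shown; a plausible
# sketch is the cartesian-product expansion below. This is an assumption
# about the helper, not the original implementation.
import itertools

def make_param_sets(grid):
    """Expand a {param_name: [values]} grid into one dict per combination."""
    names = list(grid.keys())
    return [dict(zip(names, values))
            for values in itertools.product(*(grid[name] for name in names))]

# Example: {"numTrees": [50, 100], "maxDepth": [5, 10]} yields four
# parameter dicts, one per combination.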
# ### Prediction Result

# In[70]:

# Run prediction on the whole test dataset
df_test_pred1 = forest_model.transform(df_test)
df_test_pred1.show()

# ### Confusion Matrix

# In[71]:

df_test_pred1.groupBy("Success_Failure").pivot("prediction").count().show()

# ### Evaluate

# In[72]:

# Evaluate overall accuracy
evaluator = evaluation.MulticlassClassificationEvaluator(labelCol="success_failure",
                                                         metricName="accuracy",
                                                         predictionCol="prediction")
evaluator.evaluate(df_test_pred1)
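# The same evaluator instance can report further metrics by overriding
# metricName per evaluate() call, so no new objects are needed; a short
# sketch using the prediction DataFrame from above:
for metric in ("weightedPrecision", "weightedRecall", "f1"):
    score = evaluator.evaluate(df_test_pred1, {evaluator.metricName: metric})
    print("{}: {:.4f}".format(metric, score))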