def main(path_data, path_parameters, dir_models):
    logger = logging.getLogger(__name__)
    spark = (
        pyspark.sql.SparkSession
            .builder
            .appName("Python Spark Random Forest model training")
            .enableHiveSupport()
            .getOrCreate()
    )

    logger.info("Reading parquet data and splitting into test and train datasets")
    data_df = spark.read.parquet(path_data)
    splits = data_df.randomSplit([0.7, 0.3])
    training_df = splits[0]
    validation_df = splits[1]

    logger.info("Constructing pipeline for prediction model")
    with open(path_parameters) as json_file:
        parameters = json.load(json_file)
    feature_columns = parameters['feature_columns']
    rf_params = parameters['rf_params']
    assembler = feature.VectorAssembler(
        inputCols=feature_columns,
        outputCol="features")

    rf = classification.RandomForestClassifier(
        labelCol="churn", **rf_params)

    rf_pipeline = pipeline.Pipeline(stages=[assembler, rf])
    logger.info("Training prediction model")
    pipeline_model = rf_pipeline.fit(training_df)

    logger.info("Calculating model metrics")
    train_predictions_df = pipeline_model.transform(training_df)
    validation_predictions_df = pipeline_model.transform(validation_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")

    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")

    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall", labelCol="churn", predictionCol="prediction")

    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")

    auroc_evaluator = evaluation.BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol="churn")

    logger.info("Saving model and metrics data")
    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df)
    }
    validation_metrics = {
        "accuracy": accuracy_evaluator.evaluate(validation_predictions_df),
        "precision": precision_evaluator.evaluate(validation_predictions_df),
        "recall": recall_evaluator.evaluate(validation_predictions_df),
        "f1": f1_evaluator.evaluate(validation_predictions_df),
        "auroc": auroc_evaluator.evaluate(validation_predictions_df)
    }

    rf_model = pipeline_model.stages[-1]
    model_params = rf_model.extractParamMap()
    model_description = {
        "name": "Random Forest",
        "params": {param.name: value for param, value in model_params.items()},
    }

    dir_model = pathlib.Path(dir_models)
    dir_model.mkdir(parents=True, exist_ok=True)

    path_pipeline_model = dir_model.joinpath("pipeline_model")
    path_train_metrics = dir_model.joinpath("metrics_train.json")
    path_validation_metrics = dir_model.joinpath("metrics_validation.json")
    path_model_description = dir_model.joinpath("model_description.json")

    pipeline_model.save(str(path_pipeline_model))
    with open(path_train_metrics, "w") as f:
        json.dump(train_metrics, f)
    with open(path_validation_metrics, "w") as f:
        json.dump(validation_metrics, f)
    with open(path_model_description, "w") as f:
        json.dump(model_description, f)
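
# A minimal sketch of a command-line entry point for main() above. The argument names and the
# use of argparse are assumptions, not part of the original snippet.
if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Train the random forest churn model")
    parser.add_argument("path_data", help="path to the input parquet dataset")
    parser.add_argument("path_parameters", help="path to the JSON file with feature_columns and rf_params")
    parser.add_argument("dir_models", help="directory where the pipeline model and metrics are written")
    args = parser.parse_args()

    main(args.path_data, args.path_parameters, args.dir_models)
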
Example #2
df_test = scaler.transform(df_test)
df_test = df_test.drop('features').withColumnRenamed('scaledFeatures', 'features')


# train data
algo = DecisionTreeClassifier()
grid = ParamGridBuilder().build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=algo, estimatorParamMaps=grid, numFolds=10, evaluator=evaluator)
cv_model = cv.fit(df_train)
pred = cv_model.transform(df_test)
print("from {}, {} died. {}".format(pred.count(), pred.filter(pred.prediction == 0).count(),pred.filter(pred.prediction == 0).count()/pred.count()))

pred_csv = pred.toPandas()
pred_csv = pred_csv.filter(['PassengerId', 'prediction'])
pred_csv = pred_csv.rename(columns={'prediction':'Survived'})
pred_csv.to_csv(r'lr.csv', index=False, header=True)
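
# A short follow-up sketch (nothing below is in the original snippet): CrossValidator keeps the
# average metric for every grid point and refits the best parameter map on the full training data.
print("Mean cross-validated areaUnderROC per grid point:", cv_model.avgMetrics)
best_tree = cv_model.bestModel
print("Best tree depth: {}, nodes: {}".format(best_tree.depth, best_tree.numNodes))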


data_train, data_test = df_train.randomSplit([0.8,0.2])
# train a decision tree on a train/validation split of df_train
algo_t = DecisionTreeClassifier()
model_t = algo_t.fit(data_train)
pat = model_t.transform(data_test)
print("from {}, {} died. {}".format(pat.count(), pat.filter(pat.prediction == 0).count(),pat.filter(pat.prediction == 0).count()/pat.count()))

evaluator_acc = evaluation.MulticlassClassificationEvaluator(metricName='accuracy')
print("Accuracy DecisionTreeClassifier: {}".format(evaluator_acc.evaluate(pat)))


Example #3
# Use a classifier to generate the final predictions
classifier = smc.GBTClassifier(labelCol='label',
                               featuresCol=reducer.getOutputCol(),
                               predictionCol='predictedLabel')

# Combine all steps in a pipeline
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])

# Create an evaluator which will quantify model performance
# evaluator = sme.BinaryClassificationEvaluator(
#     labelCol='label',
#     rawPredictionCol='rawPrediction',  # the classifier's raw scores, not its predictionCol
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                predictionCol='predictedLabel',
                                                metricName='f1')

# Set up a parameter grid for cross validation
param_grid = (smt.ParamGridBuilder()
              .addGrid(reducer.k, [10, 20, 50, 75])
              .addGrid(classifier.maxDepth, [2, 5, 10])
              .addGrid(classifier.subsamplingRate, [0.1, 0.2, 0.3])
              .build())

# Bring everything together
validator = smt.CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=eval_f1,
                               numFolds=3)
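
# A minimal sketch of how the validator above might be used. train_df and test_df are assumed
# DataFrames (with a 'label' column and the raw feature columns); they are not defined in the snippet.
cv_model = validator.fit(train_df)
print("Mean F1 per parameter combination:", cv_model.avgMetrics)

# The fitted CrossValidatorModel applies the whole pipeline (scaler, reducer, classifier)
predictions = cv_model.transform(test_df)
print("Held-out F1: {}".format(eval_f1.evaluate(predictions)))
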
Example #4
print(type(test_model))

# In[55]:

display(test_model.select('prediction').distinct())

# We observe that our model does not give any 2-star ratings. Let's check how accurate it is.

# ### 8: Performance Evaluation

# In[58]:

import pyspark.ml.evaluation as ev

# Create an evaluator that compares the prediction column against the label column we need to predict
evaluator = ev.MulticlassClassificationEvaluator(predictionCol='prediction',
                                                 labelCol='review_stars')

# Extract the needed evaluation criteria from the object
print(evaluator.evaluate(test_model, {evaluator.metricName: 'accuracy'}))
# print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))  # areaUnderPR requires a BinaryClassificationEvaluator

# We observe an accuracy of 51 percent on this baseline model
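
# To dig into that 51 percent and the missing 2-star predictions, a per-rating breakdown could be
# built with the same groupBy/pivot pattern used later in this file (test_model assumed as above):
test_model.groupBy('review_stars').pivot('prediction').count().show()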

# In[60]:

get_ipython().run_line_magic('fs', 'ls reviewAnalysisDf.tsv')

# In[61]:

# In[62]:
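
# make_param_sets() is used below but not defined in this snippet. A minimal sketch of what it
# might look like, assuming the grid is a dict mapping parameter names to lists of candidate values:
import itertools

def make_param_sets(param_grid):
    """Expand {"numTrees": [10, 50], "maxDepth": [5, 10]} into a list of parameter dicts."""
    names = list(param_grid.keys())
    return [dict(zip(names, combo))
            for combo in itertools.product(*(param_grid[name] for name in names))]
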
hiperparam_sets = make_param_sets(params["hiperparameter_grid"])
assembler = feature.VectorAssembler(inputCols=feature_columns,
                                    outputCol="features")
count = 0
for hiperparams in hiperparam_sets:
    count = count + 1
    rf_params = hiperparams
    classific = classification.RandomForestClassifier(labelCol="churn",
                                                      **rf_params)
    dt_pipe_md = pipeline.Pipeline(stages=[assembler, classific])
    dt_pipe_md_model = dt_pipe_md.fit(training_df)
    train_predictions_df = dt_pipe_md_model.transform(training_df)
    validation_predictions_df = dt_pipe_md_model.transform(validation_df)
    test_prediction_df = dt_pipe_md_model.transform(test_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")

    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")
    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall",
        labelCol="churn",
        predictionCol="prediction")
    auroc_evaluator = evaluation.BinaryClassificationEvaluator(
        metricName='areaUnderROC', labelCol="churn")

    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")

    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df)
    }

# ### Prediction Result 

# In[70]:


# Run prediction on the whole dataset
df_test_pred1 = forest_model.transform(df_test)
df_test_pred1.show()


# ### Confusion Matrix

# In[71]:


df_test_pred1.groupBy("Success_Failure").pivot("prediction").count().show()


# ### Evaluate

# In[72]:


# Evaluate
evaluator = evaluation.MulticlassClassificationEvaluator(labelCol="success_failure",
                                                         metricName="accuracy",
                                                         predictionCol="prediction")
evaluator.evaluate(df_test_pred1)
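
# The same evaluator can report other multiclass metrics by overriding metricName at evaluate()
# time, mirroring the pattern used earlier in this file (assuming df_test_pred1 as above):
print("accuracy:", evaluator.evaluate(df_test_pred1))
print("f1:", evaluator.evaluate(df_test_pred1, {evaluator.metricName: "f1"}))
print("weightedRecall:", evaluator.evaluate(df_test_pred1, {evaluator.metricName: "weightedRecall"}))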