def svm(df, trainingData, testData, maxIterValue, regParamValue, depth, thresholdValue):
    print("\n")
    print("svm")
    svm = LinearSVC(labelCol="G3", featuresCol="features", maxIter=maxIterValue,
                    regParam=regParamValue, aggregationDepth=depth, threshold=thresholdValue)
    # Fit the model
    model = svm.fit(trainingData)
    # Make predictions using our trained model
    predictions = model.transform(testData)
    # Estimate the quality of the predictions
    # Evaluation metrics
    multi_evaluator = MulticlassClassificationEvaluator(
        labelCol="G3", predictionCol="prediction", metricName="accuracy")
    accuracy = multi_evaluator.evaluate(predictions)
    multi_evaluator = multi_evaluator.setMetricName('precisionByLabel')
    precision = multi_evaluator.evaluate(predictions)
    multi_evaluator = multi_evaluator.setMetricName('f1')
    f1_score = multi_evaluator.evaluate(predictions)
    multi_evaluator = multi_evaluator.setMetricName('recallByLabel')
    recall = multi_evaluator.evaluate(predictions)
    bin_evaluator = BinaryClassificationEvaluator(
        labelCol="G3", rawPredictionCol="prediction", metricName="areaUnderROC")
    area = bin_evaluator.evaluate(predictions)
    # results = [["Accuracy", accuracy], ["Precision", precision],
    #            ["Recall", recall], ["F1 Score", f1_score], ["Area under ROC curve", area]]
    print("Accuracy = {}".format(accuracy))
    print("Precision = {}".format(precision))
    print("Recall = {}".format(recall))
    print("F1 score = {}".format(f1_score))
    print("Area under ROC curve = {}".format(area))
    return model
def predict(row):
    # Load the persisted model (the fitted LinearSVCModel is loaded, not the LinearSVC estimator)
    svm = LinearSVCModel.load("Modelo1")
    predictions = svm.transform(row)
    multi_evaluator = MulticlassClassificationEvaluator(
        labelCol="G3", predictionCol="prediction", metricName="accuracy")
    accuracy = multi_evaluator.evaluate(predictions)
    multi_evaluator = multi_evaluator.setMetricName('precisionByLabel')
    precision = multi_evaluator.evaluate(predictions)
    multi_evaluator = multi_evaluator.setMetricName('f1')
    f1_score = multi_evaluator.evaluate(predictions)
    multi_evaluator = multi_evaluator.setMetricName('recallByLabel')
    recall = multi_evaluator.evaluate(predictions)
    bin_evaluator = BinaryClassificationEvaluator(
        labelCol="G3", rawPredictionCol="prediction", metricName="areaUnderROC")
    area = bin_evaluator.evaluate(predictions)
    return (accuracy, precision, f1_score, recall, area)
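# Usage sketch (an assumption, not part of the original script): train with svm(), persist the fitted
# model under the "Modelo1" path that predict() hard-codes, then score a held-out DataFrame.
# df, trainingData and testData are assumed to be prepared elsewhere with "features" and "G3" columns.
from pyspark.ml.classification import LinearSVC, LinearSVCModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

trained_model = svm(df, trainingData, testData, maxIterValue=100,
                    regParamValue=0.1, depth=2, thresholdValue=0.0)
trained_model.write().overwrite().save("Modelo1")
accuracy, precision, f1_score, recall, area = predict(testData)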
def train_model(self, data):
    # Create features vector from multiple columns
    assembler = VectorAssembler(inputCols=self.FEATURE_COLUMNS, outputCol='features', handleInvalid='skip')
    data_with_features_column = assembler.transform(data)
    feature_indexer = VectorIndexer(
        inputCol='features', outputCol='indexed_features').fit(data_with_features_column)
    pipeline = Pipeline(stages=[feature_indexer, self.model])
    train_set, test_set = data_with_features_column.randomSplit([0.8, 0.2])
    # Train the model
    trained_model = pipeline.fit(train_set)
    # Make predictions
    predictions = trained_model.transform(test_set)
    # Output metrics
    evaluator = MulticlassClassificationEvaluator(
        labelCol='end_cluster', predictionCol='prediction')
    evaluator.setMetricName('accuracy')
    accuracy = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    precision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    recall = evaluator.evaluate(predictions)
    evaluator.setMetricName('f1')
    f1 = evaluator.evaluate(predictions)
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')
def evaluate_classification(predictions):
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
    # print(evaluator.explainParams())
    f1 = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    weighted_precision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    weighted_recall = evaluator.evaluate(predictions)
    evaluator.setMetricName('accuracy')
    accuracy = evaluator.evaluate(predictions)
    print()
    print("Test set accuracy = " + str(accuracy))
    print("Test set weightedPrecision = " + str(weighted_precision))
    print("Test set weightedRecall = " + str(weighted_recall))
    print("Test set f1 = " + str(f1))
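# A minimal, self-contained sketch of calling evaluate_classification(); the toy DataFrame below is
# invented for illustration and stands in for the output of a real model's transform().
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy_predictions = spark.createDataFrame(
    [(0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (2.0, 2.0)],
    ["label", "prediction"])
evaluate_classification(toy_predictions)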
display(predDF)

# COMMAND ----------

# MAGIC %md
# MAGIC #Step 4) Collecting Metrics on test dataset

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(metricName="f1", labelCol="label_index")

metricsDF = spark.createDataFrame(
    [("f1", evaluator.evaluate(predDF)),
     ("accuracy", evaluator.setMetricName("accuracy").evaluate(predDF))],
    ["Metric", "Value"])

display(metricsDF)

# COMMAND ----------

from datetime import date

today = date.today()
# dd-mm-YYYY
d1 = today.strftime("%d-%m-%Y")

# COMMAND ----------

import mlflow
import mlflow.tracking
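# COMMAND ----------

# A hedged sketch of the MLflow step that could follow: log the two metrics computed above under a
# run named with the date string d1. The run name and metric keys are assumptions, not from the original notebook.
with mlflow.start_run(run_name="test-metrics-" + d1):
    for metric_name, value in metricsDF.collect():
        mlflow.log_metric(metric_name, value)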
# with text_clean: 0.607
# with text_clean + build_ngrams(n=2): 0.612
bceval = BinaryClassificationEvaluator()
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

# Evaluate the model. metric: Area Under PR...... areaUnderPR:0.732
# with text_clean: 0.728
# with text_clean + build_ngrams(n=2): 0.729
bceval.setMetricName("areaUnderPR")
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

# Evaluate the model. metric: F1 score...... f1:0.865
# with text_clean: 0.858
# with text_clean + build_ngrams(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
print(mceval.getMetricName() + ":" + str(round(mceval.evaluate(preds_valid), 3)))

# Evaluate the model. metric: accuracy...... accuracy:0.866
# with text_clean: 0.859
# with text_clean + build_ngrams(n=2): 0.883
mceval.setMetricName("accuracy")
print(mceval.getMetricName() + ":" + str(round(mceval.evaluate(preds_valid), 3)))

#########
sc.stop()
# Multiclass classification model - evaluation metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator()
# metricName is one of: "f1" | "weightedPrecision" | "weightedRecall" | "accuracy"
evaluator.setMetricName("f1")
evaluator.evaluate(...)

# Regression model - evaluation metrics
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()
# metricName is one of: "rmse" | "mse" | "mae" | "r2"
evaluator.setMetricName("rmse")
evaluator.evaluate(...)

# Clustering model - evaluation metrics (silhouette coefficient)
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(...)
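# To make the cheat sheet above concrete, here is a minimal runnable sketch of the multiclass
# evaluator; the toy (prediction, label) DataFrame is invented purely for illustration.
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.getOrCreate()
scored = spark.createDataFrame(
    [(0.0, 0.0), (1.0, 1.0), (2.0, 1.0), (2.0, 2.0)],
    ["prediction", "label"])

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
for metric in ["f1", "weightedPrecision", "weightedRecall", "accuracy"]:
    print(metric, "=", evaluator.setMetricName(metric).evaluate(scored))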
def randomForest(df, feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                 maxDepth=5, numTrees=20, seed=None, overwrite_model=False):
    # Check if there is a SparkContext running; if so grab it, if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(maxDepth), str(numTrees)])
    model_path_name = model_dir + 'RandomForest/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
    df_temp = vector_assembler.transform(df)
    df = df_temp.select(['label', 'features'])
    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = RandomForestClassificationModel.load(model_path_name)
    else:
        rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                                    numTrees=numTrees, maxDepth=maxDepth, seed=seed)
        model = rf.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    # metricName: f1 | weightedPrecision | weightedRecall | accuracy
    evaluator.setMetricName('accuracy')
    print('Evaluating accuracy')
    accuracy = evaluator.evaluate(predictions)
    evaluator.setMetricName('f1')
    print('Evaluating f1')
    f1 = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    print('Evaluating weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    print('Evaluating weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)
    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # print outputs
    print('Random Forests')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(' Tape %{}\n'.format((tape / total) * 100))
    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()
    tape_pred = predictions.filter(predictions.label == 0).count()
    disk_pred = predictions.filter(predictions.label == 1).count()
    cloud_pred = predictions.filter(predictions.label == 2).count()
    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))
    print(' Tape Misses %{}'.format((tape_misses / tape_pred) * 100))

    if accuracy > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
            else:
                pass
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'Total': total,
            'Cloud': (cloud / total) * 100,
            'Disk': (disk / total) * 100,
            'Tape': (tape / total) * 100
        },
        'metrics': {
            'Accuracy': accuracy * 100,
            'f1': f1 * 100,
            'Weighted Precision': weightedPrecision * 100,
            'Weighted Recall': weightedRecall * 100
        },
        'error_percentage': {
            'Cloud': cloud_misses / cloud_pred * 100,
            'Disk': disk_misses / disk_pred * 100,
            'Tape': tape_misses / tape_pred * 100
        },
        'params': {
            'Number of Trees': model.getNumTrees,
            'Maximum Depth': maxDepth
        },
        'model_debug': model.toDebugString,
        'name': 'Random Forest Model',
        'features': feature_list
    }

    with open('tmp/temp.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model
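# A hedged usage sketch: call randomForest() on a DataFrame that already has a numeric `label`
# column (0 = tape, 1 = disk, 2 = cloud) and the default feature columns, then read back the metrics
# the function dumps to tmp/temp.yml. The parameter values are illustrative only.
import yaml

metrics, model = randomForest(df, maxDepth=8, numTrees=50, seed=42, overwrite_model=True)
with open('tmp/temp.yml') as infile:
    print(yaml.safe_load(infile)['metrics'])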
display(testPredDF.orderBy("probability"))

# COMMAND ----------

display(testPredDF.filter("label != prediction"))

# COMMAND ----------

# DBTITLE 1,Confusion Matrix (more False Negatives)
display(testPredDF.groupBy("label", "prediction").count())

# COMMAND ----------

# DBTITLE 1,Evaluate
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')

metricsDF = spark.createDataFrame(
    [("f1", evaluator.evaluate(testPredDF)),
     ("accuracy", evaluator.setMetricName("accuracy").evaluate(testPredDF))],
    ["Metric", "Value"])

display(metricsDF)

# COMMAND ----------
def process(time, rdd):
    print("=========*********************************************** %s ***********************************************============" % str(time))
    print("\n")
    if not rdd.isEmpty():
        df = spark.createDataFrame(rdd, ["label", "text"])
        print("=========***********************************************$ Raw Data From Stream $***********************************************=========")
        df.show()

        pipeline_data = loded_pipeline.transform(df)
        print("\n")
        print("=========***********************************************$ Transformed Data After Running Pre Loaded Pipeline $***********************************************=========")
        pipeline_data.show()

        print("\n\n")
        print("=========***********************************************$ Classification Using Pre Trained Logistic Classification Model $***********************************************=========")
        predictions = saved_logistic_model.transform(pipeline_data)

        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
        f1 = evaluator.setMetricName("f1").evaluate(predictions)
        weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
        weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
        accuracyLogistic = evaluator.setMetricName("accuracy").evaluate(predictions)

        predictions = predictions.select("label", "prediction")
        predictions = predictions.withColumn("Current Stream Accuracy %", lit(str(accuracyLogistic * 100) + "%"))
        predictions = predictions.withColumn("Current Stream Error %", lit(str((1.0 - accuracyLogistic) * 100) + "%"))
        predictions = predictions.withColumn("Current Stream F1 Score", lit(str(f1)))
        predictions = predictions.withColumn("Current Stream weightedRecall", lit(str(weightedRecall)))
        predictions = predictions.withColumn("Current Stream weightedPrecision", lit(str(weightedPrecision)))

        # To print the DataFrame schema for debugging
        # predictions.printSchema()

        label = mapSpeciesTypeWithNumericLabel(predictions.select("prediction").first())
        labelInitial = mapSpeciesTypeWithNumericLabel(predictions.select("label").first())

        global total_count_logistic_classification
        global correct_count_logistic_classification
        total_count_logistic_classification = total_count_logistic_classification + 1
        if labelInitial == label:
            correct_count_logistic_classification = correct_count_logistic_classification + 1
        overall_accuracy_percent = (float(correct_count_logistic_classification) /
                                    float(total_count_logistic_classification)) * 100

        predictions = predictions.withColumn("News_Category_Predicted", lit(str(label)))
        predictions = predictions.withColumn("News_Category_InitalLabel", lit(str(labelInitial)))
        predictions.show()

        # Overall Stats
        total_predictions = predictions.select("label")
        total_predictions = total_predictions.withColumn("Overall Correct Count", lit(str(correct_count_logistic_classification)))
        total_predictions = total_predictions.select("Overall Correct Count")
        total_predictions = total_predictions.withColumn("Total Count", lit(str(total_count_logistic_classification)))
        total_predictions = total_predictions.withColumn("Overall Accuracy Percent(%)", lit(str(overall_accuracy_percent) + "%"))
        total_predictions = total_predictions.withColumn("Overall Error Percent(%)", lit(str(100 - overall_accuracy_percent) + "%"))

        print("\n")
        print("=========***********************************************$ Overall Classification Metrics Logistic Classification Model $***********************************************=========")
        total_predictions.show()

        # print("Test Error for Naive Bayes :" + str((1.0 - accuracyNaiveBayes) * 100) + "%")
        # print("Test Accuracy for Naive Bayes :" + str((accuracyNaiveBayes) * 100) + "%")
        # print("Test weightedRecall for Naive Bayes :" + str(weightedRecall))
        # print("Test weightedPrecision for Naive Bayes :" + str(weightedPrecision))
        # print("Test f1 score for Naive Bayes :" + str(f1))

        # Naive Bayes Model Classification
        print("\n\n")
        print("=========***********************************************$ Classification Using Pre Trained Naive Bayes Classification Model $***********************************************=========")
        print("\n")
        naive_bayes_predictions = saved_naive_bayes_model.transform(pipeline_data)

        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
        f1 = evaluator.setMetricName("f1").evaluate(naive_bayes_predictions)
        weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(naive_bayes_predictions)
        weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(naive_bayes_predictions)
        accuracyNaiveBayes = evaluator.setMetricName("accuracy").evaluate(naive_bayes_predictions)

        naive_bayes_predictions = naive_bayes_predictions.select("label", "prediction")
        naive_bayes_predictions = naive_bayes_predictions.withColumn("Current Stream Accuracy %", lit(str(accuracyNaiveBayes * 100) + "%"))
        naive_bayes_predictions = naive_bayes_predictions.withColumn("Current Stream Error %", lit(str((1.0 - accuracyNaiveBayes) * 100) + "%"))
        naive_bayes_predictions = naive_bayes_predictions.withColumn("Current Stream F1 Score", lit(str(f1)))
        naive_bayes_predictions = naive_bayes_predictions.withColumn("Current Stream weightedRecall", lit(str(weightedRecall)))
        naive_bayes_predictions = naive_bayes_predictions.withColumn("Current Stream weightedPrecision", lit(str(weightedPrecision)))

        # To print the DataFrame schema for debugging
        # predictions.printSchema()

        label_naive_bayes = mapSpeciesTypeWithNumericLabel(naive_bayes_predictions.select("prediction").first())
        labelInitial_naive_bayes = mapSpeciesTypeWithNumericLabel(naive_bayes_predictions.select("label").first())

        # Loading Global Variables
        global total_count_naive_bayes_classification
        global correct_count_naive_bayes_classification
        total_count_naive_bayes_classification = total_count_naive_bayes_classification + 1
        if label_naive_bayes == labelInitial_naive_bayes:
            correct_count_naive_bayes_classification = correct_count_naive_bayes_classification + 1
        overall_accuracy_naive_bayes_percent = (float(correct_count_naive_bayes_classification) /
                                                float(total_count_naive_bayes_classification)) * 100

        naive_bayes_predictions = naive_bayes_predictions.withColumn("News_Category_Predicted", lit(str(label_naive_bayes)))
        naive_bayes_predictions = naive_bayes_predictions.withColumn("News_Category_InitalLabel", lit(str(labelInitial_naive_bayes)))
        naive_bayes_predictions.show()
        print("\n")

        # Overall Stats
        total_naive_bayes_predictions = naive_bayes_predictions.select("label")
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn("Overall Correct Count", lit(str(correct_count_naive_bayes_classification)))
        total_naive_bayes_predictions = total_naive_bayes_predictions.select("Overall Correct Count")
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn("Total Count", lit(str(total_count_naive_bayes_classification)))
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn("Overall Accuracy Percent(%)", lit(str(overall_accuracy_naive_bayes_percent) + "%"))
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn("Overall Error Percent(%)", lit(str(100 - overall_accuracy_naive_bayes_percent) + "%"))

        print("=========***********************************************$ Overall Classification Metrics Naive Bayes Classification Model $***********************************************=========")
        total_naive_bayes_predictions.show()
        print("\n")
        print("=========*********************************************** End of Single Stream ***********************************************=========")
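# A hedged sketch of how process() is typically wired into a DStream; the socket source, batch
# interval, and "label<TAB>text" line format are assumptions, not part of the original code.
from pyspark.streaming import StreamingContext

ssc = StreamingContext(spark.sparkContext, batchDuration=10)
lines = ssc.socketTextStream("localhost", 9999)
labelled = lines.map(lambda line: line.split("\t", 1)).map(lambda parts: (float(parts[0]), parts[1]))
labelled.foreachRDD(process)  # foreachRDD passes (batch time, rdd) to process
ssc.start()
ssc.awaitTermination()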
# Logistic Regression Classification
lr = LogisticRegression(maxIter=25, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("text", "index", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error for Logistic Regression :" + str((1.0 - accuracy) * 100) + "%")
print("Test Accuracy for Logistic Regression :" + str(accuracy * 100) + "%")

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName='f1')
f1 = evaluator.setMetricName("f1").evaluate(predictions)
weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
print("Test weightedRecall for Logistic Regression :" + str(weightedRecall))
print("Test weightedPrecision for Logistic Regression :" + str(weightedPrecision))
print("Test f1 score for Logistic Regression :" + str(f1))

# Save model
save_model_path = output_folder_path + "LogisticClassificationModel"
lrModel.write().overwrite().save(save_model_path)
print("Logistic Classification Model successfully trained and saved in the project output directory")
# %%
pred_test = crossvalidation_mode.transform(testing)
pred_test.show(5)

# %% [markdown]
# ## Best model from cross validation

# %%
print("Best value of the smoothing parameter:",
      crossvalidation_mode.bestModel._java_obj.getSmoothing())

# %% [markdown]
# ### Prediction accuracy on train data

# %%
print('training data (f1):', evaluator.setMetricName('f1').evaluate(pred_train), "\n",
      'training data (weightedPrecision): ', evaluator.setMetricName('weightedPrecision').evaluate(pred_train), "\n",
      'training data (weightedRecall): ', evaluator.setMetricName('weightedRecall').evaluate(pred_train), "\n",
      'training data (accuracy): ', evaluator.setMetricName('accuracy').evaluate(pred_train))

# %% [markdown]
# ### Prediction accuracy on test data

# %%
print('test data (f1):', evaluator.setMetricName('f1').evaluate(pred_test), "\n",
      'test data (weightedPrecision): ', evaluator.setMetricName('weightedPrecision').evaluate(pred_test), "\n",
      'test data (weightedRecall): ', evaluator.setMetricName('weightedRecall').evaluate(pred_test), "\n",
      'test data (accuracy): ', evaluator.setMetricName('accuracy').evaluate(pred_test))

# %% [markdown]
# ## Confusion matrix
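# %%
# The confusion-matrix cell itself is not shown above; this is a minimal sketch of one way to build
# it from pred_test in Spark (the column names "label" and "prediction" are assumed).
pred_test.groupBy('label').pivot('prediction').count().fillna(0).orderBy('label').show()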
print("\nModels Evaluation:") print("{:-<24}".format("")) for idx, c in enumerate(classifiers): print(c) # fit the model model = classifiers[c].fit(train_set) # make predictions predictions = model.transform(test_set) predictions.cache() # evaluate performance evaluator = MulticlassClassificationEvaluator(labelCol="Label_Idx", predictionCol="prediction") for m in metrics: evaluator.setMetricName(m) metric = evaluator.evaluate(predictions) print("{name} = {value:.2f}".format(name=m, value=metric)) # Build confusion matrix using Scikit-learn (sktlearn) target_list = predictions.select("Label_Idx").rdd.flatMap(lambda x: x).collect() pred_list = predictions.select("prediction").rdd.flatMap(lambda x: x).collect() label_num_list = predictions.select("Label_Idx").distinct().orderBy("Label_Idx").rdd.flatMap(lambda x: x).collect() # print("\nClassification report using Sklearn:") # print(classification_report(target_list, pred_list, target_names=label_list)) conf_matrix = confusion_matrix(target_list, pred_list, label_num_list) plt.figure(idx) plt.title("Confusion matrix - {model}".format(model=c)) sns.heatmap(conf_matrix.T, square=True, annot=True, fmt='d', cbar=False, annot_kws={"size": 7.5}, xticklabels=label_list, yticklabels=label_list) plt.xlabel('true label')
ax0, ax1 = axList
ax0.set_title('First Model', color='#999999')
ax1.set_title('Second Model', color='#999999')
generateROC(axList[0], labelsAndScores)
generateROC(axList[1], labelsAndScores2)
display(fig)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 'precision' was a valid metricName in the older Spark release this notebook targets;
# newer releases use 'accuracy' or 'weightedPrecision' instead.
metric = 'precision'

multiclassEval = MulticlassClassificationEvaluator()
multiclassEval.setMetricName(metric)
print('Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions)))
print('Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2)))

# COMMAND ----------

import inspect
print(inspect.getsource(MulticlassClassificationEvaluator))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC
# MAGIC We've been using `ml` transformers, estimators, pipelines, and evaluators. How can we accomplish the same things with MLlib?
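# COMMAND ----------

# One possible answer, sketched as an assumption rather than the course's own solution: the RDD-based
# pyspark.mllib.evaluation.MulticlassMetrics computes the same statistics from an RDD of
# (prediction, label) pairs taken from irisTestPredictions.
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = irisTestPredictions.select('prediction', 'label') \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))
mllibMetrics = MulticlassMetrics(predictionAndLabels)
print('Model one weighted precision: {0:.3f}'.format(mllibMetrics.weightedPrecision))
print('Model one weighted recall: {0:.3f}'.format(mllibMetrics.weightedRecall))
print(mllibMetrics.confusionMatrix())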
def randomForest(df, feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'], maxDepth=5, numTrees=20, seed=None):
    # Check if there is a SparkContext running; if so grab it, if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    print('Sanity check that the label is not being used in the feature_list: {}'.format(feature_list))

    vector_assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
    df_temp = vector_assembler.transform(df)
    df = df_temp.select(['label', 'features'])
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                                numTrees=numTrees, maxDepth=maxDepth, seed=seed)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    # predictions.select("prediction", "label").show(100)
    # df.select('label').distinct().show()

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    # metricName: f1 | weightedPrecision | weightedRecall | accuracy
    evaluator.setMetricName('accuracy')
    accuracy = evaluator.evaluate(predictions)
    evaluator.setMetricName('f1')
    f1 = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)
    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # print outputs
    print('Random Forests')
    print(feature_list)
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(' Tape %{}\n'.format((tape / total) * 100))
    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()
    print(' Cloud Misses %{}'.format((cloud_misses / cloud) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk) * 100))
    print(' Tape Misses %{}'.format((tape_misses / tape) * 100))

    # plt.xlabel("FPR", fontsize=14)
    # plt.ylabel("TPR", fontsize=14)
    # plt.title("ROC Curve", fontsize=14)
    # plt.plot(fp[0:250], tp, linewidth=2)
    # buf = io.BytesIO()
    # plt.savefig(buf, format='png')
    # buf.seek(0)
    # image = tf.image.decode_png(buf.getvalue(), channels=4)
    # image = tf.expand_dims(image, 0)
    # summary_op = tf.summary.image("ROC Curve", image)

    return accuracy, 'Random Forests: {}'.format(accuracy), model
def multinomialRegression(df, feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                          maxIter=100, regParam=0.0, elasticNetParam=0.0,
                          threshold=0.5, overwrite_model=False):
    # Check if there is a SparkContext running; if so grab it, if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(regParam), str(elasticNetParam), str(maxIter), str(threshold)])
    model_path_name = model_dir + 'MultinomialRegression/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
    df_temp = vector_assembler.transform(df)
    df = df_temp.select(['label', 'features'])
    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = LogisticRegressionModel.load(model_path_name)
    else:
        lr = LogisticRegression(labelCol="label", maxIter=maxIter,
                                regParam=regParam, elasticNetParam=elasticNetParam)
        model = lr.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    evaluator.setMetricName('accuracy')
    print('Evaluating accuracy')
    accuracy = evaluator.evaluate(predictions)
    evaluator.setMetricName('f1')
    print('Evaluating f1')
    f1 = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    print('Evaluating weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    print('Evaluating weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)
    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # print outputs
    print('Multinomial Regression Classification')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(' Tape %{}\n'.format((tape / total) * 100))
    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()
    tape_pred = predictions.filter(predictions.label == 0).count()
    disk_pred = predictions.filter(predictions.label == 1).count()
    cloud_pred = predictions.filter(predictions.label == 2).count()
    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))
    print(' Tape Misses %{}'.format((tape_misses / tape_pred) * 100))

    if accuracy > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
            else:
                pass
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'Total': total,
            'Cloud': (cloud / total) * 100,
            'Disk': (disk / total) * 100,
            'Tape': (tape / total) * 100
        },
        'metrics': {
            'Accuracy': accuracy * 100,
            'f1': f1 * 100,
            'Weighted Precision': weightedPrecision * 100,
            'Weighted Recall': weightedRecall * 100
        },
        'error_percentage': {
            'Cloud': cloud_misses / cloud_pred * 100,
            'Disk': disk_misses / disk_pred * 100,
            'Tape': tape_misses / tape_pred * 100
        },
        'params': {
            'Regularization Parameter': regParam,
            'Maximum Iteration': maxIter,
            'ElasticNet Mixing Parameter': elasticNetParam,
            'Threshold': threshold
        },
        'name': 'Multinomial Regression Classification',
        'features': feature_list
    }

    with open('tmp/temp2.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model