def validateClassification(self, df, model, modelName=None):
    predictions0 = model.transform(df)
    evaluator = BinaryClassificationEvaluator(metricName=ValidateModels.ROC)
    roc = evaluator.evaluate(predictions0)
    evaluator.setMetricName(ValidateModels.PR)
    pr = evaluator.evaluate(predictions0)
    if modelName is not None:
        self.metrics[modelName][ValidateModels.ROC] = roc
        self.metrics[modelName][ValidateModels.PR] = pr
    return {ValidateModels.ROC: roc, ValidateModels.PR: pr}
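# A minimal sketch of the enclosing class this method appears to assume: the
# ROC/PR constants below are the real BinaryClassificationEvaluator metric
# names, while the class layout itself is an assumption for illustration.
from collections import defaultdict

class ValidateModels:
    ROC = "areaUnderROC"
    PR = "areaUnderPR"

    def __init__(self):
        # metrics[modelName][metricName] -> float, filled by validateClassification
        self.metrics = defaultdict(dict)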
def build_model(source_df, config_df):
    """Fit a cross-validated logistic regression model.

    Args:
        source_df: input DataFrame of raw feature columns.
        config_df: Row of configuration values (randomSplit, seed, regParam,
            elasticNetParam, maxIter, metricName, numFolds).

    Returns:
        Tuple of (cv_model, weights, test_predictions).
    """
    config_dict = config_df.asDict()
    pipeline_builder = PipelineBuilder(source_df, config_dict)
    target_df = pipeline_builder.transform()
    (training_data, test_data) = target_df.randomSplit(
        config_dict['randomSplit'], seed=config_dict['seed'])

    # Create initial LogisticRegression model
    lr = LogisticRegression(labelCol="label", featuresCol="features")

    # Create ParamGrid for Cross Validation
    param_grid = (ParamGridBuilder()
                  .addGrid(lr.regParam, config_dict['regParam'])
                  .addGrid(lr.elasticNetParam, config_dict['elasticNetParam'])
                  .addGrid(lr.maxIter, config_dict['maxIter'])
                  .build())

    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    evaluator.setMetricName(config_dict['metricName'])

    # Create K-fold CrossValidator
    cv = CrossValidator(estimator=lr,
                        estimatorParamMaps=param_grid,
                        evaluator=evaluator,
                        numFolds=config_dict["numFolds"])

    # Run cross validations
    cv_model = cv.fit(training_data)

    # Use test set here so we can measure the accuracy of our model on new data
    test_predictions = cv_model.transform(test_data)
    evaluator.evaluate(test_predictions)

    # Extract weights
    coefficients = cv_model.bestModel.coefficients
    weights = []
    for index, feature in enumerate(pipeline_builder.features()):
        weights.append(
            Row(feature=feature,
                weight=float(coefficients[index]),
                intercept=cv_model.bestModel.intercept))
    return cv_model, weights, test_predictions
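# A hypothetical config_df Row that would satisfy every config_dict lookup in
# build_model above; the concrete values are illustrative assumptions only.
from pyspark.sql import Row

example_config = Row(
    randomSplit=[0.8, 0.2],      # train/test proportions
    seed=42,
    regParam=[0.01, 0.1],        # grid values for lr.regParam
    elasticNetParam=[0.0, 0.5],  # grid values for lr.elasticNetParam
    maxIter=[50, 100],           # grid values for lr.maxIter
    metricName="areaUnderROC",
    numFolds=3,
)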
def validateModel(model, filename):
    test = spark.read.load('../dataset/merged/article/')
    test = test.withColumn('label', test._hyperpartisan.cast('integer'))
    test = model.transform(test)
    ev = BinaryClassificationEvaluator()  # default metric: areaUnderROC
    with open(filename, "a") as file:
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev = MulticlassClassificationEvaluator()  # default metric: f1
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev.setMetricName("weightedPrecision")
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev.setMetricName("weightedRecall")
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev.setMetricName("accuracy")
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
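# A minimal alternative sketch (not from the source): collect the same metrics
# into a dict instead of appending lines to a file. All metric names used here
# are real pyspark evaluator metrics.
def collect_metrics(predictions):
    binary_ev = BinaryClassificationEvaluator()
    multi_ev = MulticlassClassificationEvaluator()
    results = {binary_ev.getMetricName(): binary_ev.evaluate(predictions)}
    for name in ("f1", "weightedPrecision", "weightedRecall", "accuracy"):
        multi_ev.setMetricName(name)
        results[name] = multi_ev.evaluate(predictions)
    return results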
# test_with_prediction.show(5)
test_with_prediction.select("Class", "rawPrediction", "probability", "prediction").show(5)

# **Note:** The resulting DataFrame includes three types of predictions. The
# `rawPrediction` is a vector of log-odds, `probability` is a vector of
# probabilities, and `prediction` is the predicted class based on the
# probability vector.

# Create an instance of the `BinaryClassificationEvaluator` class:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="Class",
                                          metricName="areaUnderROC")
print(evaluator.explainParams())
evaluator.evaluate(test_with_prediction)

# Evaluate using another metric:
evaluator.setMetricName("areaUnderPR").evaluate(test_with_prediction)

# ## Score out a new dataset

# There are two ways to score out a new dataset.

# **Method 1:** The `evaluate` method

# The more expensive way is to use the `evaluate` method of the
# `LogisticRegressionModel` class. The `predictions` attribute of the
# resulting `BinaryLogisticRegressionSummary` instance contains the scored
# DataFrame:
test_with_evaluation = log_reg_model.evaluate(df_test)
test_with_evaluation.predictions.printSchema()
test_with_evaluation.predictions.head(5)
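# **Method 2** (a sketch; this part is not in the excerpt above): the cheaper
# way is the `transform` method, which scores the DataFrame without also
# computing the summary that `evaluate` builds:
test_with_prediction_2 = log_reg_model.transform(df_test)
test_with_prediction_2.printSchema()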
for reg in regs:
    print("Regularization rate: {}".format(reg))
    with main_run.child_run("reg-" + str(reg)) as run:
        lr = LogisticRegression(featuresCol="features", labelCol='label',
                                regParam=reg)
        pipe = Pipeline(stages=[
            stringIndexer, tokenizer, stopwordsRemover, hashingTF, idf, lr
        ])
        model_p = pipe.fit(training_data)

        # make prediction on test_data
        pred = model_p.transform(test_data)

        bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
        au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)
        au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)

        totalCount = pred.count()
        tp = pred.where("prediction == 1 and label == 1").count()
        tn = pred.where("prediction == 0 and label == 0").count()
        fp = pred.where("prediction == 1 and label == 0").count()
        fn = pred.where("prediction == 0 and label == 1").count()

        acc = (tp + tn) / totalCount
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)

        run.log("reg", reg)
        run.log("au_roc", au_roc)
        run.log("au_prc", au_prc)
        run.log("TN", tn)
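# A minimal alternative sketch: each filter/count above triggers its own Spark
# job, while a single groupBy gathers the same confusion-matrix counts in one
# pass. Variable names mirror the loop body; everything else is an assumption.
counts = {(int(r['prediction']), int(r['label'])): r['count']
          for r in pred.groupBy('prediction', 'label').count().collect()}
tp = counts.get((1, 1), 0)
tn = counts.get((0, 0), 0)
fp = counts.get((1, 0), 0)
fn = counts.get((0, 1), 0)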
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500,
                       fitIntercept=True).fit(adulttrain)
lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(
    lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(
    cvmodel.bestModel.transform(adultvalid))

#section 8.2.6
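# Note (a pyspark API fact, not from this source): `weights` is the Spark 1.x
# attribute name; since Spark 2.0 LogisticRegressionModel exposes the same
# vector as `coefficients`, so on current versions the lookups above become:
lrmodel.coefficients
cvmodel.bestModel.coefficients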
# %% [markdown]
# ## Prediction on training data

# %%
pred_training_logr = logr_model.transform(training)
show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']
pred_training_logr.select(show_columns).show(5, truncate=True)

# %% [markdown]
# ## Evaluator

# %%
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area under ROC on training data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_training_logr))

# %% [markdown]
# ## Prediction on test data

# %%
pred_testing_logr = logr_model.transform(testing)
pred_testing_logr.select(show_columns).show(5, truncate=True)

# %%
print('Area under ROC on testing data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_logr))

# %% [markdown]
# ## Confusion Matrix
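# %%
# A minimal sketch for the Confusion Matrix section (the original cell is not
# shown in this excerpt): cross-tabulate labels against predictions.
pred_testing_logr.crosstab('label', 'prediction').show()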
# fitting the model
lrModel = lr.fit(vtrain_df)

# printing the coefficients and intercept for logistic regression
lr_coeff = lrModel.coefficients
print('\nCoefficients: ')
print([round(i, 3) for i in lr_coeff])
print('\nIntercept: ', lrModel.intercept, '\n')

# getting train predictions and accuracy rate / area under ROC / area under PR
evaluator = BinaryClassificationEvaluator()
train_pred = lrModel.transform(vtrain_df)
train_pred.show(5, False)
print('\nEntire Train Predictions DataFrame\n')
train_roc = evaluator.setMetricName('areaUnderROC').evaluate(train_pred)
train_pr = evaluator.setMetricName('areaUnderPR').evaluate(train_pred)
condensed_train_pred = train_pred.select(['label', 'prediction'])
train_acc = round(get_accuracy_rate(condensed_train_pred), 2)
condensed_train_pred.show(10, False)
print('\nCondensed Train Predictions DataFrame\n')
print('Train Accuracy Rate :', train_acc)
print('Train Area Under ROC :', round(train_roc, 4))
print('Train Area Under PR :', round(train_pr, 4), '\n')

# getting test predictions and accuracy rate
test_pred = lrModel.transform(vtest_df)
test_pred.show(5, False)
print('\nEntire Test Predictions DataFrame\n')
test_roc = evaluator.setMetricName('areaUnderROC').evaluate(test_pred)
test_pr = evaluator.setMetricName('areaUnderPR').evaluate(test_pred)
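# `get_accuracy_rate` is called above but not defined in this excerpt; a
# plausible sketch under that assumption (percent of rows whose predicted
# class matches the label):
def get_accuracy_rate(label_pred_df):
    correct = label_pred_df.where('label == prediction').count()
    return correct / label_pred_df.count() * 100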
pipeline_model = pipeline.fit(taxi)
final_columns = feature_columns + ['features', 'label']
taxi_df = pipeline_model.transform(taxi).select(final_columns)
#taxi_df.show(5)

train, test = taxi_df.randomSplit([0.8, 0.2], seed=1234)

random_forest = RandomForestClassifier(featuresCol='features', labelCol='label')

param_grid = ParamGridBuilder().\
    addGrid(random_forest.maxDepth, [2, 3, 4]).\
    addGrid(random_forest.minInfoGain, [0.0, 0.1, 0.2, 0.3]).\
    build()

evaluator = BinaryClassificationEvaluator()

crossvalidation = CrossValidator(estimator=random_forest,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator)
# fit on the training split so the held-out test set stays unseen
crossvalidation_mod = crossvalidation.fit(train)

pred_test = crossvalidation_mod.transform(test)
pred_test.show(5)

label_pred_test = pred_test.select('label', 'prediction')
# counts occurrences of each (label, prediction) pair
label_pred_test.rdd.zipWithIndex().countByKey()

print('Area under ROC : ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_test))
print('Area under PR : ',
      evaluator.setMetricName('areaUnderPR').evaluate(pred_test))
#print('Precision : ', evaluator.setMetricName('precision').evaluate(pred_test))
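# BinaryClassificationEvaluator supports only areaUnderROC and areaUnderPR,
# which is presumably why the 'precision' line above is commented out. A
# sketch of getting precision and accuracy via MulticlassClassificationEvaluator:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                    predictionCol='prediction')
print('Weighted precision : ',
      multi_evaluator.setMetricName('weightedPrecision').evaluate(pred_test))
print('Accuracy           : ',
      multi_evaluator.setMetricName('accuracy').evaluate(pred_test))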
# assembler = VectorAssembler(
#     inputCols=["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"],
#     outputCol="features")
assembler = VectorAssembler(
    inputCols=["LIMIT_BAL", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4",
               "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3",
               "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "SEX_Vec", "MARRIAGE_Vec",
               "AGE_Vec", "EDUCATION_Vec", "PAY_0_Vec", "PAY_2_Vec",
               "PAY_3_Vec", "PAY_4_Vec", "PAY_5_Vec", "PAY_6_Vec"],
    outputCol="features")
output = assembler.transform(trans_df3)

# Split the training & test data
(trainingData, testData) = output.randomSplit([0.7, 0.3])

from pyspark.ml.evaluation import BinaryClassificationEvaluator
binaryEvaluator = BinaryClassificationEvaluator(labelCol="Y",
                                                rawPredictionCol="rawPrediction")
binaryEvaluator.setMetricName("areaUnderROC")

from pyspark.ml.evaluation import RegressionEvaluator
evaluatorRegression = RegressionEvaluator(labelCol="Y",
                                          predictionCol="prediction")
evaluatorRegression.setMetricName("rmse")

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol='Y', maxIter=10, regParam=0.03,
                        elasticNetParam=0.8)
model = lr.fit(trainingData)
print(model.summary.areaUnderROC)

prediction = model.transform(trainingData)
areaTraining = binaryEvaluator.evaluate(prediction)
print("Area Under ROC using Logistic Regression on training data = " +
      str(areaTraining))
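# The excerpt only scores the training split; a short sketch of the natural
# next step, evaluating the held-out testData with the same evaluator:
predictionTest = model.transform(testData)
areaTest = binaryEvaluator.evaluate(predictionTest)
print("Area Under ROC using Logistic Regression on test data = " + str(areaTest))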
print('\n\tFinal Test Dataframe for Logistic Regression\n')

# Start Logistic Regression Model
logisticReg = LogisticRegression(featuresCol='features', labelCol='label',
                                 maxIter=10)
logisticRegModel = logisticReg.fit(vTrainDataFrame)
logisticRegCoeff = logisticRegModel.coefficients
print('\nCoefficients: ')
print([round(i, 3) for i in logisticRegCoeff])
print('\nIntercept: ', logisticRegModel.intercept, '\n')

# Calculate Train Predictions, Accuracy Rate, Area under ROC and Area under PR
evaluator = BinaryClassificationEvaluator()
trainPredict = logisticRegModel.transform(vTrainDataFrame)
trainPredict.show(5, False)
print('\nTraining Predictions DataFrame\n')
trainROC = evaluator.setMetricName('areaUnderROC').evaluate(trainPredict)
trainPR = evaluator.setMetricName('areaUnderPR').evaluate(trainPredict)
condenseTrainPredict = trainPredict.select(['label', 'prediction'])
trainACC = round(getAccuracyRate(condenseTrainPredict), 2)
condenseTrainPredict.show(5, False)
print('\nCondensed Training Predictions DataFrame\n')
print('Training Accuracy Rate:', trainACC)
print('Training Area under ROC:', round(trainROC, 4))
print('Training Area under PR:', round(trainPR, 4), '\n')

# Calculate Test Predictions and Accuracy Rate
testPredict = logisticRegModel.transform(vTestDataFrame)
testPredict.show(5, False)
print('\nTest Predictions DataFrame\n')
testROC = evaluator.setMetricName('areaUnderROC').evaluate(testPredict)
testPR = evaluator.setMetricName('areaUnderPR').evaluate(testPredict)
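# Design note: setMetricName mutates the shared evaluator, so the metric each
# call reports depends on call order when one object is reused. A sketch that
# sidesteps this by dedicating one evaluator per metric:
rocEvaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
prEvaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
testROC = rocEvaluator.evaluate(testPredict)
testPR = prEvaluator.evaluate(testPredict)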
# %%
pred_training_rf = rf_model.transform(training)
show_columns = [
    'features', 'label', 'prediction', 'rawPrediction', 'probability'
]
pred_training_rf.select(show_columns).show(5, truncate=True)

# %% [markdown]
# ## Evaluator

# %%
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area under ROC on training data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_training_rf))

# %% [markdown]
# ## Prediction on test data

# %%
pred_testing_rf = rf_model.transform(testing)
pred_testing_rf.select(show_columns).show(5, truncate=True)

# %%
print('Area under ROC on testing data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_rf))

# %% [markdown]
# ## Confusion Matrix
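# %%
# A short sketch (not in the original excerpt): RandomForestClassificationModel
# also exposes per-feature importances, a useful companion to the AUC numbers.
print(rf_model.featureImportances)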
# step 10
lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True)
bceval = BinaryClassificationEvaluator()
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(n_fold)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, max_iter)\
    .addGrid(lr.regParam, reg_params).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(train)
print(cvmodel.bestModel.coefficients)
print('')
print(cvmodel.bestModel.intercept)
print('')
print(cvmodel.bestModel.getMaxIter())
print('')
print(cvmodel.bestModel.getRegParam())
print('')

# step 11
result11 = bceval.setMetricName('areaUnderROC').evaluate(
    cvmodel.bestModel.transform(valid))
print(result11)

ss.stop()
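# A sketch of inspecting the cross-validation results themselves (this would
# need to run before ss.stop() above): avgMetrics holds the mean metric per
# ParamMap, in the same order as the grid. Note that the
# bestModel.getMaxIter()/getRegParam() calls in step 10 require Spark 3.0+,
# where estimator params are copied onto the fitted model.
for params, metric in zip(cvmodel.getEstimatorParamMaps(), cvmodel.avgMetrics):
    print({p.name: v for p, v in params.items()}, '->', metric)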
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500,
                       fitIntercept=True).fit(adulttrain)
lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(
    lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(
    cvmodel.bestModel.transform(adultvalid))

#section 8.2.6
penschema = StructType([
    StructField("pix1", DoubleType(), True),
def gradientBoosting(df, feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                     maxIter=20, stepSize=0.1, maxDepth=5,
                     overwrite_model=False):
    # Checks if there is a SparkContext running; if so grab that, if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(maxDepth), str(stepSize), str(maxIter)])
    model_path_name = model_dir + 'GradientBoosting/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)
    df = df_temp.select(['label', 'features'])
    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = GBTClassificationModel.load(model_path_name)
    else:
        gbt = GBTClassifier(labelCol="label", featuresCol="features",
                            maxIter=maxIter, stepSize=stepSize,
                            maxDepth=maxDepth)
        model = gbt.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = BinaryClassificationEvaluator()
    evaluator.setMetricName('areaUnderROC')
    print('Evaluating areaUnderROC')
    auc = evaluator.evaluate(predictions)
    evaluator.setMetricName('areaUnderPR')
    print('Evaluating areaUnderPR')
    areaUnderPR = evaluator.evaluate(predictions)

    # test distribution of outputs
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()

    # print outputs
    print('Gradient-Boosted Tree')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(" Test AUC = {}\n".format(auc * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()
    disk_pred = predictions.filter(predictions.label == 0).count()
    cloud_pred = predictions.filter(predictions.label == 1).count()
    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))

    if auc > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'total': total,
            'cloud': (cloud / total) * 100,
            'disk': (disk / total) * 100
        },
        'metrics': {
            'Area Under ROC curve': auc * 100,
            'Area Under PR curve': areaUnderPR * 100
        },
        'error_percentage': {
            'cloud': cloud_misses / cloud_pred * 100,
            'disk': disk_misses / disk_pred * 100
        },
        'params': {
            'Number of Trees': model.getNumTrees,
            'Maximum Depth': maxDepth,
            'Maximum Number of Iterations': maxIter,
            'Step Size': stepSize
        },
        'model_debug': model.toDebugString,
        'name': 'Gradient Boosted Model',
        'features': feature_list
    }
    with open('tmp/temp1.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)
    return metrics, model
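# Hypothetical usage (all names assumed): `df` must already carry a numeric
# `label` column plus the feature columns, and model_dir/os/yaml must be in scope.
# metrics, gbt_model = gradientBoosting(df, feature_list=['BFSIZE', 'HDRSIZE'],
#                                       maxIter=30, stepSize=0.05, maxDepth=4)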
def linearSVC(df, feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
              maxIter=100, regParam=0.0, threshold=0.0,
              overwrite_model=False):
    # Checks if there is a SparkContext running; if so grab that, if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(regParam), str(threshold), str(maxIter)])
    model_path_name = model_dir + 'LinearSVC/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)
    df = df_temp.select(['label', 'features'])
    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = LinearSVCModel.load(model_path_name)
    else:
        lsvc = LinearSVC(maxIter=maxIter, regParam=regParam,
                         threshold=threshold)
        model = lsvc.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = BinaryClassificationEvaluator()
    evaluator.setMetricName('areaUnderROC')
    print('Evaluating areaUnderROC')
    auc = evaluator.evaluate(predictions)
    evaluator.setMetricName('areaUnderPR')
    print('Evaluating areaUnderPR')
    areaUnderPR = evaluator.evaluate(predictions)

    # test distribution of outputs
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()

    # print outputs
    print('Linear SVC')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(" Test AUC = {}\n".format(auc * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()
    disk_pred = predictions.filter(predictions.label == 0).count()
    cloud_pred = predictions.filter(predictions.label == 1).count()
    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))

    if auc > 0.70:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'total': total,
            'cloud': (cloud / total) * 100,
            'disk': (disk / total) * 100
        },
        'metrics': {
            'Area Under ROC curve': auc * 100,
            'Area Under PR curve': areaUnderPR * 100
        },
        'error_percentage': {
            'cloud': cloud_misses / cloud_pred * 100,
            'disk': disk_misses / disk_pred * 100
        },
        'params': {
            'Regularization Parameter': regParam,
            'Maximum Iteration': maxIter,
            'Threshold': threshold
        },
        'name': 'Linear SVC',
        'features': feature_list
    }
    with open('tmp/temp3.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)
    return metrics, model
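# API note (a pyspark fact, not from this source): LinearSVC emits
# `rawPrediction` and `prediction` but no `probability` column, so the default
# BinaryClassificationEvaluator configuration above works for it unchanged:
# predictions.select('rawPrediction', 'prediction').show(5)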