def PRCurve(model, X_train, X_test, y_train, y_test):
    '''
    Visualize the precision-recall curve for a binary classifier and
    return its average precision score. The model is fit on
    X_train, y_train and scored on X_test, y_test.
    '''
    viz = PrecisionRecallCurve(model)
    viz.fit(X_train, y_train)
    avg_prec = viz.score(X_test, y_test)
    plt.legend(labels=['Binary PR Curve', "AP=%.3f" % avg_prec],
               loc='lower right', prop={'size': 14})
    plt.xlabel(xlabel='Recall', size=14)
    plt.ylabel(ylabel='Precision', size=14)
    plt.title(label='Precision Recall Curve', size=16)
    return avg_prec  # average precision, as documented
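# A minimal usage sketch for PRCurve above, assuming matplotlib, yellowbrick's
# PrecisionRecallCurve, and the function itself are in scope; the synthetic
# data and the LogisticRegression estimator are illustrative assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
ap = PRCurve(LogisticRegression(max_iter=1000), X_train, X_test, y_train, y_test)
print("Average precision: %.3f" % ap)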
def eva_model(c, n, X, y, X_test, y_test, class_names, outdir):
    model = svm.LinearSVC(class_weight='balanced', dual=False, max_iter=10000, C=c)
    rfe = RFE(model, n_features_to_select=n)

    ## learning curve
    plt.clf()
    viz_LC = LearningCurve(rfe, scoring='f1_weighted', n_jobs=4)
    viz_LC.fit(X, y)
    viz_LC.show(outpath=outdir + '/LC.png')

    ## classification report
    plt.clf()
    viz_CR = ClassificationReport(rfe, classes=class_names, support=True)
    viz_CR.fit(X, y)
    viz_CR.score(X_test, y_test)
    viz_CR.show(outpath=outdir + '/CR.png')

    ## confusion matrix
    plt.clf()
    viz_CM = ConfusionMatrix(rfe, classes=class_names)
    viz_CM.fit(X, y)
    viz_CM.score(X_test, y_test)
    viz_CM.show(outpath=outdir + '/CM.png')

    ## precision recall curve
    plt.clf()
    viz_PRC = PrecisionRecallCurve(rfe, per_class=True, iso_f1_curves=True,
                                   fill_area=False, micro=False, classes=class_names)
    viz_PRC.fit(X, y)
    viz_PRC.score(X_test, y_test)
    viz_PRC.show(outpath=outdir + '/PRC.png', size=(1080, 720))

    ## class prediction error
    plt.clf()
    viz_CPE = ClassPredictionError(rfe, classes=class_names)
    viz_CPE.fit(X, y)
    viz_CPE.score(X_test, y_test)
    viz_CPE.show(outpath=outdir + '/CPE.png')

    ## ROCAUC
    plt.clf()
    viz_RA = ROCAUC(rfe, classes=class_names, size=(1080, 720))
    viz_RA.fit(X, y)
    viz_RA.score(X_test, y_test)  # score on the held-out test set, like the other visualizers
    viz_RA.show(outpath=outdir + '/RA.png')

    ## weighted F1 on the test set and the features retained by RFE
    fit = rfe.fit(X, y)
    y_predict = fit.predict(X_test)
    f1 = f1_score(y_test, y_predict, average='weighted')
    features_retained_RFE = X.columns[rfe.get_support()].values
    feature_df = pd.DataFrame(features_retained_RFE.tolist())
    feature_df.to_csv(outdir + '/features.csv', sep='\t', index=False)
    return f1
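# A hedged usage sketch for eva_model: sweep a few (C, n_features) settings
# and keep the best weighted F1. The value grids, directory layout, and the
# sweep_eva_model name are illustrative assumptions.
import itertools
import os

def sweep_eva_model(X, y, X_test, y_test, class_names, outdir='eva_out'):
    best_params, best_f1 = None, -1.0
    for c, n in itertools.product([0.01, 0.1, 1.0], [5, 10, 20]):
        subdir = os.path.join(outdir, 'C%s_n%d' % (c, n))
        os.makedirs(subdir, exist_ok=True)  # eva_model writes its plots here
        f1 = eva_model(c, n, X, y, X_test, y_test, class_names, subdir)
        if f1 > best_f1:
            best_params, best_f1 = (c, n), f1
    return best_params, best_f1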
def draw_binary(outpath=None):
    _, ax = plt.subplots(figsize=(9, 6))
    X_train, X_test, y_train, y_test = load_binary(split=True)

    oz = PrecisionRecallCurve(RidgeClassifier(), ax=ax)
    oz.fit(X_train, y_train)
    oz.score(X_test, y_test)
    oz.poof(outpath=outpath)
def precision_recall(model, classes, X_train, Y_train, X_test, Y_test):
    from yellowbrick.classifier import PrecisionRecallCurve

    # Create the visualizer, fit, score, and poof it
    viz = PrecisionRecallCurve(model)
    viz.fit(X_train, Y_train)
    viz.score(X_test, Y_test)
    viz.poof()
def precision_recall_f1(model, classes, X_train, Y_train, X_test, Y_test):
    from yellowbrick.classifier import PrecisionRecallCurve

    viz = PrecisionRecallCurve(model, per_class=True, iso_f1_curves=True,
                               fill_area=False, micro=False)
    viz.fit(X_train, Y_train)
    viz.score(X_test, Y_test)
    viz.poof()
def draw_multiclass(outpath=None, simple=True):
    _, ax = plt.subplots(figsize=(9, 6))
    X_train, X_test, y_train, y_test = load_multiclass()

    if simple:
        oz = PrecisionRecallCurve(RandomForestClassifier(), ax=ax)
    else:
        oz = PrecisionRecallCurve(MultinomialNB(), ax=ax, per_class=True,
                                  iso_f1_curves=True, fill_area=False, micro=False)

    oz.fit(X_train, y_train)
    oz.score(X_test, y_test)
    oz.poof(outpath=outpath)
def evaluate(df, modelv="gnb", race="W", census=False, report=True, roc=True, pr=True):
    """
    Run model evaluations for a specified model and race class
    """
    # get model
    models = joblib.load(DIR + "/data/models/models_binary_%s%s.joblib" % (modelv, model_string))
    model = models[race]

    # get data
    df = prep_data(df)
    tes = joblib.load(DIR + "/data/models/transformers_binary.joblib")

    # transform data
    for col in ["first_name", "last_name", "middle_name"]:
        te = tes[race][col]
        df[col] = te.transform(df[col])
        df[col] = df[col].fillna(0)

    tmpa = np.where(df.race_code == race, True, False)
    df = df.fillna(0)

    # run specified evaluation visualizers
    if report:
        visualizer = ClassificationReport(model, classes=model.classes_, support=True)
        visualizer.score(df[MODEL_COLS], tmpa)
        visualizer.show()
    if roc:
        visualizer = ROCAUC(model, classes=["W", "not-W"])
        visualizer.score(df[MODEL_COLS], tmpa)
        visualizer.show()
    if pr:
        viz = PrecisionRecallCurve(model, is_fitted=True, classes=["W", "not-W"])
        viz.score(df[MODEL_COLS], tmpa)
        viz.show()
def plot_precision_recall_curve_1(X_train, y_train, X_test, y_test, model):
    """
    Plot a precision-recall curve for the given model.

    :param X_train: training set
    :param y_train: training set target
    :param X_test: test set
    :param y_test: test set target
    :param model: model to analyze performance for
    :return: the rendered precision-recall curve axes
    """
    viz = PrecisionRecallCurve(model)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    return viz.show()
def __init__(self, X_train, X_test, y_train, y_test, labels, model, viz_selection, upsampled=False):
    """
    Class for yellowbrick classifier visualizer

    Args:
        X_train: numpy ndarray of model features training data values
        X_test: numpy ndarray of model features test data values
        y_train: numpy ndarray of model target variable training data values
        y_test: numpy ndarray of model target variable test data values
        labels: list of class labels for binary classification
        model: sklearn estimator for classification
        viz_selection: string value used to reference yellowbrick classification visualizer
        upsampled: binary value to determine to which subdirectory output image should be saved
    """
    self.labels = labels
    self.model = model
    self.viz_selection = viz_selection
    self.upsampled = upsampled
    self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test

    if self.viz_selection == 'ClassificationReport':
        self.visualizer = ClassificationReport(self.model, classes=self.labels, support=True)
    elif self.viz_selection == 'ROCAUC':
        self.visualizer = ROCAUC(self.model, classes=self.labels)
    elif self.viz_selection == 'PrecisionRecallCurve':
        self.visualizer = PrecisionRecallCurve(self.model)
    elif self.viz_selection == 'ConfusionMatrix':
        self.visualizer = ConfusionMatrix(self.model, classes=self.labels)
    else:
        raise ValueError(
            "viz_selection does not match accepted values. "
            "View Visualizer class for accepted values."
        )
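# A hypothetical companion method for the visualizer class above (the method
# name, output paths, and the os import are assumptions for illustration):
def evaluate(self):
    """Fit the wrapped visualizer, score it on the test split, and save the image."""
    import os
    self.visualizer.fit(self.X_train, self.y_train)
    self.visualizer.score(self.X_test, self.y_test)
    subdir = 'upsampled' if self.upsampled else 'original'
    outpath = os.path.join('images', subdir, '%s.png' % self.viz_selection)
    os.makedirs(os.path.dirname(outpath), exist_ok=True)
    self.visualizer.show(outpath=outpath)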
def eval_models(df, race="W", models=["gnb", "rf", "xgb"], census=False,
                report=False, roc=False, pr=False, cpe=False):
    """
    Run evaluation on a set of models and a single race class
    """
    df = prep_data(df)
    tes = joblib.load(DIR + "/data/models/transformers_binary.joblib")

    for col in ["first_name", "last_name", "middle_name"]:
        te = tes[race][col]
        df[col] = te.transform(df[col])
        df[col] = df[col].fillna(0)

    tmpa = np.where(df.race_code == race, True, False)
    df = df.fillna(0)

    for modelv in models:
        # load the trained model store for this model version
        model_store = joblib.load(DIR + "/data/models/models_binary_%s%s.joblib" % (modelv, model_string))
        model = model_store[race]
        model.target_type_ = "binary"

        if report:
            visualizer = ClassificationReport(model, classes=model.classes_, support=True)
            visualizer.score(df[MODEL_COLS], tmpa)
            visualizer.show()
        if roc:
            visualizer = ROCAUC(model, classes=["W", "not-W"])
            visualizer.score(df[MODEL_COLS], tmpa)
            visualizer.show()
        if pr:
            viz = PrecisionRecallCurve(model, is_fitted=True, classes=["W", "not-W"])
            viz.score(df[MODEL_COLS], tmpa)
            viz.show()
        if cpe:
            viz = ClassPredictionError(model)
            viz.score(df[MODEL_COLS], tmpa)
            viz.show()
from yellowbrick.classifier import ClassificationReport
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
visualizer = ClassificationReport(model, support=True)
visualizer.fit(X_train, y_train)    # Fit the visualizer and the model
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.show()

from yellowbrick.classifier import PrecisionRecallCurve

visualizer = PrecisionRecallCurve(GaussianNB())
visualizer.fit(X_train, y_train)    # Fit the visualizer and the model
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
visualizer.show()

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
visualizer = PrecisionRecallCurve(clf)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
# Specify a threshold
y_pred_thresh = rf.predict_proba(X_test)[:, 1] > 0.85  # 0.85 as threshold
print(classification_report(y_test, y_pred_thresh))

### Precision-Recall curve
from scikitplot.metrics import plot_precision_recall

rf_probas = rf.predict_proba(X_test)  # scikit-plot expects the full (n_samples, n_classes) probability matrix
plot_precision_recall(y_test, rf_probas)

from yellowbrick.classifier import PrecisionRecallCurve

viz = PrecisionRecallCurve(rf)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.poof()

# Discrimination Threshold - probability or score at which the positive class
# is chosen over the negative class
from yellowbrick.classifier import DiscriminationThreshold

viz = DiscriminationThreshold(rf)
viz.fit(X_train, y_train)
viz.poof()

# Average Precision
from sklearn.metrics import average_precision_score
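# Average precision can be computed directly from the positive-class
# probabilities; a minimal sketch using the rf_probas matrix from above:
ap = average_precision_score(y_test, rf_probas[:, 1])
print("Average precision: %.3f" % ap)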
# Run model with 4-fold cross validation. Report mean accuracy.
scores = cross_val_score(mlp, X_train, y_train, cv=4)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Plot ROC, AUC.
classes = ["Normal", "Pre-Ictal", "Seizure"]
visualizer = ROCAUC(mlp, classes=classes)
visualizer.fit(X_train, y_train)     # Fit the training data to the visualizer
visualizer.score(X_test, y_test)     # Evaluate the model on the test data
ROC_title = "ROCAUC_{}.png".format(animal_id)
visualizer.poof(outpath=ROC_title)   # Save plot with unique title

# Plot the precision-recall curve.
viz = PrecisionRecallCurve(mlp)
viz.fit(X_train, y_train)            # Fit the training data to the visualizer
viz.score(X_test, y_test)            # Evaluate the model on the test data
PR_title = "PR_{}.png".format(animal_id)
viz.poof(outpath=PR_title)           # Save plot with unique title

# Plot loss curve aka cost function.
loss_values = mlp.loss_curve_
plt.plot(loss_values)
Loss_title = "Loss_{}.png".format(animal_id)
plt.savefig(Loss_title)              # save before show(), which clears the figure
plt.show()

sys.stdout.close()
def score_model_outcome(X_train, y_train, X_test, y_test, model, **kwargs):
    """
    Report accuracy, F1, and AUC on the train and test sets, and draw
    confusion matrix, classification report, precision-recall, and class
    prediction error plots. This function is for prognosis.

    Parameters
    ----------
    X_train: matrix of training features
    y_train: vector of training labels
    X_test: matrix of test features
    y_test: vector of test labels
    model: classifier to fit and evaluate

    Returns
    -------
    - Accuracy, F1 score and ROC_AUC for the train and test set
    - Confusion matrix
    - ClassificationReport
    - PrecisionRecallCurve
    - ClassPredictionError
    """
    # Train the model
    model.fit(X_train, y_train, **kwargs)

    # Predict on the train set
    prediction_train = model.predict(X_train)

    # Compute metrics for the train set
    accuracy_train = accuracy_score(y_train, prediction_train)

    # False Positive Rate, True Positive Rate, Threshold
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, prediction_train)
    auc_train = auc(fpr_train, tpr_train)
    f1_score_train = f1_score(y_train, prediction_train)  # harmonic mean of precision and recall

    # Predict on the test set
    prediction_test = model.predict(X_test)
    accuracy_test = accuracy_score(y_test, prediction_test)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, prediction_test)
    auc_test = auc(fpr_test, tpr_test)
    f1_score_test = f1_score(y_test, prediction_test)

    print("{}:".format(model.__class__.__name__))
    print("On training we get an Accuracy {}, an AUC {} and F1 score {}".format(
        accuracy_train, auc_train, f1_score_train))
    print("For test we get an Accuracy {}, an AUC {} and F1 score {}".format(
        accuracy_test, auc_test, f1_score_test))

    fig, axes = plt.subplots(3, 2, figsize=(20, 20))
    visualgrid = [
        ConfusionMatrix(model, ax=axes[0][0], classes=['Death', 'Survival'], cmap="YlGnBu"),
        ClassificationReport(model, ax=axes[0][1], classes=['Death', 'Survival'], cmap="YlGn"),
        PrecisionRecallCurve(model, ax=axes[1][0]),
        ClassPredictionError(model, classes=['Death', 'Survival'], ax=axes[1][1]),
    ]
    for viz in visualgrid:
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()

    try:
        roc_auc(model, X_train, y_train, X_test=X_test, y_test=y_test,
                classes=['Death', 'Survival'], ax=axes[2][0])
    except Exception:
        print("Can't plot ROC curve for this model")

    try:
        viz = FeatureImportances(model, ax=axes[2][1], stack=True, relative=False)
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()
    except Exception:
        print("Don't have feature importance")

    plt.show()
    print('\n')
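# A minimal usage sketch for score_model_outcome on synthetic binary data;
# the dataset and the RandomForestClassifier are illustrative assumptions.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=400, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=0)
score_model_outcome(X_tr, y_tr, X_te, y_te, RandomForestClassifier(random_state=0))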
def plot_pr(model, X_train, y_train, X_valid, y_valid):
    visualizer = PrecisionRecallCurve(model)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_valid, y_valid)
    visualizer.poof()
visualizer.score(X_fclass_test, y_test)  # Evaluate the model on the test data
visualizer.poof(outpath="bag_classification_report_f_classIF.png")

visualizer = ClassPredictionError(model, classes=classes)
visualizer.fit(X_fclass_train, y_train)
visualizer.score(X_fclass_test, y_test)
visualizer.poof(outpath="bag_class_errorf_classIF.png")

visualizer = DiscriminationThreshold(model)
visualizer.fit(X_fclass_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_fclass_test, y_test)
visualizer.poof(outpath="bag_descrimination_thresholdf_classIF.png")

# Create the visualizer, fit, score, and poof it
viz = PrecisionRecallCurve(model)
viz.fit(X_fclass_train, y_train)
viz.score(X_fclass_test, y_test)
viz.poof(outpath="bag_precision_recall_curvef_classIF.png")

# KNeighborsClassifier with f_classif features
model = KNeighborsClassifier()
model.fit(X_fclass_train, y_train)

visualizer = ClassificationReport(model, classes=classes)
visualizer.fit(X_fclass_train, y_train)  # Fit the visualizer and the model
visualizer.score(X_fclass_test, y_test)  # Evaluate the model on the test data
visualizer.poof(outpath="kneear_classification_report_fclassIF.png")

visualizer = ClassPredictionError(model, classes=classes)
visualizer.fit(X_fclass_train, y_train)
def evaluate_visualizer(self, classes=None, params={}):
    LOGGER.info('Initializing plot model')
    outdir = os.path.join(os.getcwd(), 'visualizer/')
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    if classes is None:
        classes = pd.value_counts(self.y.values.flatten()).index.tolist()

    for name_model, estimator in self.estimator.items():
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            self.X, self.y, test_size=0.2, stratify=self.y, random_state=24)

        # Each visualizer is built with its default arguments unless an
        # override is supplied in `params`, keyed by the visualizer class name.
        specs = [
            (ClassificationReport, dict(model=estimator, classes=classes)),
            (ConfusionMatrix, dict(model=estimator, classes=classes)),
            (ROCAUC, dict(model=estimator, classes=classes)),
            (PrecisionRecallCurve, dict(model=estimator, per_class=True, classes=classes)),
            (ClassPredictionError, dict(model=estimator, classes=classes)),
            (DiscriminationThreshold, dict(model=estimator, classes=classes)),
        ]
        for viz_cls, kwargs in specs:
            try:
                LOGGER.info('Visualizer %s' % viz_cls.__name__)
                if viz_cls.__name__ in params.keys():
                    visualizer = viz_cls(**params[viz_cls.__name__])
                else:
                    visualizer = viz_cls(**kwargs)
                visualizer.fit(X_train, y_train)
                visualizer.score(X_test, y_test)
                visualizer.show(outpath=os.path.join(
                    outdir,
                    f'{viz_cls.__name__}_{estimator.__class__.__name__}.png'))
                plt.cla()
            except Exception:
                LOGGER.warning('ERROR %s' % viz_cls.__name__)
df_abalone.columns = ['sex', 'length', 'diameter', 'height', 'whole weight',
                      'shucked weight', 'viscera weight', 'shell weight', 'rings']
# print(df_abalone.head())

sns.countplot(data=df_abalone, x='sex', hue='rings', palette='gist_heat')
# plt.show()
# print(df_abalone.describe())
# df_abalone.info()

le = preprocessing.LabelEncoder()
df_abalone['sex'] = le.fit_transform(df_abalone['sex'])
# print(df_abalone.head())

cols = [col for col in df_abalone.columns if col != "rings"]  # string comparison, not identity
# print(cols)
data = df_abalone[cols]
target = df_abalone['rings']

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.20, random_state=10)
data_train.info()

logReg = LogisticRegression()
pred = logReg.fit(data_train, target_train).predict(data_test)
# print(pred)
print("Logistic Regression accuracy: ", accuracy_score(target_test, pred, normalize=True))

visualizer = PrecisionRecallCurve(logReg)
visualizer.fit(data_train, target_train)
visualizer.score(data_test, target_test)
visualizer.show()
col_names = ['pos_' + str(x) for x in range(1, 43)]
col_names.append('outcome')
df = pd.read_csv(data_loc, names=col_names)

di = {'x': 1, 'o': -1, 'b': 0, 'win': 1, 'loss': -1, 'draw': 0}
df.replace(di, inplace=True)

df_x = df.drop(col_names[-1], axis=1)
df_y = df[col_names[-1]]
df_y = LabelEncoder().fit_transform(df_y)
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size=0.3, random_state=0)

adab_tuned = AdaBoostClassifier(
    random_state=1, learning_rate=0.1, n_estimators=500,
    base_estimator=tree.DecisionTreeClassifier(max_depth=10))
adab_tuned = PrecisionRecallCurve(adab_tuned, per_class=True, iso_f1_curves=True,
                                  fill_area=False, micro=False)

adab_tuned.fit(train_x, train_y)
# Draw precision-recall curve
adab_tuned.score(test_x, test_y)
adab_tuned.poof()
def precision_recall(ax=None):
    data = load_spam(return_dataset=True)
    X, y = data.to_pandas()
    viz = PrecisionRecallCurve(LogisticRegression(), ax=ax)
    return tts_plot(viz, X, y)