def plot_optimal_threshold(model, x_train, y_train):
    """Plot the discrimination threshold for the given binary classifier."""
    # Visualization Threshold
    visualizer = DiscriminationThreshold(model)
    visualizer.fit(x_train, y_train)  # Fit the data to the visualizer
    visualizer.show()
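# The function above assumes DiscriminationThreshold is already imported and that a model
# and training data exist. A minimal self-contained sketch of the same pattern (the
# synthetic dataset and LogisticRegression below are stand-ins, not part of the original):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import DiscriminationThreshold

X, y = make_classification(n_samples=1000, n_classes=2, random_state=42)
model = LogisticRegression()

visualizer = DiscriminationThreshold(model)
visualizer.fit(X, y)   # fits the model over several internal train/test shuffles
visualizer.show()      # draws precision, recall, f1 and queue rate against the threshold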
def discrimination_threshold(self) -> None:
    visualizer = DiscriminationThreshold(self.trained_model)
    visualizer.fit(self.X_test, self.y_test)  # Fit the data to the visualizer
    save_dir = f"{self.plots_dir}/discrimination_plot_{self.model_id}.png"
    visualizer.show(outpath=save_dir)
    if not LOCAL:
        upload_to_s3(save_dir,
                     f'plots/discrimination_plot_{self.model_id}.png',
                     bucket=S3_BUCKET_NAME)
    plt.clf()
def discrimination_threshold(model, classes, X_train, Y_train, X_test, Y_test):
    from yellowbrick.classifier import DiscriminationThreshold

    # Instantiate the classification model and visualizer
    viz = DiscriminationThreshold(model)
    viz.fit(X_train, Y_train)   # Fit the training data to the visualizer
    viz.score(X_test, Y_test)   # Evaluate the model on the test data
    viz.poof()                  # Draw/show/poof the data
def plot_discrimination_threshold(clf, data='spam', outpath=None):
    if data == 'spam':
        X, y = load_spam()
    elif data == 'churn':
        X, y = load_churn()
    else:
        raise ValueError("no dataset loader '{}'".format(data))

    _, ax = plt.subplots()
    visualizer = DiscriminationThreshold(clf, ax=ax)
    visualizer.fit(X, y)
    visualizer.poof(outpath=outpath)
def selectDiscr():
    data_path = "labeled_data.csv"
    data = pd.read_csv(data_path)

    # We create the preprocessing pipelines for both numeric and categorical data.
    numeric_features = ['count_reviews', 'rating']
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median')),
               ('scaler', StandardScaler())])

    categorical_features = ['product_category']
    categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
               ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                      ('cat', categorical_transformer, categorical_features)])

    viz = DiscriminationThreshold(LogisticRegression())
    clf = VisualPipeline(steps=[
        ('preprocessor', preprocessor),
        # ('classifier', LogisticRegression(solver='lbfgs')),
        ('viz', viz)
    ])

    X = data.drop('label', axis=1)
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = clf.fit(X_train, y_train)
    model.poof()
def classification(fname="classification.png"):
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))

    # Add ClassificationReport to the left
    data = load_spam(split=True)
    oz = ClassificationReport(MultinomialNB(), classes=["ham", "spam"], ax=axes[0])
    oz.fit(data.X.train, data.y.train)
    oz.score(data.X.test, data.y.test)
    oz.finalize()

    # Add DiscriminationThreshold to the right
    data = load_spam(split=False)
    oz = DiscriminationThreshold(LogisticRegression(), ax=axes[1])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
def telecom_churn_prediction(algorithm, name, X_train, X_test, y_train, y_test,
                             cols, cf=None, plot=False, threshold=False):
    # model
    start = time()  # Get start time
    algorithm.fit(X_train, y_train)
    end = time()  # Get end time
    # Calculate the training time
    train_time = round(end - start, 4)

    # predict
    start = time()  # Get start time
    predictions_test = algorithm.predict(X_test)
    end = time()  # Get end time
    # Calculate the prediction time
    pred_time = round(end - start, 4)

    predictions_train = algorithm.predict(X_train)
    probabilities = algorithm.predict_proba(X_test)

    # coeffs
    if cf is not None:
        if cf == "coefficients":
            coefficients = pd.DataFrame(algorithm.coef_.ravel())
        elif cf == "features":
            coefficients = pd.DataFrame(algorithm.feature_importances_)
        column_df = pd.DataFrame(cols)
        coef_sumry = (pd.merge(coefficients, column_df, left_index=True,
                               right_index=True, how="left"))
        coef_sumry.columns = ["coefficients", "features"]
        coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(y_test, predictions_test))

    # confusion matrix
    conf_matrix = confusion_matrix(y_test, predictions_test)
    # roc_auc_score
    model_roc_auc = roc_auc_score(y_test, predictions_test)

    print('train')
    print("Accuracy Score : ", accuracy_score(y_train, predictions_train))
    print("Area under curve : ", roc_auc_score(y_train, predictions_train), "\n")
    print('test')
    print("Accuracy Score :", accuracy_score(y_test, predictions_test))
    print("Area under curve : ", model_roc_auc, "\n")

    fpr, tpr, thresholds = roc_curve(y_test, probabilities[:, 1])

    accuracy = accuracy_score(y_test, predictions_test)
    recallscore = recall_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    roc_auc_train = roc_auc_score(y_train, predictions_train)
    roc_auc_test = roc_auc_score(y_test, predictions_test)
    f1score = f1_score(y_test, predictions_test)

    result = pd.DataFrame({
        "Model": [name],
        "Accuracy_score": [accuracy],
        "Recall_score": [recallscore],
        "Precision": [precision],
        "f1_score": [f1score],
        "Area_under_curve(train)": [roc_auc_train],
        "Area_under_curve(test)": [roc_auc_test],
        "train_time": [train_time],
        'pred_time': [pred_time]
    })

    if cf is not None:
        plt.figure(figsize=(12, 8))

        # plot confusion matrix
        plt.subplot(221)
        plt.grid(b=None)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])

        # plot roc curve
        plt.subplot(222)
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")

        # plot coeffs
        sns.set(font_scale=1)
        plt.subplot(212)
        plt.title('Feature Importances')
        plt.xticks(rotation='90')
        sns.barplot(coef_sumry['features'], coef_sumry['coefficients'])

        plt.subplots_adjust(top=1.2, bottom=0.2, left=0.10, right=0.95,
                            hspace=0.25, wspace=0.35)

        if threshold:
            # plot threshold
            plt.figure(figsize=(14, 4))
            visualizer = DiscriminationThreshold(algorithm)
            visualizer.fit(X_train, y_train)
            visualizer.poof()

    elif cf is None:
        plt.figure(figsize=(12, 4))

        # plot confusion matrix
        plt.subplot(121)
        plt.grid(b=None)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])

        # plot roc curve
        plt.subplot(122)
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")

        plt.subplots_adjust(top=1.2, bottom=0.2, left=0.10, right=0.95,
                            hspace=0.25, wspace=0.35)

    return result
rf_probas = rf.predict_proba(X_test)[:, 1]
plot_precision_recall(y_test, rf_probas)

from yellowbrick.classifier import PrecisionRecallCurve
viz = PrecisionRecallCurve(rf)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.poof()

# Discrimination Threshold - the probability or score at which the positive class
# is chosen over the negative class
from yellowbrick.classifier import DiscriminationThreshold
viz = DiscriminationThreshold(rf)
viz.fit(X_train, y_train)
viz.poof()

# Average Precision
from sklearn.metrics import average_precision_score
average_precision_score(
    y_test, rf.predict_proba(X_test)[:, 1])  # slice to give probs of class 1

# AUC and ROC curve
from sklearn.metrics import roc_auc_score
# %%
classification_report(clf, X, y)

# %%
visualizer = ROCAUC(clf, classes=class_names)
visualizer.score(X, y)
visualizer.poof()

# %%
visualizer = ClassPredictionError(clf, classes=class_names)
visualizer.score(X, y)
visualizer.poof()

# %%
visualizer = DiscriminationThreshold(clf)
visualizer.fit(X, y)
visualizer.poof()

# %%
keep = [263, 268, 287, 288, 300, 302, 307, 308, 313, 315]

# %%
seed = 15
test_size = 0.33
Xt, Xv, yt, yv = \
    sklearn.model_selection.train_test_split(
        X[keep], y, test_size=test_size, stratify=y, random_state=seed)

# %%
explainer = shap.TreeExplainer(clf)
def evaluate_visualizer(self, classes=None, params={}):
    LOGGER.info('Initializing plot model')

    if not os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')):
        os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))

    if classes is None:
        classes = pd.value_counts(self.y.values.flatten()).index.tolist()

    visualizers = []
    for idx, (name_model, estimator) in enumerate(self.estimator.items()):
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            self.X, self.y, test_size=0.2, stratify=self.y, random_state=24)

        try:
            LOGGER.info('Visualizer ClassificationReport')
            visualizer = ClassificationReport(model=estimator, classes=classes)
            if visualizer.__class__.__name__ in params.keys():
                visualizer = ClassificationReport(
                    **params[visualizer.__class__.__name__])
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.show(outpath=os.path.join(
                os.getcwd(),
                f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
            ))
            plt.cla()
        except:
            LOGGER.warn('ERROR ClassificationReport')

        try:
            LOGGER.info('Visualizer ConfusionMatrix')
            visualizer = ConfusionMatrix(model=estimator, classes=classes)
            if visualizer.__class__.__name__ in params.keys():
                visualizer = ConfusionMatrix(
                    **params[visualizer.__class__.__name__])
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.show(outpath=os.path.join(
                os.getcwd(),
                f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
            ))
            plt.cla()
        except:
            LOGGER.warn('ERROR ConfusionMatrix')

        try:
            LOGGER.info('Visualizer ROCAUC')
            visualizer = ROCAUC(model=estimator, classes=classes)
            if visualizer.__class__.__name__ in params.keys():
                visualizer = ROCAUC(
                    **params[visualizer.__class__.__name__])
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.show(outpath=os.path.join(
                os.getcwd(),
                f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
            ))
            plt.cla()
        except:
            LOGGER.warn('ERROR ROCAUC')

        try:
            LOGGER.info('Visualizer PrecisionRecallCurve')
            visualizer = PrecisionRecallCurve(model=estimator, per_class=True,
                                              classes=classes)
            if visualizer.__class__.__name__ in params.keys():
                visualizer = PrecisionRecallCurve(
                    **params[visualizer.__class__.__name__])
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.show(outpath=os.path.join(
                os.getcwd(),
                f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
            ))
            plt.cla()
        except:
            LOGGER.warn('ERROR PrecisionRecallCurve')

        try:
            LOGGER.info('Visualizer ClassPredictionError')
            visualizer = ClassPredictionError(model=estimator, classes=classes)
            if visualizer.__class__.__name__ in params.keys():
                visualizer = ClassPredictionError(
                    **params[visualizer.__class__.__name__])
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.show(outpath=os.path.join(
                os.getcwd(),
                f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
            ))
            plt.cla()
        except:
            LOGGER.warn('ERROR ClassPredictionError')

        try:
            LOGGER.info('Visualizer Discrimination Threshold')
            visualizer = DiscriminationThreshold(model=estimator, classes=classes)
            if visualizer.__class__.__name__ in params.keys():
                visualizer = DiscriminationThreshold(
                    **params[visualizer.__class__.__name__])
            visualizer.fit(X_train, y_train)
            visualizer.score(X_test, y_test)
            visualizer.show(outpath=os.path.join(
                os.getcwd(),
                f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
            ))
            plt.cla()
        except:
            LOGGER.warn('ERROR Discrimination Threshold')
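# Hypothetical usage of the params override in the method above: when a visualizer's class
# name appears as a key in params, that visualizer is re-created from the supplied kwargs
# alone, so the estimator (and classes, if needed) must be repeated inside them. The names
# evaluator, clf and class_names below are illustrative, not from the original code:
evaluator.evaluate_visualizer(
    classes=class_names,
    params={
        "ROCAUC": {"model": clf, "classes": class_names, "micro": False},
        "ConfusionMatrix": {"model": clf, "classes": class_names, "percent": True},
    },
)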
def supervised_prediction(algorithm, training_x, testing_x, training_y, testing_y,
                          cols, cf, threshold_plot):
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    # coeffs
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)

    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients, column_df, left_index=True,
                           right_index=True, how="left"))
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score : ", accuracy_score(testing_y, predictions))

    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False, colorscale="Picnic", name="matrix")

    # plot roc curve
    trace2 = go.Scatter(x=fpr, y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))

    # plot coeffs
    trace4 = go.Bar(x=coef_sumry["features"], y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    # subplots
    fig = tls.make_subplots(
        rows=2, cols=2,
        specs=[[{}, {}], [{'colspan': 2}, None]],
        subplot_titles=('Confusion Matrix',
                        'Receiver operating characteristic',
                        'Feature Importances'))
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(showlegend=False, title="Model performance",
                         autosize=False, height=900, width=800,
                         plot_bgcolor='rgba(240,240,240, 0.95)',
                         paper_bgcolor='rgba(240,240,240, 0.95)',
                         margin=dict(b=195))
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(
        dict(showgrid=True, tickfont=dict(size=10), tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
ax3 = fig3.add_subplot(111)
ax3.set_title('Score distribution')
score = np.linspace(0, 1, 1000)
ax3.plot(score, num_tp1, "-", label="Signal")
ax3.plot(score, num_tn1, "-", label="Background")
ax3.set_ylabel("Count")
ax3.set_xlabel("Score cut")
ax3.legend()
fig3.savefig("plots/forest/Scoredistribution.pdf")

# precision recall threshold curve
# https://www.kaggle.com/kevinarvai/fine-tuning-a-classifier-in-scikit-learn,
# http://www.scikit-yb.org/en/latest/api/classifier/threshold.html
from yellowbrick.classifier import DiscriminationThreshold

fig5 = plt.figure(5)
ax5 = fig5.add_subplot(111)
visualizer = DiscriminationThreshold(forest, exclude=("queue_rate", "fscore"), ax=ax5)
visualizer.fit(data_train_X, data_train_y)  # Fit the training data to the visualizer
visualizer.poof(outpath="plots/forest/precrecathresh.pdf")  # Draw/show/poof the data

print(time.clock())
print(confusion_matrix(expected, (predicted_probs[:, 1] > 0.3).astype(bool)))

from sklearn.metrics import classification_report
print(classification_report(expected, (predicted_probs[:, 1] > 0.3).astype(bool)))

# http://www.scikit-yb.org/en/latest/api/features/importances.html
from yellowbrick.features.importances import FeatureImportances
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
# Class prediction error
visualizer = ClassPredictionError(model)
visualizer.score(X_test, y_test)
visualizer.show()

# Classification report
visualizer = ClassificationReport(model)
visualizer.score(X_test, y_test)
visualizer.show()

# Confusion matrix
visualizer = ConfusionMatrix(model)
visualizer.score(X_test, y_test)
visualizer.show()

# Threshold selection
visualizer = DiscriminationThreshold(model)
visualizer.fit(X_train, y_train)
visualizer.show()

# Learning curve
visualizer = LearningCurve(model, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Cross-validation scores
visualizer = CVScores(model, cv=5, scoring='f1_weighted')
visualizer.fit(X_train, y_train)
visualizer.show()

# Feature importances
visualizer = FeatureImportances(model)
def discrimination_thresholding(xx, yy, estimatorss, **kwargs):
    vz = DiscriminationThreshold(
        estimatorss,
        classes=['Reach, 1 Reach, or L/R Reach',
                 'Null, Multiple Reaches, Or Multiple Arms'],
        cmap="YlGn",
        size=(600, 360),
        **kwargs)
    vz.fit(xx, yy)
    vz.score(xx, yy)
    vz.show()
def train_and_evaluate_classifier(
    algorithm,
    training_x,
    testing_x,
    training_y,
    testing_y,
    cols,
    cf='coefficients',
    threshold_plot=True,
):
    """
    Train a classifier on the training data, evaluate its predictions on the
    test data, and visualize several prediction-quality metrics.

    algorithm - the algorithm to use, with fit, predict and predict_proba methods
    training_x - predictor variables (training)
    testing_x - predictor variables (test)
    training_y - target variable (training)
    testing_y - target variable (test)
    cf - one of ["coefficients", "features"] (coefficients for logistic regression,
         feature importances for trees)
    threshold_plot - if True, also draw the model's threshold plot
    """
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    # coefficients
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)
    else:
        raise ValueError(
            '`cf` value must be one of {`coefficients`, `features`}')

    column_df = pd.DataFrame(cols)
    coef_sumry = (pd.merge(coefficients, column_df, left_index=True,
                           right_index=True, how="left"))
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy : ", accuracy_score(testing_y, predictions))

    # confusion_matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # prepare the confusion matrix for plotting
    trace1 = go.Heatmap(
        z=conf_matrix,
        x=["Users", "Churn"],
        y=["Users", "Churn"],
        showscale=False,
        colorscale="Picnic",
        name="matrix",
    )

    # prepare the roc curve for plotting
    trace2 = go.Scatter(x=fpr, y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))

    # prepare the coefficients for plotting
    trace4 = go.Bar(x=coef_sumry["features"], y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    # draw
    fig = tls.make_subplots(rows=2, cols=2,
                            specs=[[{}, {}], [{'colspan': 2}, None]])
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(
        showlegend=False,
        autosize=False,
        height=900,
        width=800,
        plot_bgcolor='rgba(240,240,240, 0.95)',
        paper_bgcolor='rgba(240,240,240, 0.95)',
        margin=dict(b=195),
    )
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(
        dict(showgrid=True, tickfont=dict(size=10), tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
fig["layout"]["yaxis2"].update(dict(title = "true positive rate")) fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),tickangle = 90)) py.iplot(fig) # Usually, we assign an object to a class if the probabily of belonging to this class is above 0.5 # However, this threshold can be adjusted and this function allows to find the optimal value given some metrics (recall, precison, f1, queue rate) # Find optimal threshold optimal_idx = np.argmax(tpr - fpr) optimal_threshold = thresholds[optimal_idx] # The optimal threshold is 0.30. # Better method to find optimal threshold visualizer = DiscriminationThreshold(classifier) visualizer.fit(X_train,y_train) visualizer.poof() ########################################## IMPROVEMENTS # Implement SMOTE """ # SMOTE is not appropriate as it does not deal with dummy variables. sm = SMOTE() X_smote, y_smote = sm.fit_resample(X_train, y_train)
def discrimination():
    X, y = load_spam()
    oz = DiscriminationThreshold(LogisticRegression(solver="lbfgs"), ax=newfig())
    oz.fit(X, y)
    savefig(oz, "discrimination_threshold")
def telecom_churn_prediction_alg(algorithm, training_x, testing_x,
                                 training_y, testing_y, threshold_plot=True):
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score : ", accuracy_score(testing_y, predictions))

    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)
    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc)
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # plot roc curve
    trace1 = go.Scatter(
        x=fpr, y=tpr,
        name="Roc : " + str(model_roc_auc),
        line=dict(color=('rgb(22, 96, 167)'), width=2),
    )
    trace2 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))

    # plot confusion matrix
    trace3 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False, colorscale="Blues",
                        name="matrix", xaxis="x2", yaxis="y2")

    layout = go.Layout(
        dict(title="Model performance",
             autosize=False, height=500, width=800,
             showlegend=False,
             plot_bgcolor="rgb(243,243,243)",
             paper_bgcolor="rgb(243,243,243)",
             xaxis=dict(title="false positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        domain=[0, 0.6], ticklen=5, gridwidth=2),
             yaxis=dict(title="true positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        zerolinewidth=1, ticklen=5, gridwidth=2),
             margin=dict(b=200),
             xaxis2=dict(domain=[0.7, 1], tickangle=90,
                         gridcolor='rgb(255, 255, 255)'),
             yaxis2=dict(anchor='x2', gridcolor='rgb(255, 255, 255)')))

    data = [trace1, trace2, trace3]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
# Compute the accuracy: accuracy
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
print("accuracy: %f" % (accuracy))

#%%
# Measuring performance
train_score2 = xgbc.score(X_train, y_train)
test_score2 = xgbc.score(X_test, y_test)
print(train_score2)
print(test_score2)

#%%
# Measuring performance
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))
print(accuracy_score(y_test, preds))

#%%
# ROC
# Instantiate the visualizer with the classification model
visualizer = ROCAUC(xgbc, classes=["will not default", "will default"])
visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
visualizer.score(X_test, y_test)   # Evaluate the model on the test data
visualizer.show()

#%%
visualizer = DiscriminationThreshold(xgbc)
visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.show()
### ROC-AUC
from yellowbrick.classifier import ROCAUC

visualizer = ROCAUC(LogisticRegression(), classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()

### Class Prediction Error
from yellowbrick.classifier import ClassPredictionError

visualizer = ClassPredictionError(LogisticRegression(), classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof()

### Discrimination Threshold
# Only works for binary classification
from yellowbrick.classifier import DiscriminationThreshold

visualizer = DiscriminationThreshold(LogisticRegression())
visualizer.fit(X, y)
visualizer.poof()
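# Because DiscriminationThreshold only supports binary targets, a multiclass problem has to
# be reduced to one-vs-rest first. A sketch of that idea follows; treating classes[0] as the
# positive class is an assumption for illustration, not part of the snippet above:
import numpy as np

positive_label = classes[0]                       # one class as "positive", the rest as "negative"
y_binary = np.asarray(y == positive_label, dtype=int)

visualizer = DiscriminationThreshold(LogisticRegression())
visualizer.fit(X, y_binary)
visualizer.poof()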
LR = LogisticRegression()
LR.fit(X_train, y_train)
evaluate_model(LR, X_test, y_test, True)  # evaluate model

# let's look at the discrimination threshold
pred = LR.predict(X_test)
proba = LR.predict_proba(X_test)

# the default discrimination threshold is 0.5; let's find the best threshold
proba = pd.DataFrame(proba, columns=["0", "1"])
proba["Selected Class"] = pred

# try to find the best threshold to maximize the f1 score
vis = DiscriminationThreshold(LR)
vis.fit(X_train, y_train)
vis.poof()  # the algorithm tries to maximize the f1 score
# threshold = 0.29

# KNN
from sklearn.neighbors import KNeighborsClassifier

k_scores = {}
for k in range(1, 30, 2):
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(X_train, y_train)
    k_scores[k] = [KNN.score(X_test, y_test),
                   roc_auc_score(y_test, KNN.predict(X_test))]
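# Once the visualizer suggests a threshold (about 0.29 above), predictions can be re-labelled
# by cutting predict_proba at that value instead of the default 0.5. A minimal sketch reusing
# the fitted LR; the exact threshold value is simply the one read off the plot:
from sklearn.metrics import f1_score

best_threshold = 0.29
pred_tuned = (LR.predict_proba(X_test)[:, 1] >= best_threshold).astype(int)

print("f1 @ 0.50:", f1_score(y_test, LR.predict(X_test)))
print("f1 @ %.2f:" % best_threshold, f1_score(y_test, pred_tuned))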
                           refit=True,
                           random_state=random_state,
                           verbose=True)
grid_search.fit(X, y, **fit_params)
opt_parameters = grid_search.best_params_
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)

# In[89]:
model_performance(lgbm_clf, 'LightGBM')
scores_table(lgbm_clf, 'LightGBM')

# In[90]:
visualizer = DiscriminationThreshold(lgbm_clf)
visualizer.fit(X, y)
visualizer.poof()

# In[91]:
knn_clf = KNeighborsClassifier()
voting_clf = VotingClassifier(estimators=[('lgbm_clf', lgbm_clf),
                                          ('knn', KNeighborsClassifier())],
                              voting='soft', weights=[1, 1])
params = {'knn__n_neighbors': np.arange(1, 30)}
def discrimination_threshold(ax=None):
    data = load_spam(return_dataset=True)
    X, y = data.to_pandas()
    viz = DiscriminationThreshold(RandomForestClassifier(n_estimators=10), ax=ax)
    return tts_plot(viz, X, y, score=False)
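# tts_plot is a local helper whose definition is not shown here. A plausible sketch of what
# such a helper does (split, fit, optionally score, then finalize the visualizer) is below;
# this is an assumption about its behaviour, not the actual implementation:
from sklearn.model_selection import train_test_split

def tts_plot(viz, X, y, score=True, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    viz.fit(X_train, y_train)           # fit the wrapped estimator and the visualizer
    if score:
        viz.score(X_test, y_test)       # evaluate on the held-out split when requested
    viz.finalize()                      # draw titles, legends, etc. without showing
    return viz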