def discrimination_threshold(model, classes, X_train, Y_train, X_test, Y_test):
    from yellowbrick.classifier import DiscriminationThreshold

    # Instantiate the classification model and visualizer
    viz = DiscriminationThreshold(model)

    # Fit the training data to the visualizer, then draw/show/poof the plot
    viz.fit(X_train, Y_train)
    # viz.score(X_test, Y_test)
    viz.poof()
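# A minimal, self-contained usage sketch for the helper above (assumption: any binary
# classifier with predict_proba works; the synthetic data below is illustrative only
# and is not part of the original snippet).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=1000, n_classes=2, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=42)
discrimination_threshold(RandomForestClassifier(), ["negative", "positive"],
                         X_tr, y_tr, X_te, y_te)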
def plot_discrimination_threshold(clf, data='spam', outpath=None):
    if data == 'spam':
        X, y = load_spam()
    elif data == 'churn':
        X, y = load_churn()
    else:
        raise ValueError("no dataset loader '{}'".format(data))

    _, ax = plt.subplots()
    visualizer = DiscriminationThreshold(clf, ax=ax)
    visualizer.fit(X, y)
    visualizer.poof(outpath=outpath)
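# Example call for the function above; this assumes load_spam / load_churn and plt are
# already imported elsewhere in the script (e.g. from yellowbrick.datasets and
# matplotlib.pyplot), and the output filename is only a placeholder.
from sklearn.linear_model import LogisticRegression

plot_discrimination_threshold(LogisticRegression(), data='spam',
                              outpath='spam_threshold.png')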
grid_search.fit(X, y, **fit_params)
opt_parameters = grid_search.best_params_
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)

# In[89]:

model_performance(lgbm_clf, 'LightGBM')
scores_table(lgbm_clf, 'LightGBM')

# In[90]:

visualizer = DiscriminationThreshold(lgbm_clf)
visualizer.fit(X, y)
visualizer.poof()

# In[91]:

knn_clf = KNeighborsClassifier()
voting_clf = VotingClassifier(estimators=[('lgbm_clf', lgbm_clf),
                                          ('knn', KNeighborsClassifier())],
                              voting='soft', weights=[1, 1])
params = {'knn__n_neighbors': np.arange(1, 30)}
grid = GridSearchCV(estimator=voting_clf, param_grid=params, cv=5)
grid.fit(X, y)
def telecom_churn_prediction(algorithm, name, X_train, X_test, y_train, y_test,
                             cols, cf=None, plot=False, threshold=False):
    # model
    start = time()                      # get start time
    algorithm.fit(X_train, y_train)
    end = time()                        # get end time
    # calculate the training time
    train_time = round(end - start, 4)

    # predict
    start = time()                      # get start time
    predictions_test = algorithm.predict(X_test)
    end = time()                        # get end time
    # calculate the prediction time
    pred_time = round(end - start, 4)

    predictions_train = algorithm.predict(X_train)
    probabilities = algorithm.predict_proba(X_test)

    # coefficients / feature importances
    if cf is not None:
        if cf == "coefficients":
            coefficients = pd.DataFrame(algorithm.coef_.ravel())
        elif cf == "features":
            coefficients = pd.DataFrame(algorithm.feature_importances_)

        column_df = pd.DataFrame(cols)
        coef_sumry = pd.merge(coefficients, column_df, left_index=True,
                              right_index=True, how="left")
        coef_sumry.columns = ["coefficients", "features"]
        coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(y_test, predictions_test))

    # confusion matrix
    conf_matrix = confusion_matrix(y_test, predictions_test)

    # roc_auc_score
    model_roc_auc = roc_auc_score(y_test, predictions_test)
    print('train')
    print("Accuracy Score : ", accuracy_score(y_train, predictions_train))
    print("Area under curve : ", roc_auc_score(y_train, predictions_train), "\n")
    print('test')
    print("Accuracy Score :", accuracy_score(y_test, predictions_test))
    print("Area under curve : ", model_roc_auc, "\n")

    fpr, tpr, thresholds = roc_curve(y_test, probabilities[:, 1])

    accuracy = accuracy_score(y_test, predictions_test)
    recallscore = recall_score(y_test, predictions_test)
    precision = precision_score(y_test, predictions_test)
    roc_auc_train = roc_auc_score(y_train, predictions_train)
    roc_auc_test = roc_auc_score(y_test, predictions_test)
    f1score = f1_score(y_test, predictions_test)

    result = pd.DataFrame({
        "Model": [name],
        "Accuracy_score": [accuracy],
        "Recall_score": [recallscore],
        "Precision": [precision],
        "f1_score": [f1score],
        "Area_under_curve(train)": [roc_auc_train],
        "Area_under_curve(test)": [roc_auc_test],
        "train_time": [train_time],
        "pred_time": [pred_time],
    })

    if cf is not None:
        plt.figure(figsize=(12, 8))

        # plot confusion matrix
        plt.subplot(221)
        plt.grid(b=None)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])

        # plot roc curve
        plt.subplot(222)
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")

        # plot coefficients / feature importances
        sns.set(font_scale=1)
        plt.subplot(212)
        plt.title('Feature Importances')
        plt.xticks(rotation=90)
        sns.barplot(coef_sumry['features'], coef_sumry['coefficients'])

        plt.subplots_adjust(top=1.2, bottom=0.2, left=0.10, right=0.95,
                            hspace=0.25, wspace=0.35)

        if threshold:
            # plot discrimination threshold
            plt.figure(figsize=(14, 4))
            visualizer = DiscriminationThreshold(algorithm)
            visualizer.fit(X_train, y_train)
            visualizer.poof()

    elif cf is None:
        plt.figure(figsize=(12, 4))

        # plot confusion matrix
        plt.subplot(121)
        plt.grid(b=None)  # no grid
        plot_confusion_matrix(conf_matrix, ["Not churn", "Churn"])

        # plot roc curve
        plt.subplot(122)
        plt.plot(fpr, tpr, label="ROC Curve")
        plt.title('Receiver operating characteristic')
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate (recall)")

        plt.subplots_adjust(top=1.2, bottom=0.2, left=0.10, right=0.95,
                            hspace=0.25, wspace=0.35)

    return result
score = np.linspace(0, 1, 1000)
ax3.plot(score, num_tp1, "-", label="Signal")
ax3.plot(score, num_tn1, "-", label="Background")
ax3.set_ylabel("Count")
ax3.set_xlabel("Scorecut")
ax3.legend()
fig3.savefig("plots/forest/Scoredistribution.pdf")

# precision recall threshold curve
# https://www.kaggle.com/kevinarvai/fine-tuning-a-classifier-in-scikit-learn,
# http://www.scikit-yb.org/en/latest/api/classifier/threshold.html
from yellowbrick.classifier import DiscriminationThreshold

fig5 = plt.figure(5)
ax5 = fig5.add_subplot(111)
visualizer = DiscriminationThreshold(forest, exclude=("queue_rate", "fscore"), ax=ax5)
visualizer.fit(data_train_X, data_train_y)                  # Fit the training data to the visualizer
visualizer.poof(outpath="plots/forest/precrecathresh.pdf")  # Draw/show/poof the data

print(time.process_time())  # time.clock() was removed in Python 3.8

print(confusion_matrix(expected, (predicted_probs[:, 1] > 0.3).astype(bool)))

from sklearn.metrics import classification_report
print(classification_report(expected, (predicted_probs[:, 1] > 0.3).astype(bool)))

# http://www.scikit-yb.org/en/latest/api/features/importances.html
from yellowbrick.features.importances import FeatureImportances

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the Top 20 feature ranking
def supervised_prediction(algorithm, training_x, testing_x, training_y, testing_y,
                          cols, cf, threshold_plot):
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    # coefficients / feature importances
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)

    column_df = pd.DataFrame(cols)
    coef_sumry = pd.merge(coefficients, column_df, left_index=True,
                          right_index=True, how="left")
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score : ", accuracy_score(testing_y, predictions))

    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)

    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # plot confusion matrix
    trace1 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False, colorscale="Picnic", name="matrix")

    # plot roc curve
    trace2 = go.Scatter(x=fpr, y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))

    # plot coefficients
    trace4 = go.Bar(x=coef_sumry["features"], y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    # subplots
    fig = tls.make_subplots(rows=2, cols=2,
                            specs=[[{}, {}], [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix',
                                            'Receiver operating characteristic',
                                            'Feature Importances'))
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(showlegend=False, title="Model performance",
                         autosize=False, height=900, width=800,
                         plot_bgcolor='rgba(240,240,240, 0.95)',
                         paper_bgcolor='rgba(240,240,240, 0.95)',
                         margin=dict(b=195))
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(dict(showgrid=True, tickfont=dict(size=10),
                                        tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
# precision recall threshold curve
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(expected, predicted)

fig4 = plt.figure(4)
ax4 = fig4.add_subplot(111)
ax4.set_title("Precision and Recall Scores as a function of the decision threshold")
ax4.plot(thresholds, precisions[:-1], label="Precision")
ax4.plot(thresholds, recalls[:-1], label="Recall")
ax4.set_ylabel("Score")
ax4.set_xlabel("Decision Threshold")
ax4.legend()
fig4.savefig("plots/Precision_Recall_Threshold.pdf")

# precision recall threshold curve with yellowbrick
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import DiscriminationThreshold

# Instantiate the classification model and visualizer
fig5 = plt.figure(5)
ax5 = fig5.add_subplot(111)
visualizer = DiscriminationThreshold(nbayes, exclude=("queue_rate", "fscore"), ax=ax5)
visualizer.fit(data_train_X, data_train_y)             # Fit the training data to the visualizer
visualizer.poof(outpath="plots/prec_reca_thresh.pdf")  # Draw/show/poof the data
def telecom_churn_prediction_alg(algorithm, training_x, testing_x, training_y,
                                 testing_y, threshold_plot=True):
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score : ", accuracy_score(testing_y, predictions))

    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)

    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc)
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # plot roc curve
    trace1 = go.Scatter(x=fpr, y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace2 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))

    # plot confusion matrix
    trace3 = go.Heatmap(z=conf_matrix,
                        x=["Not churn", "Churn"],
                        y=["Not churn", "Churn"],
                        showscale=False, colorscale="Blues",
                        name="matrix", xaxis="x2", yaxis="y2")

    layout = go.Layout(
        dict(title="Model performance",
             autosize=False, height=500, width=800,
             showlegend=False,
             plot_bgcolor="rgb(243,243,243)",
             paper_bgcolor="rgb(243,243,243)",
             xaxis=dict(title="false positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        domain=[0, 0.6], ticklen=5, gridwidth=2),
             yaxis=dict(title="true positive rate",
                        gridcolor='rgb(255, 255, 255)',
                        zerolinewidth=1, ticklen=5, gridwidth=2),
             margin=dict(b=200),
             xaxis2=dict(domain=[0.7, 1], tickangle=90,
                         gridcolor='rgb(255, 255, 255)'),
             yaxis2=dict(anchor='x2', gridcolor='rgb(255, 255, 255)')))

    data = [trace1, trace2, trace3]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()
LR.fit(X_train, y_train)
evaluate_model(LR, X_test, y_test, True)  # evaluate model

# let's look at discrimination
pred = LR.predict(X_test)
proba = LR.predict_proba(X_test)

# the default discrimination threshold is 0.5; let's find the best threshold
proba = pd.DataFrame(proba, columns=["0", "1"])
proba["Selected Class"] = pred

# try to find the threshold that maximizes the f1 score
vis = DiscriminationThreshold(LR)
vis.fit(X_train, y_train)
vis.poof()  # the algorithm tries to maximize the f1 score
# threshold = 0.29

# KNN
from sklearn.neighbors import KNeighborsClassifier

k_scores = {}
for k in range(1, 30, 2):
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(X_train, y_train)
    k_scores[k] = [KNN.score(X_test, y_test),
                   roc_auc_score(y_test, KNN.predict(X_test))]
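# Sketch (assumption): apply the ~0.29 threshold noted above instead of the default
# 0.5 cut used by LR.predict, and compare f1 scores; variable names mirror the snippet
# above and the exact threshold value comes from that run.
from sklearn.metrics import f1_score

pred_at_029 = (proba["1"] >= 0.29).astype(int)
print("f1 @ 0.50 :", f1_score(y_test, pred))
print("f1 @ 0.29 :", f1_score(y_test, pred_at_029))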
def train_and_evaluate_classifier(
    algorithm,
    training_x,
    testing_x,
    training_y,
    testing_y,
    cols,
    cf='coefficients',
    threshold_plot=True,
):
    """
    Train a classifier on the training data, evaluate its predictions on the
    test data, and visualize several prediction-quality metrics.

    algorithm      - the algorithm to use, with fit, predict and predict_proba methods
    training_x     - predictor variables (training)
    testing_x      - predictor variables (test)
    training_y     - target variable (training)
    testing_y      - target variable (test)
    cf             - one of ["coefficients", "features"] (coefficients for logistic
                     regression, feature importances for tree-based models)
    threshold_plot - if True, also draws the discrimination threshold plot for the model
    """
    # model
    algorithm.fit(training_x, training_y)
    predictions = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    # coefficients / feature importances
    if cf == "coefficients":
        coefficients = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features":
        coefficients = pd.DataFrame(algorithm.feature_importances_)
    else:
        raise ValueError(
            "`cf` value must be one of {'coefficients', 'features'}")

    column_df = pd.DataFrame(cols)
    coef_sumry = pd.merge(coefficients, column_df, left_index=True,
                          right_index=True, how="left")
    coef_sumry.columns = ["coefficients", "features"]
    coef_sumry = coef_sumry.sort_values(by="coefficients", ascending=False)

    print(algorithm)
    print("\n Classification report : \n",
          classification_report(testing_y, predictions))
    print("Accuracy Score : ", accuracy_score(testing_y, predictions))

    # confusion matrix
    conf_matrix = confusion_matrix(testing_y, predictions)

    # roc_auc_score
    model_roc_auc = roc_auc_score(testing_y, predictions)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])

    # prepare the confusion matrix for plotting
    trace1 = go.Heatmap(
        z=conf_matrix,
        x=["Users", "Churn"],
        y=["Users", "Churn"],
        showscale=False,
        colorscale="Picnic",
        name="matrix",
    )

    # prepare the roc curve for plotting
    trace2 = go.Scatter(x=fpr, y=tpr,
                        name="Roc : " + str(model_roc_auc),
                        line=dict(color=('rgb(22, 96, 167)'), width=2))
    trace3 = go.Scatter(x=[0, 1], y=[0, 1],
                        line=dict(color=('rgb(205, 12, 24)'), width=2, dash='dot'))

    # prepare the coefficients for plotting
    trace4 = go.Bar(x=coef_sumry["features"], y=coef_sumry["coefficients"],
                    name="coefficients",
                    marker=dict(color=coef_sumry["coefficients"],
                                colorscale="Picnic",
                                line=dict(width=.6, color="black")))

    # draw
    fig = tls.make_subplots(rows=2, cols=2,
                            specs=[[{}, {}], [{'colspan': 2}, None]])
    fig.append_trace(trace1, 1, 1)
    fig.append_trace(trace2, 1, 2)
    fig.append_trace(trace3, 1, 2)
    fig.append_trace(trace4, 2, 1)

    fig['layout'].update(
        showlegend=False,
        autosize=False,
        height=900,
        width=800,
        plot_bgcolor='rgba(240,240,240, 0.95)',
        paper_bgcolor='rgba(240,240,240, 0.95)',
        margin=dict(b=195),
    )
    fig["layout"]["xaxis2"].update(dict(title="false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title="true positive rate"))
    fig["layout"]["xaxis3"].update(dict(showgrid=True, tickfont=dict(size=10),
                                        tickangle=90))
    py.iplot(fig)

    if threshold_plot:
        visualizer = DiscriminationThreshold(algorithm)
        visualizer.fit(training_x, training_y)
        visualizer.poof()