def show_performance(X, y, clf, conf_matrix_cutoff=0.5, show_auc=True,
                     show_accuracy=True, show_classification_report=True,
                     show_confusion_matrix=True):
    """Print evaluation metrics for a fitted binary classifier.

    Parameters
    ----------
    X : feature matrix accepted by ``clf.predict_proba``.
    y : true binary labels (0/1).
    clf : fitted classifier exposing ``predict_proba``.
    conf_matrix_cutoff : probability threshold used to binarize scores for the
        threshold-based metrics (accuracy / report / confusion matrix).
        AUC is computed on the raw scores and ignores this cutoff.
    show_auc, show_accuracy, show_classification_report, show_confusion_matrix :
        toggles for the individual metric sections.
    """
    # Probability of the positive class.
    y_pred_proba = clf.predict_proba(X)[:, 1]
    # Hard 0/1 labels at the requested cutoff.
    y_pred = [1 if p >= conf_matrix_cutoff else 0 for p in y_pred_proba]
    if show_auc:
        print("AUC:{0:.3f}".format(__roc_auc_score(y, y_pred_proba)), "\n")
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(__accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(__classification_report(y, y_pred))
    if show_confusion_matrix:
        print("Confusion matrix")
        # Compute once and reuse; the original recomputed the matrix for the
        # plot and again for the printout.  labels=[0, 1] pins a 2x2 layout
        # even if one class is absent from y.
        cm = __confusion_matrix(y, y_pred, labels=[0, 1])
        showcm(cm)
        print(cm, "\n")
def _decile_bar(values, ylabel, title):
    """Draw one 10-bar test-sample decile chart with value labels on the bars."""
    width = 0.35
    ind = __np.arange(10)
    __plt.bar(ind, values, width, label='test', color='orange', alpha=0.6)
    __plt.xlabel('Part')
    __plt.ylabel(ylabel)
    __plt.xticks(ind + width / 2,
                 ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10'))
    __plt.title(title)
    for x, v in zip(ind, values):
        __plt.text(x, v, str(round(v, 2)), horizontalalignment='center',
                   verticalalignment='bottom', fontsize=13)
    __plt.legend(loc='best', fontsize=15)


def print_report_tt(Y_TRUE_TRAIN, Y_PREDICTED_TRAIN, Y_TRUE_TEST, Y_PREDICTED_TEST,
                    FEAT_IMP='', dcl=[1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                    CNT_TO_PRINT=10, NAME_MODEL='PTB', NFLAG_PRINT_TABLES=True,
                    TARGET_NAME='', DATA_SET='', METHOD='', TIME_PERIOD='',
                    UNDER='', CALIBR='', PYTHON_SCRIPT='', ORACLE_TABLE='',
                    CSV_FILE='', SQL_FILE=''):
    """Render a full train/test model report figure plus summary tables.

    Draws eight stacked subplots (accumulated lift, lift, ROC, decile lift,
    decile response, gain chart, KS chart, calibration chart) for the train
    and test samples, then optionally prints a model-info table and the
    feature importances.

    Parameters
    ----------
    Y_TRUE_TRAIN, Y_PREDICTED_TRAIN : true labels / predicted scores, train.
    Y_TRUE_TEST, Y_PREDICTED_TEST : true labels / predicted scores, test.
    FEAT_IMP : feature-importance data printed as a DataFrame.
    dcl, CNT_TO_PRINT, NAME_MODEL : accepted for interface compatibility;
        not used by this function.
    NFLAG_PRINT_TABLES : when truthy, print the summary tables.
    TARGET_NAME ... SQL_FILE : free-text metadata echoed into the info table.
    """
    gs = __GridSpec(9, 1, left=0, right=0.2, hspace=0.3, wspace=0.3)
    __plt.figure(figsize=(55, 75))

    train = (Y_PREDICTED_TRAIN, Y_TRUE_TRAIN, 'Train')
    test = (Y_PREDICTED_TEST, Y_TRUE_TEST, 'Test')

    __plt.subplot(gs[0, 0])
    _show_acclift([train, test], NUM_SUBPLOT=gs[0, 0])
    __plt.subplot(gs[1, 0])
    _showlift([train, test], NUM_SUBPLOT=gs[1, 0])
    __plt.subplot(gs[2, 0])
    _showroccurve([train, test], NUM_SUBPLOT=gs[2, 0])

    # Compute the decile table once; the original recomputed it for each of
    # the two bar charts below.
    lift_df_test = get_lift_df(Y_PREDICTED_TEST, Y_TRUE_TEST)

    __plt.subplot(gs[3, 0])
    _decile_bar(lift_df_test['lift'], 'Lift', r'Decile lift')
    __plt.subplot(gs[4, 0])
    # NOTE: 'Responce'/'responce' spelling kept byte-for-byte from the
    # original user-facing labels.
    _decile_bar(lift_df_test['pred_mean'], 'Responce', r'Decile responce')

    __plt.subplot(gs[5, 0])
    _gainchart([train, test], NUM_SUBPLOT=gs[5, 0])
    __plt.subplot(gs[6, 0])
    _kschart([train, test], NUM_SUBPLOT=gs[6, 0])
    __plt.subplot(gs[7, 0])
    _showcc([train, test], NUM_SUBPLOT=gs[7, 0])

    # Summary table: row 0 = Train, row 1 = Test.  index=range(2) allocates
    # both rows up front (the original used range(1) and relied on .loc
    # enlargement to create row 1).
    info = __pd.DataFrame(columns=['BASE', 'TARGET', 'TARGET, %', 'ROC_AUC',
                                   'GINI', 'Target name', 'Data set name',
                                   'Model method', 'Time period',
                                   'Undersampling usage', 'Calibrated',
                                   'Python script location',
                                   'Data set oracle location',
                                   'Data set csv location',
                                   'SQL code location'],
                          index=range(2))
    for row, (y_true, y_pred) in enumerate([(Y_TRUE_TRAIN, Y_PREDICTED_TRAIN),
                                            (Y_TRUE_TEST, Y_PREDICTED_TEST)]):
        auc = __roc_auc_score(y_true, y_pred)  # compute once per sample
        info.loc[row, 'BASE'] = len(y_true)
        info.loc[row, 'TARGET'] = int(sum(y_true))
        info.loc[row, 'TARGET, %'] = float(sum(y_true) / len(y_true)) * 100
        info.loc[row, 'ROC_AUC'] = auc
        info.loc[row, 'GINI'] = 2 * auc - 1

    # Free-text metadata lives on the first row only.
    info.loc[0, 'Target name'] = TARGET_NAME
    info.loc[0, 'Data set name'] = DATA_SET
    info.loc[0, 'Model method'] = METHOD
    info.loc[0, 'Time period'] = TIME_PERIOD
    info.loc[0, 'Undersampling usage'] = UNDER
    info.loc[0, 'Calibrated'] = CALIBR
    info.loc[0, 'Python script location'] = PYTHON_SCRIPT
    info.loc[0, 'Data set oracle location'] = ORACLE_TABLE
    info.loc[0, 'Data set csv location'] = CSV_FILE
    info.loc[0, 'SQL code location'] = SQL_FILE

    info_t = info.T
    info_t.rename(columns={0: 'Train', 1: 'Test'}, inplace=True)
    if NFLAG_PRINT_TABLES:
        print('Model info')
        print(__pd.DataFrame(info_t))
        print('________________________________________________________')
        print('Feature importance')
        print(__pd.DataFrame(FEAT_IMP))
        print('________________________________________________________')
def print_report_t(Y_TRUE_TRAIN, Y_PREDICTED_TRAIN, FEAT_IMP='',
                   dcl=[1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                   CNT_TO_PRINT=10, NAME_MODEL='PTB', NFLAG_PRINT_TABLES=True,
                   TARGET_NAME='', DATA_SET='', METHOD='', TIME_PERIOD='',
                   UNDER='', CALIBR='', PYTHON_SCRIPT='', ORACLE_TABLE='',
                   CSV_FILE='', SQL_FILE=''):
    """Render a train-only model report figure plus summary tables.

    Draws seven stacked subplots (accumulated lift, lift, ROC,
    precision-recall, gain chart, KS chart, calibration chart) for a single
    sample, then optionally prints a model-info table and the feature
    importances.

    Parameters
    ----------
    Y_TRUE_TRAIN, Y_PREDICTED_TRAIN : true labels / predicted scores.
    FEAT_IMP : feature-importance data printed as a DataFrame.
    dcl, CNT_TO_PRINT : accepted for interface compatibility; not used here.
    NAME_MODEL : legend label for the precision-recall curve.
    NFLAG_PRINT_TABLES : when truthy, print the summary tables.
    TARGET_NAME ... SQL_FILE : free-text metadata echoed into the info table.
    """
    gs = __GridSpec(9, 1, left=0, right=0.2, hspace=0.3, wspace=0.3)
    __plt.figure(figsize=(55, 75))
    train = [(Y_PREDICTED_TRAIN, Y_TRUE_TRAIN, 'Train')]

    __plt.subplot(gs[0, 0])
    _show_acclift(train, NUM_SUBPLOT=gs[0, 0])
    __plt.subplot(gs[1, 0])
    _showlift(train, NUM_SUBPLOT=gs[1, 0])
    __plt.subplot(gs[2, 0])
    _showroccurve(train, NUM_SUBPLOT=gs[2, 0])

    # Precision-recall panel.
    __plt.subplot(gs[3, 0])
    average_precision = __average_precision_score(Y_TRUE_TRAIN, Y_PREDICTED_TRAIN)
    precision_x, recall_x, _ = __precision_recall_curve(Y_TRUE_TRAIN, Y_PREDICTED_TRAIN)
    __plt.step(recall_x, precision_x, alpha=0.2, where='post', label=NAME_MODEL)
    __plt.fill_between(recall_x, precision_x, step='post', alpha=0.2)
    __plt.xlabel('Recall')
    __plt.ylabel('Precision')
    __plt.ylim([0.0, 1.05])
    __plt.xlim([0.0, 1.0])
    __plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))
    __plt.legend(loc='best', fontsize=11)

    __plt.subplot(gs[4, 0])
    _gainchart(train, NUM_SUBPLOT=gs[4, 0])
    __plt.subplot(gs[5, 0])
    _kschart(train, NUM_SUBPLOT=gs[5, 0])
    __plt.subplot(gs[6, 0])
    _showcc(train, NUM_SUBPLOT=gs[6, 0])

    # One-row summary table (metrics + free-text metadata).
    info = __pd.DataFrame(columns=['BASE', 'TARGET', 'TARGET, %', 'ROC_AUC',
                                   'GINI', 'Target name', 'Data set name',
                                   'Model method', 'Time period',
                                   'Undersampling usage', 'Calibrated',
                                   'Python script location',
                                   'Data set oracle location',
                                   'Data set csv location',
                                   'SQL code location'],
                          index=range(1))
    # Compute AUC once; the original called __roc_auc_score twice on the
    # same data (for ROC_AUC and again for GINI).
    auc = __roc_auc_score(Y_TRUE_TRAIN, Y_PREDICTED_TRAIN)
    info.loc[0, 'BASE'] = len(Y_TRUE_TRAIN)
    info.loc[0, 'TARGET'] = int(sum(Y_TRUE_TRAIN))
    info.loc[0, 'TARGET, %'] = float(sum(Y_TRUE_TRAIN) / len(Y_TRUE_TRAIN)) * 100
    info.loc[0, 'ROC_AUC'] = auc
    info.loc[0, 'GINI'] = 2 * auc - 1
    info.loc[0, 'Target name'] = TARGET_NAME
    info.loc[0, 'Data set name'] = DATA_SET
    info.loc[0, 'Model method'] = METHOD
    info.loc[0, 'Time period'] = TIME_PERIOD
    info.loc[0, 'Undersampling usage'] = UNDER
    info.loc[0, 'Calibrated'] = CALIBR
    info.loc[0, 'Python script location'] = PYTHON_SCRIPT
    info.loc[0, 'Data set oracle location'] = ORACLE_TABLE
    info.loc[0, 'Data set csv location'] = CSV_FILE
    info.loc[0, 'SQL code location'] = SQL_FILE

    info_t = info.T
    info_t.rename(columns={0: 'Train'}, inplace=True)
    if NFLAG_PRINT_TABLES:
        print('Model info')
        print(__pd.DataFrame(info_t))
        print('________________________________________________________')
        print('Feature importance')
        print(__pd.DataFrame(FEAT_IMP))
        print('________________________________________________________')
def _showroccurve(arr, NUM_SUBPLOT=111):
    """Plot ROC curves for one or more score series on the current axes.

    Parameters
    ----------
    arr : iterable of (scores, targets, label) triples.
    NUM_SUBPLOT : accepted for interface compatibility with the other
        chart helpers; not used here.
    """
    colors = __itertools.cycle(["royalblue", "darkorange", "forestgreen"])
    # Random-classifier diagonal: draw once, not once per series as the
    # original did inside the loop.
    __plt.plot([0, 1], [0, 1], 'k--')
    for scores, targets, label in arr:
        df_scores = __pd.DataFrame(list(zip(scores, targets)),
                                   columns=['score', 'target'])
        fpr_ptb, tpr_ptb, _ = __roc_curve(df_scores['target'], df_scores['score'])
        # Compute AUC once per series (original called __roc_auc_score
        # twice: for AUC and again for GINI).
        auc = __roc_auc_score(df_scores['target'], df_scores['score'])
        __plt.plot(fpr_ptb, tpr_ptb,
                   label=str(label) + ' ROC AUC={} GINI={}'.format(
                       round(auc, 3),
                       str(round((auc - 0.5) * 2.0, 3))),
                   linewidth=2, linestyle='--', color=next(colors), alpha=0.7)
    __plt.xlabel('False positive rate')
    __plt.ylabel('True positive rate')
    __plt.title('ROC', fontsize=13)
    __plt.legend(loc='best', fontsize=12)
def print_lift_roc_pr_plot(Y_TRUE, Y_PREDICTED,
                           dcl=[1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                           CNT_TO_PRINT=10, NAME_MODEL='PTB',
                           NFLAG_PRINT_TABLE=True, NUM_FIGURE=1,
                           NUM_upper_text=0.05):
    """Plot cumulative lift, per-decile lift, ROC, PR and calibration panels.

    Parameters
    ----------
    Y_TRUE, Y_PREDICTED : true binary labels / predicted scores.
    dcl : decile grid (percent cutoffs) for the cumulative-lift table.
    CNT_TO_PRINT : number of leading points to mark and annotate.
    NAME_MODEL : series label used in legends and in the printed table.
    NFLAG_PRINT_TABLE : when truthy, print the cumulative-lift table.
    NUM_FIGURE : matplotlib figure number.
    NUM_upper_text : vertical offset of the point annotations.
    """
    lift_df_ptb = __lift1(Y_TRUE, Y_PREDICTED, dcl)
    if NFLAG_PRINT_TABLE:
        print(NAME_MODEL + '\n' + str(lift_df_ptb[['k', 'lift_cumulative',
                                                   'precision', 'tp',
                                                   'num_samples',
                                                   'threshold']]) + '\n')
    # Even-decile grid for the per-decile (non-cumulative) lift panel.
    s_lift_df_ptb = __lift1(Y_TRUE, Y_PREDICTED,
                            [10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

    __plt.figure(NUM_FIGURE, figsize=(20, 20))

    # Panel 1: cumulative lift by decile.
    __plt.subplot(321)
    int_dcl = lift_df_ptb.k.astype(int)
    __plt.plot(int_dcl, lift_df_ptb['lift_cumulative'], label=NAME_MODEL,
               linewidth=3)
    __plt.xticks(int_dcl)
    __plt.plot(int_dcl[0:CNT_TO_PRINT + 1],
               lift_df_ptb['lift_cumulative'][0:CNT_TO_PRINT + 1], 'o',
               label='Lift Value ' + NAME_MODEL)
    for x, v in zip(int_dcl[0:CNT_TO_PRINT + 1],
                    lift_df_ptb['lift_cumulative'][0:CNT_TO_PRINT + 1]):
        __plt.text(x, v + NUM_upper_text, str(round(v, 2)), fontsize=11)
    __plt.legend(loc='upper right', fontsize=10, frameon=False)
    __plt.title("__LIFT_CUMULATIVE__")
    __plt.ylabel('Lift')
    __plt.xlabel('Decile')

    # Panel 2: per-decile lift.
    __plt.subplot(322)
    int_dcl = s_lift_df_ptb.k.astype(int)
    __plt.plot(int_dcl, s_lift_df_ptb['lift_k'], label=NAME_MODEL, linewidth=3)
    __plt.xticks(int_dcl)
    __plt.plot(int_dcl[0:CNT_TO_PRINT + 1],
               s_lift_df_ptb['lift_k'][0:CNT_TO_PRINT + 1], 'o',
               label='Lift Value ' + NAME_MODEL)
    for x, v in zip(int_dcl[0:CNT_TO_PRINT + 1],
                    s_lift_df_ptb['lift_k'][0:CNT_TO_PRINT + 1]):
        __plt.text(x, v + NUM_upper_text, str(round(v, 2)), fontsize=11)
    __plt.legend(loc='upper right', fontsize=10, frameon=False)
    __plt.title("__LIFT__")
    __plt.ylabel('Lift')
    __plt.xlabel('Decile')

    # Panel 3: ROC curve with AUC/GINI in the legend.
    __plt.subplot(323)
    __plt.plot([0, 1], [0, 1], 'k--')
    fpr_ptb, tpr_ptb, _ = __roc_curve(Y_TRUE, Y_PREDICTED)
    # Compute AUC once; the original called __roc_auc_score twice.
    auc = __roc_auc_score(Y_TRUE, Y_PREDICTED)
    __plt.plot(fpr_ptb, tpr_ptb,
               label=NAME_MODEL + ' ROC AUC={} GINI={}'.format(
                   round(auc, 3), str(round((auc - 0.5) * 2.0, 3))),
               linewidth=2, linestyle='--')
    __plt.xlabel('False positive rate')
    __plt.ylabel('True positive rate')
    __plt.title('ROC', fontsize=15)
    __plt.legend(loc='best', fontsize=10)

    # Panel 4: precision-recall curve.
    __plt.subplot(324)
    average_precision = __average_precision_score(Y_TRUE, Y_PREDICTED)
    precision_x, recall_x, _ = __precision_recall_curve(Y_TRUE, Y_PREDICTED)
    __plt.step(recall_x, precision_x, alpha=0.2, where='post', label=NAME_MODEL)
    __plt.fill_between(recall_x, precision_x, step='post', alpha=0.2)
    __plt.xlabel('Recall')
    __plt.ylabel('Precision')
    __plt.ylim([0.0, 1.05])
    __plt.xlim([0.0, 1.0])
    __plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))
    __plt.legend(loc='best', fontsize=10)

    # Panel 5: calibration chart via the shared helper.
    _showcc([(Y_PREDICTED, Y_TRUE, NAME_MODEL)], NUM_SUBPLOT=325)