# seed=1000  # random seed
# #eval_metric= 'auc')

# In[300]:


# Save the trained booster (disabled)
#bst.save_model('0001.model')

## Compute KS for train and test
# Predict probabilities with the trained XGBoost booster on both DMatrix splits.
train_pred = bst.predict(dtrain)
train_labels = dtrain.get_label()
test_pred = bst.predict(dtest)
# BUG FIX: the original re-assigned `train_labels` here, clobbering the
# training labels; the test-split labels belong in `test_labels`.
test_labels = dtest.get_label()

# KS / ROC diagnostics via scorecardpy.
train_perf = sc.perf_eva(y_train, train_pred, title = "train")
test_perf = sc.perf_eva(y_test, test_pred, title = "test")

# PSI
# Build machine-learning model
#train_p = train_pred.copy()
#test_p  = test_pred.copy()
# BUG FIX: `preds` was never defined (NameError); print the test predictions.
print("各样本prediction为", test_pred)


# In[285]:


train_pred.shape

# 예제 #2 (Example #2)
# 0
    # NOTE(review): torn fragment — the enclosing `for i in ...:` header and the
    # line that builds `train_pred_new` (thresholded predictions) are missing
    # from this scrape; compare with the intact test-split loop below.
    # Confusion matrix on the training split at the current threshold i.
    cm =confusion_matrix(y_train, train_pred_new)
    # NOTE(review): cm[0,0]/(row-0 sum) is the rate for label 0 — the TPR/FPR
    # names assume label 0 is the "positive" class; confirm label encoding.
    TPR = round(cm[0,0]/(cm[0,0]+cm[0,1]),4)
    FPR = round(cm[1,0]/(cm[1,0]+cm[1,1]),4)
    # KS statistic at this threshold.
    ks = round(TPR - FPR,4)
    #dpd_rate = round(cm[1,0]/(cm[0,0]+cm[1,0]),4)
    # Overall accuracy: diagonal / total.
    acc = round((cm[0,0]+cm[1,1])/cm.sum(),4)
    print(i, TPR, FPR, ks, acc)

# Test split: sweep classification cutoffs and report metrics at each one.
#test_pred_new = np.array(pd.DataFrame(test_pred).iloc[:, 0].apply(lambda x: 0 if x<0.35 else 1))
proba_range = np.arange(0.2,0.9,0.01)
#print(proba_range)
print('阈值', '真正率', '假正率', 'ks', '准确率')
for threshold in proba_range:
    threshold = round(threshold, 2)
    # Binarize the scores at the current cutoff: score < cutoff -> 0, else 1.
    scores = pd.DataFrame(test_pred).iloc[:, 0]
    test_pred_new = np.array(scores.apply(lambda s: 0 if s < threshold else 1))
    cm = confusion_matrix(y_test, test_pred_new)
    # Bind the four cells once for readability.
    c00, c01 = cm[0, 0], cm[0, 1]
    c10, c11 = cm[1, 0], cm[1, 1]
    TPR = round(c00 / (c00 + c01), 4)
    FPR = round(c10 / (c10 + c11), 4)
    # KS = TPR - FPR at this cutoff.
    ks = round(TPR - FPR, 4)
    acc = round((c00 + c11) / cm.sum(), 4)
    dpd = round(c10 / (c00 + c10), 4)
    print(threshold, TPR, FPR, ks, acc, dpd)


# KS / ROC plots for both splits.
train_perf = sc.perf_eva(y_train, train_pred, title='train')
test_perf = sc.perf_eva(y_test, test_pred, title='test')

# Persist the fitted model.
joblib.dump(lr, 'insight.m')
# 예제 #3 (Example #3)
# 0
# Split the WOE-transformed frames into features and labels (target = 'BAD').
feature_mask = train_woe.columns != 'BAD'
X_train = train_woe.loc[:, feature_mask]
y_test = test_woe.loc[:, 'BAD']
X_test = test_woe.loc[:, feature_mask]

# Fit a logistic regression via statsmodels GLM (binomial family).
lr = sm.GLM(y_train, X_train, family=sm.families.Binomial())
fit = lr.fit()

fit.summary()

# Predicted probabilities for both splits.
train_pred = fit.predict(X_train)
test_pred = fit.predict(X_test)

# Diagnostic plots on the test split: KS curve, then ROC curve.
test_perf = sc.perf_eva(y_test, test_pred, title="test", plot_type=['ks'])
test_perf = sc.perf_eva(y_test, test_pred, title="test", plot_type=['roc'])

class ModelDetails():
    """Adapter holding statsmodels fit results under the attribute names
    (`intercept_`, `coef_`) of scikit-learn's LogisticRegression, which is
    the only model class the scorecardpy package accepts.
    """

    def __init__(self, intercept, coefs):
        # scikit-learn exposes these as nested sequences, hence the wrapping.
        self.intercept_ = [intercept]
        self.coef_ = [coefs.tolist()]

# Wrap the GLM coefficients for scorecardpy: params[0] is passed as the
# intercept, the remaining params as slopes — assumes the first column of
# X_train is a constant term; TODO confirm against the fit above.
model = ModelDetails(fit.params[0], fit.params[1:])
# 예제 #4 (Example #4)
# 0
    def evaluate_main(self):
        """Build the scorecard and evaluate the fitted model on the train,
        test, and OTT (out-of-time) splits, writing everything to
        "<filename>_report.xlsx" plus KS/ROC PNG plots per split.

        Relies on instance state prepared earlier in the pipeline:
        `self.model` (fitted, sklearn-like), the WOE-transformed frames,
        `self.bins_adj`, `self.final_features`, `self.target_name`, and the
        score-scaling knobs `self.base_score` / `self.double_score` (PDO).
        """
        writer = pd.ExcelWriter("{}_report.xlsx".format(self.filename))

        # Baseline odds (count of label 1 over label 0 in training data);
        # anchors the points-to-odds transform: score = a - b * log(odds).
        odds0 = float(self.df_train_woe[self.target_name].value_counts()[1]) / float(self.df_train_woe[self.target_name].value_counts()[0])
        b = self.double_score / np.log(2)
        a = self.base_score + b * np.log(odds0)
        card = sc.scorecard(self.bins_adj, self.model, self.final_features, points0=self.base_score,
                            odds0=odds0,
                            pdo=self.double_score)
        # Flatten the per-variable card dict into a single sheet.
        card_df = pd.DataFrame(columns=["variable", "bin", "points"])
        for key, value in card.items():
            card_df = pd.concat([card_df, value])
        card_df.to_excel(writer, 'card_result')

        # ---- Train split ----
        self.train_pred = self.model.predict_proba(self.df_train_woe[self.final_features])[:, 1]
        perf = sc.perf_eva(self.df_train_woe[self.target_name], self.train_pred, title="train")
        print("On train-data, the evaluation follows:\nks={}, auc={}, gini={}".format(perf["KS"], perf["AUC"],
                                                                                      perf["Gini"]))
        perf["pic"].savefig("{}_train.png".format(self.filename))

        _score = sc.scorecard_ply(self.df_train, card, print_step=0)
        _score["flag"] = self.df_train_woe[self.target_name]
        _score["pred"] = self.train_pred

        _rs = self._get_score_table(_score, a, b)
        _rs.to_excel(writer, 'train_result')

        # ---- Test split (skipped when the frame is empty) ----
        if self.df_test.any().any():
            y_test = self.df_test_woe[self.target_name]
            self.test_pred = self.model.predict_proba(self.df_test_woe[self.final_features])[:, 1]
            perf = sc.perf_eva(y_test, self.test_pred, title="test")
            print("On test-data, the evaluation follows:\nks={}, auc={}, gini={}".format(perf["KS"],
                                                                                          perf["AUC"],
                                                                                          perf["Gini"]))
            perf["pic"].savefig("{}_test.png".format(self.filename))

            _score = sc.scorecard_ply(self.df_test, card, print_step=0)
            _score["flag"] = self.df_test_woe[self.target_name]
            _score["pred"] = self.test_pred

            _rs = self._get_score_table(_score, a, b)
            _rs.to_excel(writer, 'test_result')

        # ---- OTT split (best-effort; skipped when the frame is empty) ----
        if self.df_ott.any().any():
            y_ott = self.df_ott_woe[self.target_name]
            self.ott_pred = self.model.predict_proba(self.df_ott_woe[self.final_features])[:, 1]
            try:
                perf = sc.perf_eva(y_ott, self.ott_pred , title="ott")
                print("On ott-data, the evaluation follows:\nks={}, auc={}, gini={}".format(perf["KS"],
                                                                                        perf["AUC"],
                                                                                        perf["Gini"]))
                # BUG FIX: was "{}_test.png", silently overwriting the test plot.
                perf["pic"].savefig("{}_ott.png".format(self.filename))

                _score = sc.scorecard_ply(self.df_ott, card, print_step=0)
                _score["flag"] = self.df_ott_woe[self.target_name]
                _score["pred"] = self.ott_pred

                _rs = self._get_score_table(_score, a, b)
                _rs.to_excel(writer, 'ott_result')

            # BUG FIX: narrowed the bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit) and fixed the message typo.
            except Exception:
                self.log.info("Cannot calculate the ott data!")

        # Coefficients as feature importances, keyed by feature name.
        importance = {x: y for x, y in zip(self.final_features, self.model.coef_[0])}

        # .copy() avoids pandas SettingWithCopyWarning when adding a column
        # to a filtered slice of self.iv_df.
        iv_df = self.iv_df[self.iv_df['variable'].isin(self.final_features)].copy()
        iv_df["coef"] = iv_df.variable.map(lambda x: importance[x])
        iv_df.to_excel(writer, 'feature_importance')

        writer.close()

        self.log.info("全部环节结束,请查看相关文件!")