示例#1
0
    def woe_transform(self, train, test):
        """Filter variables, encode categoricals and apply WOE binning.

        The WOE bins are fitted on the filtered training frame only and are
        then applied to both the training frame and a fixed column subset of
        the test frame.

        Args:
            train: Training frame; screened against the 'DEFAULT_FLAG'
                target, with 'INDUSTRY' always kept (``var_kp``).
            test: Test frame; must contain every column named in
                ``train_columns`` below.

        Returns:
            Tuple ``(train_woe, test_woe)`` produced by ``sc.woebin_ply``.
        """
        # Variable screening on the training data; 'INDUSTRY' is kept
        # regardless of the filter outcome.
        train = sc.var_filter(train, 'DEFAULT_FLAG', var_kp='INDUSTRY')
        # The return value is discarded, so encode_categorical is expected to
        # modify the frame in place -- TODO confirm against its definition.
        self.encode_categorical(train)
        bins = sc.woebin(train, 'DEFAULT_FLAG')
        train_woe = sc.woebin_ply(train, bins)

        train_columns = [
            'ACCESS_CREDIT', 'ASSESSMENT_YEAR', 'MEDIUM_TERM_LIQUIDITY',
            'OWNERS_MANAGEMENT', 'PRODUCT_DEMAND', 'PROFITABILITY',
            'SHORT_TERM_LIQUIDITY', 'TURNOVER', 'DEFAULT_FLAG', 'INDUSTRY'
        ]
        # Take an explicit copy: the column subset is derived from `test`, and
        # mutating it in place would otherwise trigger pandas'
        # SettingWithCopyWarning. The caller's `test` frame stays untouched.
        test_selected = test[train_columns].copy()
        self.encode_categorical(test_selected)
        test_woe = sc.woebin_ply(test_selected, bins)

        return train_woe, test_woe
        # Coarse screening by dominant value: flag every column of X whose
        # mode covers more than mode_ratio_threshold of its non-null entries,
        # then drop the flagged columns from X.
        # NOTE(review): `X` and `mode` are not defined in the visible code;
        # `mode` is presumably scipy.stats.mode -- confirm.
        mode_ratio_threshold=0.95 # cut-off ratio for the dominant value

        raw_feature_num=len(X.columns)
        # One flag per column of X; a non-zero flag marks the column for removal.
        if_delete_feature=np.zeros([raw_feature_num,1])
        for i in range(0,raw_feature_num):
            # Numerator: number of rows in column i equal to the first value
            # returned by mode() over the column's non-null entries.
            # Denominator: number of non-null entries in column i.
            # The flag is set when that ratio exceeds the threshold.
            # NOTE(review): an all-null column gives an empty selection here
            # (zero denominator, or an earlier failure inside `mode`) --
            # confirm such columns are removed beforehand.
            if_delete_feature[i]=(len(np.where(X.iloc[:,i]==mode(X.iloc[np.where(~X.iloc[:,i].isna())[0],i])[0][0])[0])/len(X.iloc[np.where(~X.iloc[:,i].isna())[0],i])>mode_ratio_threshold)
        # np.where on the (n, 1) flag array yields row indices first; those are
        # the positions of the columns whose flag is still 0, i.e. the ones kept.
        X=X.iloc[:,np.where(if_delete_feature==0)[0]]


# #### Comparing scorecard's built-in filter with the manual screening on the same variables

# In[32]:


# Result of coarse screening the Lixin sample with scorecard's built-in filter
dt_s = sc.var_filter(data1, y="flagy")
print(dt_s.shape)


# In[33]:


# Lixin sample, manual screening (the original note is truncated here)
''' 1.粗筛变量,删掉缺失值超过nan_ratio_threshold的变量 '''
# Step 1 (English rendering of the note above): coarse screening -- drop
# variables whose missing-value ratio exceeds nan_ratio_threshold.

nan_ratio_threshold = 0.95  # highest tolerated share of missing values

# Indicator matrix with the same shape as X: 1.0 where the cell is null,
# 0.0 elsewhere.
count_null = X.isnull().values.astype(float)
# Share of null cells per column. The column sum is vectorised instead of
# using the builtin sum(), which loops over rows in Python. max(..., 1)
# keeps a frame with zero rows from raising ZeroDivisionError; every ratio
# is then 0 and all columns are kept.
count_null_sumfactor = count_null.sum(axis=0) / max(np.shape(X)[0], 1)
# Keep the columns whose null share is at most the threshold.
X = X.loc[:, count_null_sumfactor <= nan_ratio_threshold]
示例#3
0
"""
Created on Tue Aug  4 20:03:21 2020

@author: 87875
"""

# Traditional Credit Scoring Using Logistic Regression
import scorecardpy as sc
import matplotlib.pyplot as plt

# ---- data preparation ----
# Load the German credit sample data shipped with scorecardpy.
dat = sc.germancredit()

# Screen variables by missing rate, IV and identical-value rate.
dt_s = sc.var_filter(dat, y="creditability")

# Split the screened data into its train and test parts.
split_parts = sc.split_df(dt_s, "creditability")
train, test = split_parts.values()

# ---- WOE binning ----
bins = sc.woebin(dt_s, y="creditability")
print(type(bins))
# List the name of every binned variable.
for var_name in bins:
    print(var_name)

# Inspect the binning result of a single variable.
purpose_bins = bins["purpose"]
print(purpose_bins)
print(purpose_bins.columns)
print(type(purpose_bins))
# Optional plot of the same binning result:
# sc.woebin_plot(bins["purpose"])
# plt.show()
示例#4
0
# Explanatory variables: every column after the first one.
data_X = data_dateDiff.iloc[:, 1:]

# Join the explanatory variables with the response, then remove the
# '最近一次贷款时间' column.
data = pd.concat([data_X, data_Y], axis=1).drop(columns='最近一次贷款时间')

# Label-encode the response variable (0/1 coding per the original note).
labelEncoder = LabelEncoder()
status_codes = labelEncoder.fit_transform(data['status'].values)
data['status'] = status_codes
# Cast every column, including the encoded response, to string.
data = data.astype(str)


# Per the original note, the default filter removes variables with
# IV < 0.02, missing rate > 95% or a single-category share > 95%.
dt_s = sc.var_filter(data, y='status')
print('变量预处理前后变化:', data.shape, '->', dt_s.shape)


# WOE binning, fitted on the filtered data.
bins = sc.woebin(dt_s, y='status')

# Train/test split; report the number of rows on each side.
train, test = sc.split_df(dt_s, 'status').values()
print('训练集、测试集划分比例为:', len(train), ':', len(test))

# Apply the fitted bins to both parts, train first and then test.
train_woe, test_woe = (sc.woebin_ply(part, bins) for part in (train, test))