def test_select_exclude():
    """Selecting with ``exclude=['A']`` must leave exactly A, D, F and target."""
    selected = select(
        frame,
        target='target',
        empty=0.8,
        iv=0.2,
        corr=0.7,
        exclude=['A'],
    )
    remaining = selected.columns.tolist()
    assert remaining == ['A', 'D', 'F', 'target']
def test_select():
    """Selecting with the given thresholds must leave exactly D, F and target."""
    selected = select(
        frame,
        target='target',
        empty=0.8,
        iv=0.2,
        corr=0.7,
    )
    remaining = selected.columns.tolist()
    assert remaining == ['D', 'F', 'target']
# missing1 = data.columns[data.isnull().sum() != 0].tolist() # missing2 = (data.isnull().sum() / len(data) * 100).sort_values(ascending=False) # print(missing1, '\n', missing2) # data.to_csv('data.csv', index=False) print('处理完成,数据共有{}行,{}列'.format(data.shape[0], data.shape[1]), '\n' * 2) # ---------------------------------------------------------------------------------------------------------------------------------------------------- # 特征筛选A print('特征第一次筛选'.center(60, '—')) train = data[data['split'].isin(['Q1', 'Q2', 'Q3'])].drop('split', axis=1) test = data[data['split'].isin(['Q4'])].drop('split', axis=1) train_s, drops = select(train, target='loan_status', iv=0.005, corr=0.8, return_drop=True) test_s = test[train_s.columns] print('IV筛选不通过的特征为:\n', drops['iv'], '\n', 'corr筛选不通过的特征为:\n', drops['corr']) print('处理完成,剩余{}特征'.format(train_s.shape[1]), '\n' * 2) # ---------------------------------------------------------------------------------------------------------------------------------------------------- # 分箱 print('卡方分箱中'.center(60, '—')) comb = Combiner() columns = train_s.columns def combine(data, target, columns=[], exclude=[]): # 精细化分箱 for i in columns[~columns.isin(exclude)]: