示例#1
0
def test_select_exclude():
    """Check `select` when column 'A' is passed via `exclude`.

    The expected result keeps exactly the columns A, D, F and target,
    in that order.
    """
    options = {
        'target': 'target',
        'empty': 0.8,
        'iv': 0.2,
        'corr': 0.7,
        'exclude': ['A'],
    }
    selected = select(frame, **options)
    remaining = selected.columns.tolist()
    assert remaining == ['A', 'D', 'F', 'target']
示例#2
0
def test_select():
    """Check `select` with the empty/iv/corr thresholds and no exclusions.

    The expected result keeps exactly the columns D, F and target,
    in that order.
    """
    selected = select(
        frame,
        target='target',
        empty=0.8,
        iv=0.2,
        corr=0.7,
    )
    remaining = selected.columns.tolist()
    assert remaining == ['D', 'F', 'target']
示例#3
0
# Disabled diagnostics (kept for reference): list the columns that contain
# any nulls, compute the percentage of nulls per column sorted descending,
# print both, and optionally dump the frame to 'data.csv'.
# missing1 = data.columns[data.isnull().sum() != 0].tolist()
# missing2 = (data.isnull().sum() / len(data) * 100).sort_values(ascending=False)
# print(missing1, '\n', missing2)
# data.to_csv('data.csv', index=False)

# Report the row and column counts of `data`; the extra '\n' * 2 argument
# just pads the console output with blank lines after the message.
print('处理完成,数据共有{}行,{}列'.format(data.shape[0], data.shape[1]), '\n' * 2)

# ----------------------------------------------------------------------------------------------------------------------------------------------------
# Feature selection A (first pass)
# Banner line: the title centred in 60 characters, padded with '—'.
print('特征第一次筛选'.center(60, '—'))
# Split on the 'split' column: rows labelled Q1-Q3 form the training set and
# rows labelled Q4 form the test set. The 'split' column itself is dropped
# from both frames.
train = data[data['split'].isin(['Q1', 'Q2', 'Q3'])].drop('split', axis=1)
test = data[data['split'].isin(['Q4'])].drop('split', axis=1)

# NOTE(review): `select` is defined/imported outside this view — presumably
# toad's feature selection, filtering by the IV and correlation thresholds
# given here; confirm. With return_drop=True it returns a second value that
# is indexed below by 'iv' and 'corr'.
train_s, drops = select(train,
                        target='loan_status',
                        iv=0.005,
                        corr=0.8,
                        return_drop=True)
# Keep only the columns that survived selection on the training set, so the
# test frame has the same columns in the same order.
test_s = test[train_s.columns]
# Print the features reported under drops['iv'] and drops['corr'].
print('IV筛选不通过的特征为:\n', drops['iv'], '\n', 'corr筛选不通过的特征为:\n', drops['corr'])
# NOTE(review): train_s.shape[1] counts every remaining column; presumably
# this includes the 'loan_status' target — confirm if an exact feature count
# matters.
print('处理完成,剩余{}特征'.format(train_s.shape[1]), '\n' * 2)

# ----------------------------------------------------------------------------------------------------------------------------------------------------
# Binning
# Banner line for the binning stage (the message announces chi-square binning).
print('卡方分箱中'.center(60, '—'))
# NOTE(review): `Combiner` comes from outside this view — presumably toad's
# binning transformer; confirm.
comb = Combiner()
# Column labels of the selected training frame.
columns = train_s.columns


def combine(data, target, columns=[], exclude=[]):  # 精细化分箱
    for i in columns[~columns.isin(exclude)]: