예제 #1
0
def test_stepwise_zero():
    """Check the columns stepwise returns when feature ``X`` is all zeros.

    The input frame has a constant-zero ``X``, a random float ``Z`` and a
    random 0/1 target ``Y``; the result must contain exactly ``Z`` and ``Y``.
    """
    n_rows = 500
    # Both random columns come from the global NumPy RNG; Z is drawn first.
    columns = {
        'X': np.zeros(n_rows),
        'Z': np.random.rand(n_rows),
        'Y': np.random.randint(2, size=n_rows),
    }
    selected = stepwise(pd.DataFrame(columns), target='Y')
    expected_columns = ['Z', 'Y']
    assert selected.columns.tolist() == expected_columns
예제 #2
0
def test_stepwise_return_drop():
    """Check the drop list stepwise returns when ``return_drop=True``.

    Missing values in the shared ``frame`` fixture are replaced with -1
    before the call; only the second element of the returned pair is checked.
    """
    filled = frame.fillna(-1)
    _, drop_list = stepwise(filled, target='target', return_drop=True)
    expected_drops = ['B', 'A', 'D']
    assert expected_drops == drop_list
예제 #3
0
def test_stepwise_exclude():
    """Check the columns stepwise returns when ``exclude='B'`` is passed.

    Missing values in the shared ``frame`` fixture are replaced with -1
    before the call; ``B`` is expected to be present in the result.
    """
    filled = frame.fillna(-1)
    selected = stepwise(filled, target='target', exclude='B')
    expected_columns = ['B', 'C', 'E', 'F', 'target']
    assert selected.columns.tolist() == expected_columns
예제 #4
0
def test_stepwise_forward():
    """Check the columns stepwise returns with ``direction='forward'``.

    Missing values in the shared ``frame`` fixture are replaced with -1
    before the call.
    """
    filled = frame.fillna(-1)
    selected = stepwise(filled, target='target', direction='forward')
    expected_columns = ['C', 'E', 'F', 'target']
    assert selected.columns.tolist() == expected_columns
예제 #5
0
def test_stepwise_forward_when_best_is_first():
    """Check forward stepwise on a frame whose columns are reordered.

    The shared ``frame`` fixture is rearranged so ``E`` is the first column
    (presumably the best-scoring feature, going by the test name -- confirm),
    missing values are replaced with -1, and the result's column order is
    asserted.
    """
    column_order = ['E', 'F', 'B', 'A', 'D', 'C', 'target']
    reordered = frame[column_order].fillna(-1)
    selected = stepwise(reordered, target='target', direction='forward')
    expected_columns = ['E', 'F', 'C', 'target']
    assert selected.columns.tolist() == expected_columns
예제 #6
0
def test_stepwise_ks():
    """Check the columns forward stepwise returns with ``criterion='ks'``.

    Missing values in the shared ``frame`` fixture are replaced with -1
    before the call.
    """
    filled = frame.fillna(-1)
    selected = stepwise(
        filled,
        target='target',
        criterion='ks',
        direction='forward',
    )
    expected_columns = ['A', 'C', 'target']
    assert selected.columns.tolist() == expected_columns
예제 #7
0
def test_stepwise_lr():
    """Check the columns forward stepwise returns with ``estimator='lr'``.

    Missing values in the shared ``frame`` fixture are replaced with -1
    before the call.
    """
    filled = frame.fillna(-1)
    selected = stepwise(
        filled,
        target='target',
        estimator='lr',
        direction='forward',
    )
    expected_columns = ['C', 'target']
    assert selected.columns.tolist() == expected_columns
예제 #8
0
# Load the train and test tables from the working directory.
train_w = pd.read_csv('train_w.csv')
test_w = pd.read_csv('test_w.csv')

# First pass: run select() with IV and correlation thresholds and ask for the
# dropped features as well (indexed below by the keys 'iv' and 'corr').
train_s2, drops = select(
    train_w,
    target='loan_status',
    iv=0.005,
    corr=0.8,
    return_drop=True,
)
# Restrict the test table to the columns that survived on the train table.
test_s2 = test_w[train_s2.columns]
print(
    'IV筛选不通过的特征为:\n',
    drops['iv'],
    '\n',
    'corr筛选不通过的特征为:\n',
    drops['corr'],
)
print('处理完成,剩余{}特征'.format(len(train_s2.columns)))

# Second pass: stepwise selection on the already-filtered train table.
print('Logistic逐步回归筛选中')
train_step = stepwise(
    train_s2,
    target='loan_status',
    estimator='ols',
    criterion='aic',
    direction='both',
)
# Again mirror the surviving train columns onto the test table.
test_step = test_s2[train_step.columns]
print('处理完成,剩余{}特征'.format(len(train_step.columns)), '\n\n')

# Disabled alternative: write train and test together as one file.
# data_step = pd.concat([train_step, test_step], join='inner')
# data_step.to_csv('data_step.csv', index=False)
train_step.to_csv('train_step.csv', index=False)
test_step.to_csv('test_step.csv', index=False)

# ----------------------------------------------------------------------------------------------------------------------------------------------------
# Model training
print('模型训练'.center(60, '—'))
# Reload the tables that were just written to disk.
train_step = pd.read_csv('train_step.csv')
test_step = pd.read_csv('test_step.csv')
# Print the number of rows per loan_status value.
_status_column = train_step['loan_status']
print(_status_column.groupby(_status_column).count())