예제 #1
0
def num_bin(df:pd.DataFrame,cols:list=None,target:str='target',specials:list=None,
            bin_num_limit:int=5,count_distr_limit:float=0.05,sc_method='chimerge',
            non_mono_cols:list=None,init_bins=10,init_min_samples=0.05,init_method='chi',**kwargs):
    """Bin the given columns against ``target`` and collect each column's total IV.

    Coarse binning, monotonicity check, binning result.

    Columns listed in ``non_mono_cols`` are passed straight to ``woebin``.
    Every other column is first split by a ``Combiner`` fit, the exported
    split points are passed through ``monotonous_bin``, and the resulting
    breaks are handed to ``woebin`` as ``breaks_list``.

    Args:
        df: Frame holding the feature columns and the target column.
        cols: Columns to bin. When ``None`` or empty, every column of ``df``
            except ``target`` is used.
        target: Name of the target column.
        specials: Special values; when non-empty the same object is assigned
            to every column in ``cols`` before being passed on as
            ``special_values``.
        bin_num_limit: Forwarded to ``woebin``.
        count_distr_limit: Forwarded to ``woebin``.
        sc_method: Forwarded to ``woebin`` as ``method``.
        non_mono_cols: Columns that skip the ``Combiner`` /
            ``monotonous_bin`` step.
        init_bins: Forwarded to ``Combiner.fit`` as ``n_bins``.
        init_min_samples: Forwarded to ``Combiner.fit`` as ``min_samples``.
        init_method: Forwarded to ``Combiner.fit`` as ``method``.
        **kwargs: Extra keyword arguments forwarded to ``Combiner.fit``.

    Returns:
        A ``(bind, ivd)`` tuple of dicts keyed by column name: ``bind`` holds
        the per-column ``woebin`` result and ``ivd`` the first unique value of
        its ``'total_iv'`` column.
    """
    # Explicit None/length checks instead of truthiness: a pandas Index (or
    # numpy array) raises ValueError when evaluated as a boolean.
    if cols is None or len(cols) == 0:
        cols = df.columns.difference([target]).tolist()
    else:
        cols = list(cols)

    if specials is not None and len(specials) > 0:
        specials = {k: specials for k in cols}

    # A set gives O(1) membership tests inside the loop.
    skip_mono = set() if non_mono_cols is None else set(non_mono_cols)

    bind, ivd = dict(), dict()
    t0 = time.process_time()

    for col in cols:
        # Arguments shared by both binning paths.
        woe_args = dict(dt=df, x=col, y=target, special_values=specials,
                        bin_num_limit=bin_num_limit, count_distr_limit=count_distr_limit,
                        method=sc_method, print_info=False)

        if col not in skip_mono:
            # Initial split points, then adjusted by monotonous_bin before
            # being imposed on woebin.
            c = Combiner()
            c.fit(X=df[col], y=df[target], n_bins=init_bins, min_samples=init_min_samples,
                  method=init_method, **kwargs)
            init_points = c.export()[col]
            woe_args['breaks_list'] = monotonous_bin(df=df, col=col, target=target,
                                                     cutOffPoints=init_points,
                                                     special_values=specials)

        bind[col] = woebin(**woe_args)[col]
        ivd[col] = bind[col]['total_iv'].unique()[0]

    # process_time() already returns seconds (CPU time); the previous
    # "* 100 / 60" scaling made the reported number wrong.
    elapsed = time.process_time() - t0
    print(f'there are bing {len(cols)} using {elapsed:.2f} seconds')
    return bind, ivd
예제 #2
0

def combine(data, target, columns=None, exclude=None):  # fine-grained binning
    """Fit, print and plot bins for each selected column, one column at a time.

    For every column in ``columns`` that is not in ``exclude``, the
    module-level ``comb`` object is refit on that column plus the target, its
    exported bins are printed, and the binned data is drawn with ``bin_plot``.

    Args:
        data: Frame containing the feature columns and the target column.
        target: Name of the target column.
        columns: Column labels to process (list, pandas Index or Series).
            ``None`` means no columns, so the call is a no-op.
        exclude: Column labels to skip. ``None`` means skip nothing.

    Returns:
        None. Output is printed and plotted.
    """
    # None defaults replace the shared mutable [] defaults. Wrapping in
    # pd.Index lets plain lists work too; a bare list has no .isin and made
    # the old default crash.
    candidates = pd.Index([] if columns is None else columns)
    skipped = [] if exclude is None else exclude

    for i in candidates[~candidates.isin(skipped)]:
        data_i = pd.concat([data[i], data[target]], axis=1)
        # NOTE: this refits the shared module-level `comb` on every iteration,
        # replacing whatever it was fitted on before.
        comb.fit(data_i, y=target, method='chi', min_samples=0.1)
        bins = comb.export()
        print(bins)
        data_c = comb.transform(data_i, labels=True)
        bin_plot(data_c, x=i, target=target)
        plt.show()


# combine(train_s, target='loan_status', columns=columns, exclude=['loan_status'])

# Manual bin definitions applied on top of the fitted bins: grouped category
# labels for 'emp_length', numeric split points for the other two features.
rules = {
    'emp_length': [
        ['< 1 year'],
        ['1 year', '2 years', '3 years'],
        ['4 years', '5 years', '6 years', '7 years', '8 years'],
        ['9 years', '10+ years'],
    ],
    'percent_bc_gt_75': [11.1, 25.9, 52.0],
    'avg_cur_bal': [6515.0, 10622.0, 19486.0, 36453.0],
}

# Fit the shared combiner on the training frame, then apply the manual rules
# to the features listed above.
comb.fit(train_s, y='loan_status', method='chi', min_samples=0.1)
comb.set_rules(rules)

# Transform train first, then test, with the same fitted combiner.
train_b, test_b = (comb.transform(frame, labels=True) for frame in (train_s, test_s))

# Plot each binned feature against the original target column, skipping the
# 'split' and 'loan_status' columns.
plot_mask = ~columns.isin(['split', 'loan_status'])
for i in columns[plot_mask]:
    data_i = pd.concat([train_b[i], train_s['loan_status']], axis=1)
    bin_plot(data_i, x=i, target='loan_status')
    plt.show()

print('分箱完成', '\n' * 2)