def num_bin(df: pd.DataFrame, cols: list = None, target: str = 'target', specials: list = None,
            bin_num_limit: int = 5, count_distr_limit: float = 0.05, sc_method='chimerge',
            non_mono_cols: list = None, init_bins=10, init_min_samples=0.05, init_method='chi',
            **kwargs):
    """Coarse-bin numeric columns, enforce monotonic bins, and collect the results.

    For every column in ``cols`` the function calls ``woebin`` and stores the
    per-column result together with its total IV. Columns that are NOT listed
    in ``non_mono_cols`` first get initial cut points from a ``Combiner`` fit,
    which are passed through ``monotonous_bin`` and handed to ``woebin`` as
    ``breaks_list``.

    NOTE(review): ``woebin``, ``Combiner`` and ``monotonous_bin`` are defined
    or imported elsewhere in the file; the call signatures look like
    scorecardpy's ``woebin`` and toad's ``Combiner`` -- confirm.

    Args:
        df: Input data containing the feature columns and the target column.
        cols: Columns to bin. When empty or ``None``, every column of ``df``
            except ``target`` is used.
        target: Name of the target column.
        specials: Special values. When given, the same list is applied to
            every column in ``cols`` (expanded into a ``{col: specials}`` dict).
        bin_num_limit: Forwarded to ``woebin``.
        count_distr_limit: Forwarded to ``woebin``.
        sc_method: Forwarded to ``woebin`` as ``method``.
        non_mono_cols: Columns that skip the Combiner / ``monotonous_bin``
            step and are binned by ``woebin`` directly.
        init_bins: Forwarded to ``Combiner.fit`` as ``n_bins``.
        init_min_samples: Forwarded to ``Combiner.fit`` as ``min_samples``.
        init_method: Forwarded to ``Combiner.fit`` as ``method``.
        **kwargs: Extra keyword arguments forwarded to ``Combiner.fit``.

    Returns:
        A tuple ``(bind, ivd)``: ``bind`` maps each column to its ``woebin``
        result, ``ivd`` maps each column to the first unique value of that
        result's ``'total_iv'`` column.
    """
    if not cols:
        cols = df.columns.difference([target]).tolist()
    if specials:
        # Apply the same special values to every column being binned.
        specials = {k: specials for k in cols}
    if not non_mono_cols:
        non_mono_cols = []

    bind, ivd = {}, {}
    t0 = time.process_time()
    for col in cols:
        woebin_args = dict(
            dt=df, x=col, y=target, special_values=specials,
            bin_num_limit=bin_num_limit, count_distr_limit=count_distr_limit,
            method=sc_method, print_info=False,
        )
        if col not in non_mono_cols:
            # Initial split points from the Combiner, then adjusted by
            # monotonous_bin before being used as fixed breaks for woebin.
            c = Combiner()
            c.fit(X=df[col], y=df[target], n_bins=init_bins,
                  min_samples=init_min_samples, method=init_method, **kwargs)
            init_points = c.export()[col]
            woebin_args['breaks_list'] = monotonous_bin(
                df=df, col=col, target=target,
                cutOffPoints=init_points, special_values=specials,
            )
        bind[col] = woebin(**woebin_args)[col]
        ivd[col] = bind[col]['total_iv'].unique()[0]

    # Report the real elapsed CPU time in seconds. The previous expression
    # scaled the value by 100 / 60 and truncated it to an int, so the figure
    # labelled "seconds" was not a number of seconds.
    elapsed = time.process_time() - t0
    print(f'there are bing {len(cols)} using {elapsed:.2f} seconds')
    return bind, ivd
def combine(data, target, columns=None, exclude=None):
    """Fit, print and plot chi-merge bins one column at a time (fine binning).

    For every entry of ``columns`` that is not in ``exclude``, the
    module-level ``comb`` object is refit on that column plus the target,
    its exported bins are printed, and the transformed data is plotted with
    ``bin_plot``.

    NOTE(review): ``comb``, ``bin_plot`` and ``plt`` are defined or imported
    elsewhere in the file; ``comb`` is presumably a toad ``Combiner`` --
    confirm. Calling this function refits that shared object.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target: Name of the target column.
        columns: Column names to process. Any iterable of names is accepted;
            ``None`` means no columns.
        exclude: Column names to skip. ``None`` means nothing is skipped.
    """
    # None defaults avoid shared mutable default arguments, and coercing to a
    # pandas Index lets plain lists (and the default) work with ``.isin``.
    columns = pd.Index([] if columns is None else columns)
    exclude = [] if exclude is None else exclude

    for i in columns[~columns.isin(exclude)]:
        data_i = pd.concat([data[i], data[target]], axis=1)
        comb.fit(data_i, y=target, method='chi', min_samples=0.1)
        bins = comb.export()
        print(bins)
        data_c = comb.transform(data_i, labels=True)
        bin_plot(data_c, x=i, target=target)
        plt.show()


# Example of per-column exploration with the helper above:
# combine(train_s, target='loan_status', columns=columns, exclude=['loan_status'])

# Fit the shared combiner on the training set, then override selected
# columns with hand-chosen bin rules.
comb.fit(train_s, y='loan_status', method='chi', min_samples=0.1)
rules = {
    'emp_length': [
        ['< 1 year'],
        ['1 year', '2 years', '3 years'],
        ['4 years', '5 years', '6 years', '7 years', '8 years'],
        ['9 years', '10+ years'],
    ],
    'percent_bc_gt_75': [11.1, 25.9, 52.0],
    'avg_cur_bal': [6515.0, 10622.0, 19486.0, 36453.0],
}
comb.set_rules(rules)

# Apply the final binning to both the train and test sets.
train_b = comb.transform(train_s, labels=True)
test_b = comb.transform(test_s, labels=True)

# Plot each binned training feature against the target.
for i in columns[~columns.isin(['split', 'loan_status'])]:
    data_i = pd.concat([train_b[i], train_s['loan_status']], axis=1)
    bin_plot(data_i, x=i, target='loan_status')
    plt.show()
print('分箱完成', '\n' * 2)