# Imports assumed by the snippets on this page.
import os
import pickle
import time

import numpy as np
import pandas as pd

import woe.config as config
import woe.eval as eval
import woe.feature_process as fp


def process_woe_trans(in_data_path=None,
                      rst_path=None,
                      out_path=None,
                      config_path=None):
    # Load the binning/variable configuration together with the training data.
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    for var in [
            tmp for tmp in cfg.bin_var_list
            if tmp in list(cfg.dataset_train.columns)
    ]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1

    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(cfg.dataset_train.columns)
    ]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # Load the pickled WoE rules produced during training.
    with open(rst_path, 'rb') as rst_file:
        rst = pickle.load(rst_file)

    # Training dataset Woe Transformation
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(
            cfg.dataset_train[r.var_name], r)

    cfg.dataset_train.to_csv(out_path)
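
# A minimal, hypothetical call of the variant above. The file paths are
# placeholders, and the pickled WoE rules at rst_path are assumed to have been
# produced by a prior training run.
if __name__ == '__main__':
    process_woe_trans(in_data_path='data/train.csv',
                      rst_path='model/woe_rule.pkl',
                      out_path='data/train_woe.csv',
                      config_path='config/config.csv')
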
def process_woe_trans(cfg=None,
                      in_data_path=None,
                      rst_path=None,
                      out_path=None):
    # config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model.csv'
    # data_path = in_data_path
    # cfg = config.config()
    # cfg.load_file(config_path, data_path)

    dataset = pd.read_csv(in_data_path)
    for var in [
            tmp for tmp in cfg.bin_var_list if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = -1

    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    with open(rst_path, 'rb') as rst_file:
        rst = pickle.load(rst_file)

    # Apply only the first pickled WoE rule to the dataset.
    r = rst[0]
    dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)
    dataset.to_csv(out_path)
    print('%s\tSUCCESS EXPORT FILE: \n%s' %
          (time.asctime(time.localtime(time.time())), out_path))
def process_woe_trans(in_data_path=None, rst_path=None):
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model_201705.csv'
    data_path = in_data_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)

    cfg.dataset_train = cfg.dataset_train.rename(columns={'cs_cpd':'cpd'}) # rename
    # dataset['raw_cs_cpd'] = dataset['cs_cpd']

    for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1

    for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    with open(rst_path, 'rb') as rst_file:
        rst = pickle.load(rst_file)

    # Training dataset Woe Transformation
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(cfg.dataset_train[r.var_name], r)

    return cfg.dataset_train
Example #4
def cal_iv(df, cate_vars, cont_vars, target):
    #%% WoE binning, IV calculation and WoE transform.
    # Note: df_woe aliases df, so the transform overwrites the input columns.
    df_woe = df
    civ_list = []
    n_positive = sum(df[target])
    n_negtive = len(df) - n_positive

    # Discrete (categorical) variables.
    for var in cate_vars:
        civ = fp.proc_woe_discrete(df, var, n_positive, n_negtive,
                                   0.05 * len(df), alpha=0.05)
        civ_list.append(civ)
        df_woe[var] = fp.woe_trans(df[var], civ)

    # Continuous variables.
    for var in cont_vars:
        civ = fp.proc_woe_continuous(df, var, n_positive, n_negtive,
                                     0.05 * len(df), alpha=0.05)
        civ_list.append(civ)
        df_woe[var] = fp.woe_trans(df[var], civ)

    civ_df = eval.eval_feature_detail(civ_list, 'output_feature_detail_0927.csv')
    df_iv = civ_df[['var_name', 'iv']].drop_duplicates()
    return df_iv.sort_values('iv', ascending=False)
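
# Hypothetical usage of cal_iv above; the file path and column names are
# invented, and the label column is assumed to be named 'target' with values
# in {0, 1}, as the woe package expects.
df = pd.read_csv('train.csv')
iv_table = cal_iv(df,
                  cate_vars=['gender', 'province'],
                  cont_vars=['age', 'income'],
                  target='target')
print(iv_table.head())
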
def get_iv(df, cols, target, outputfile='./data/feature_detail_iv_list.csv'):
    import woe.feature_process as fp
    import woe.eval as eval
    # proc_woe_discrete and proc_woe_continuous compute the WoE for discrete
    # and continuous variables respectively. They take the same arguments:
    # proc_woe_discrete(df, var, global_bt, global_gt, min_sample, alpha=0.01)
    # proc_woe_continuous(df, var, global_bt, global_gt, min_sample, alpha=0.01)
    # Arguments:
    # df: DataFrame to compute WoE on; must contain a 'target' column with values in {0, 1}
    # var: name of the variable to compute WoE for
    # global_bt: global bad total, i.e. the number of positive samples in df
    # global_gt: global good total, i.e. the number of negative samples in df
    # min_sample: minimum number of samples per bin, usually 5% of the total sample size
    # alpha: criterion for the automatic binning, default 0.01; a split is made
    #        if iv_with_split > iv_without_split * (1 + alpha)
    data = df.copy()
    data_woe = data
    data_woe.rename(columns={target: 'target'}, inplace=True)
    # Collects the WoE/IV information of every variable.
    civ_list = []
    n_positive = sum(data['target'])
    n_negtive = len(data) - n_positive
    for column in list(cols):
        # Discrete columns go through proc_woe_discrete, numeric ones through proc_woe_continuous.
        if str(data[column].dtypes) in ('object', 'category'):
            civ = fp.proc_woe_discrete(data,
                                       column,
                                       n_positive,
                                       n_negtive,
                                       0.05 * len(data),
                                       alpha=0.05)
        else:
            civ = fp.proc_woe_continuous(data,
                                         column,
                                         n_positive,
                                         n_negtive,
                                         0.05 * len(data),
                                         alpha=0.05)
        civ_list.append(civ)
        data_woe[column] = fp.woe_trans(data[column], civ)

    civ_df = eval.eval_feature_detail(civ_list, outputfile)
    # Optionally drop variables whose IV is too small:
    #     iv_thre = 0.001
    #     iv = civ_df[['var_name', 'iv']].drop_duplicates()
    #     x_columns = iv.var_name[iv.iv > iv_thre]
    return civ_df
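
# Minimal sketch exercising the parameter conventions documented inside get_iv:
# a synthetic frame with a binary 'target' column, one continuous feature,
# global bad/good totals, a 5% minimum bin size and alpha=0.05. The data and
# names below are invented for illustration.
demo = pd.DataFrame({'target': np.random.randint(0, 2, 1000),
                     'score': np.random.rand(1000)})
bad_total = demo['target'].sum()
good_total = len(demo) - bad_total
civ = fp.proc_woe_continuous(demo, 'score', bad_total, good_total,
                             0.05 * len(demo), alpha=0.05)
demo['score_woe'] = fp.woe_trans(demo['score'], civ)
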
Example #6
def process_woe_trans(cfg=None, rst=None, dataset=None, out_path=None):
    # fill null
    for var in [
            tmp for tmp in cfg.bin_var_list if tmp in list(dataset.columns)
    ]:
        dataset.loc[dataset[var].isnull(), (var)] = -1

    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(dataset.columns)
    ]:
        dataset.loc[dataset[var].isnull(), (var)] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    for r in rst:
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path)
def process_woe_trans(in_data_path=None, rst_path=None, out_path=None):
    print(time.asctime(time.localtime(time.time())), 'load config file')
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model.csv'
    data_path = in_data_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)

    print(time.asctime(time.localtime(time.time())), 'fill na')
    dataset = pd.read_csv(in_data_path)

    print(time.asctime(time.localtime(time.time())), 'fill na continuous variables')
    for var in [
            tmp for tmp in cfg.bin_var_list if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = -1

    print(time.asctime(time.localtime(time.time())), 'fill na discrete variables')
    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = 'missing'

    print(time.asctime(time.localtime(time.time())), 'change feature dtypes')
    fp.change_feature_dtype(dataset, cfg.variable_type)

    print(time.asctime(time.localtime(time.time())), 'load woe rule')
    with open(rst_path, 'rb') as rst_file:
        rst = pickle.load(rst_file)

    # Training dataset WoE transformation.
    for r in rst:
        print('woe trans:', r.var_name)
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path, index=False)
    print('%s\tSUCCESS EXPORT FILE: \n%s' %
          (time.asctime(time.localtime(time.time())), out_path))
Example #8
# The fragment below sits inside a per-column loop; the setup it relies on
# (data, data_woe, info_value_list, n_positive, n_negtive) is assumed to be
# defined as in the other snippets on this page.
for column in list(data.columns[1:]):
    if data[column].dtypes == 'object':
        info_value = fp.proc_woe_discrete(data,
                                          column,
                                          n_positive,
                                          n_negtive,
                                          0.05 * len(data),
                                          alpha=0.05)
    else:
        info_value = fp.proc_woe_continuous(data,
                                            column,
                                            n_positive,
                                            n_negtive,
                                            0.05 * len(data),
                                            alpha=0.05)
    info_value_list.append(info_value)
    data_woe[column] = fp.woe_trans(data[column], info_value)

info_df = eval.eval_feature_detail(info_value_list, './dataDump/woe_info.csv')

# Drop variables whose IV is too small.
iv_threshold = 0.001
iv = info_df[['var_name', 'iv']].drop_duplicates()
x_columns = list(iv.var_name[iv.iv > iv_threshold])

data_woe = data_woe[x_columns]
data_woe.to_csv('./dataDump/data_woe.csv')

labels = np.array(data.iloc[:, 0]).reshape(data.shape[0], -1)
data_train = np.array(data_woe)

# Configure input
Example #9
        n_positive = sum(data['target'])
        n_negtive = len(data) - n_positive
        for column in list(data.columns[1:]):
            # if data[column].dtypes == 'object':
            #     info_value = fp.proc_woe_discrete(
            #         data, column, n_positive, n_negtive, 0.05*len(data), alpha=0.05)
            # else:

            info_value = fp.proc_woe_continuous(data,
                                                column,
                                                n_positive,
                                                n_negtive,
                                                0.05 * len(data),
                                                alpha=0.05)
            info_value_list.append(info_value)
            data_woe[column] = fp.woe_trans(data[column], info_value)

        os.makedirs('./dataDump/', exist_ok=True)
        info_df = eval.eval_feature_detail(info_value_list,
                                           './dataDump/woe_info.csv')

        os.makedirs('./model/', exist_ok=True)

        with open('./model/woe_info.pkl', 'wb') as fw:
            pickle.dump(info_value_list, fw)

        # Drop variables whose IV is too small.
Example #10
    if alldata[i].dtypes == 'object':
        civ = fp.proc_woe_discrete(alldata,
                                   i,
                                   n_positive,
                                   n_negtive,
                                   0.05 * len(alldata),
                                   alpha=0.05)
    else:
        civ = fp.proc_woe_continuous(alldata,
                                     i,
                                     n_positive,
                                     n_negtive,
                                     0.05 * len(alldata),
                                     alpha=0.05)
    civ_list.append(civ)
    alldata[i] = fp.woe_trans(alldata[i], civ)
    civ_df = eval.eval_feature_detail(civ_list)
    iv_thre = 0.001
    iv = civ_df[['var_name', 'iv']].drop_duplicates()  # IV per feature, used to gauge feature importance
'''

# 3. Combined features
# Build combined features for the alarm data from the risk rules supplied by the risk-control team.
# Idea: check whether several kinds of alarm records occurred within a 15-day window.
# 1. If low-battery, power-cut and offline alarms all occurred in the same period,
#    add a new field combineWarn1.
# Flag rows where low-battery, power-cut and offline alarms all occurred.
alldata['combineWarn1'] = ((alldata['低电总报警次数'] != 0)
                           & (alldata['断电报警次数'] != 0)
                           & (alldata['离线超时报警次数'] != 0)).astype(int)