def process_woe_trans(in_data_path=None,rst_path=None):
    """Apply stored WOE rules to the dataset loaded from ``in_data_path``.

    Null values are replaced before the transformation: ``-1`` for the
    variables in ``cfg.bin_var_list`` and ``'missing'`` for the variables in
    ``cfg.discrete_var_list`` (only those present as columns).

    :param in_data_path: data file path handed to ``cfg.load_file``
    :param rst_path: path of the pickle file holding the WOE rule objects;
        every rule must expose a ``var_name`` attribute
    :return: ``cfg.dataset_train`` with each rule's column replaced by the
        result of ``fp.woe_trans``
    """
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model_201705.csv'
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    cfg.dataset_train = cfg.dataset_train.rename(columns={'cs_cpd':'cpd'})

    # Filling nulls never adds or removes columns, so list them only once.
    columns = list(cfg.dataset_train.columns)

    for var in [tmp for tmp in cfg.bin_var_list if tmp in columns]:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1

    for var in [tmp for tmp in cfg.discrete_var_list if tmp in columns]:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # The context manager closes the file even when unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; only load rule files that
    # come from a trusted training run.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Replace every rule's column with its WOE-transformed values.
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(cfg.dataset_train[r.var_name], r)

    return cfg.dataset_train
def process_woe_trans(in_data_path=None,
                      rst_path=None,
                      out_path=None,
                      config_path=None):
    """Apply stored WOE rules to a dataset and write the result as CSV.

    Null values are replaced before the transformation: ``-1`` for the
    variables in ``cfg.bin_var_list`` and ``'missing'`` for the variables in
    ``cfg.discrete_var_list`` (only those present as columns).

    :param in_data_path: data file path handed to ``cfg.load_file``
    :param rst_path: path of the pickle file holding the WOE rule objects;
        every rule must expose a ``var_name`` attribute
    :param out_path: destination passed to ``DataFrame.to_csv``
    :param config_path: configuration file path handed to ``cfg.load_file``
    :return: None, the transformed dataset is written to ``out_path``
    """
    cfg = config.config()
    cfg.load_file(config_path, in_data_path)

    # Filling nulls never adds or removes columns, so list them only once.
    columns = list(cfg.dataset_train.columns)

    for var in [tmp for tmp in cfg.bin_var_list if tmp in columns]:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), var] = -1

    for var in [tmp for tmp in cfg.discrete_var_list if tmp in columns]:
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              var] = 'missing'

    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # The context manager closes the file even when unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; only load rule files that
    # come from a trusted training run.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Replace every rule's column with its WOE-transformed values.
    for r in rst:
        cfg.dataset_train[r.var_name] = fp.woe_trans(
            cfg.dataset_train[r.var_name], r)

    cfg.dataset_train.to_csv(out_path)
def process_woe_trans(cfg=None,
                      in_data_path=None,
                      rst_path=None,
                      out_path=None):
    """Apply the first stored WOE rule to a CSV dataset and export it.

    The dataset is read with ``pd.read_csv``; nulls are replaced with ``-1``
    for variables in ``cfg.bin_var_list`` and with ``'missing'`` for variables
    in ``cfg.discrete_var_list`` (only those present as columns). Only the
    first element of the unpickled object is used as a rule.

    :param cfg: already loaded configuration object providing
        ``bin_var_list``, ``discrete_var_list`` and ``variable_type``
    :param in_data_path: path of the CSV file to transform
    :param rst_path: path of the pickle file; its element ``[0]`` must be a
        rule object exposing ``var_name``
    :param out_path: destination passed to ``DataFrame.to_csv``
    :return: None, the transformed dataset is written to ``out_path``
    """
    dataset = pd.read_csv(in_data_path)

    # Filling nulls never adds or removes columns, so list them only once.
    columns = list(dataset.columns)

    for var in [tmp for tmp in cfg.bin_var_list if tmp in columns]:
        dataset.loc[dataset[var].isnull(), var] = -1

    for var in [tmp for tmp in cfg.discrete_var_list if tmp in columns]:
        dataset.loc[dataset[var].isnull(), var] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # The context manager closes the file even when unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; only load rule files that
    # come from a trusted training run.
    with open(rst_path, 'rb') as rule_file:
        rst = pickle.load(rule_file)

    # Only the first stored element is applied as a rule.
    r = rst[0]
    dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)
    dataset.to_csv(out_path)
    print('%s\tSUCCESS EXPORT FILE: \n%s' %
          (time.asctime(time.localtime(time.time())), out_path))
# Example #4
# 0
def single_process(var):
    """Prepare the shared target statistics and fit the WOE rule of ``var``.

    Reads the target CSV, coerces its dtypes, stores the target sum, the
    remaining row count and a 5% minimum sample size on the module-level
    ``cfg``, then delegates to ``process_train_woe``.

    :param var: feature name forwarded as ``feature_name``
    :return: whatever ``process_train_woe`` returns
    """
    target = pd.read_csv(
        r'E:\ScoreCard\cs_model\cs_m1_pos_model_daily\raw_data\dataset_split_by_cols\target.csv'
    )
    fp.change_feature_dtype(target, cfg.variable_type)

    row_count = target.shape[0]
    cfg.global_bt = sum(target.target)
    cfg.global_gt = row_count - cfg.global_bt
    cfg.min_sample = int(row_count * 0.05)

    return process_train_woe(cfg, feature_name=var, target=target)
# Example #5
# 0
def process_train_woe(cfg=None, feature_name=None, target=None):
    print 'run into process_train_woe: \n', feature_name, time.asctime(
        time.localtime(time.time()))
    feature_path = 'E:\\ScoreCard\\cs_model\\cs_m1_pos_model_daily\\raw_data\\dataset_split_by_cols\\'
    feature_path = feature_path + feature_name + '.csv'
    feature = pd.read_csv(feature_path)
    rst = []
    if feature.columns[0] in list(cfg.bin_var_list):
        feature.loc[feature[feature.columns[0]].isnull()] = -1
        fp.change_feature_dtype(feature, cfg.variable_type)
        dataset = pd.merge(feature.reset_index(),
                           target.reset_index()).drop('index', axis=1)
        var = feature.columns[0]
        del feature
        del target
        riv = fp.proc_woe_continuous(dataset,
                                     var,
                                     cfg.global_bt,
                                     cfg.global_gt,
                                     cfg.min_sample,
                                     alpha=0.05)
    else:  # process woe transformation of discrete variables
        print 'process woe transformation of discrete variables: \n', time.asctime(
            time.localtime(time.time()))
        feature.loc[feature[feature.columns[0]].isnull()] = 'missing'
        fp.change_feature_dtype(feature, cfg.variable_type)
        dataset = pd.merge(feature.reset_index(),
                           target.reset_index()).drop('index', axis=1)
        var = feature.columns[0]
        del feature
        del target
        riv = fp.proc_woe_discrete(dataset,
                                   var,
                                   cfg.global_bt,
                                   cfg.global_gt,
                                   cfg.min_sample,
                                   alpha=0.05)

    rst.append(riv)
    feature_detail = eval.eval_feature_detail(rst)

    rst_path = 'E:\\ScoreCard\\cs_model\\cs_m1_pos_model_daily\\gendata\\WOE_Rule\\'
    rst_path = rst_path + feature_name + '.pkl'

    result = (riv, feature_detail)
    output = open(rst_path, 'wb')
    pickle.dump(result, output)
    output.close()

    return result
# Example #6
# 0
def process_woe01_trans(cfg=None, rst=None, dataset=None, out_path=None):
    """Fill nulls, apply ``woe01_trans`` for every rule and export to CSV.

    :param cfg: configuration object providing ``bin_var_list``,
        ``discrete_var_list`` and ``variable_type``
    :param rst: iterable of rule objects exposing ``var_name``
    :param dataset: DataFrame transformed in place
    :param out_path: destination passed to ``DataFrame.to_csv``
    :return: None, the transformed dataset is written to ``out_path``
    """
    # Variables from bin_var_list: nulls become -1.
    binned_cols = [
        name for name in cfg.bin_var_list if name in list(dataset.columns)
    ]
    for col in binned_cols:
        null_rows = dataset[col].isnull()
        dataset.loc[null_rows, col] = -1

    # Variables from discrete_var_list: nulls become 'missing'.
    discrete_cols = [
        name for name in cfg.discrete_var_list
        if name in list(dataset.columns)
    ]
    for col in discrete_cols:
        null_rows = dataset[col].isnull()
        dataset.loc[null_rows, col] = 'missing'

    fp.change_feature_dtype(dataset, cfg.variable_type)

    # Overwrite each rule's column with its transformed values.
    for rule in rst:
        transformed = woe01_trans(dataset[rule.var_name], rule)
        dataset[rule.var_name] = transformed

    dataset.to_csv(out_path)
def process_woe_trans(in_data_path=None, rst_path=None, out_path=None):
    print time.asctime(time.localtime(time.time())), 'load config file'
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model.csv'
    data_path = in_data_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)

    print time.asctime(time.localtime(time.time())), 'fill na'
    dataset = pd.read_csv(in_data_path)

    print time.asctime(time.localtime(
        time.time())), 'fill na continuous variables'
    for var in [
            tmp for tmp in cfg.bin_var_list if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = -1

    print time.asctime(time.localtime(
        time.time())), 'fill na discrete variables'
    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = 'missing'

    print time.asctime(time.localtime(time.time())), 'change feature dtypes'
    fp.change_feature_dtype(dataset, cfg.variable_type)

    print time.asctime(time.localtime(time.time())), 'load woe rule'
    output = open(rst_path, 'rb')
    rst = pickle.load(output)
    output.close()

    # Training dataset Woe Transformation
    for r in rst:
        print 'woe trans:', r.var_name
        dataset[r.var_name] = fp.woe_trans(dataset[r.var_name], r)

    dataset.to_csv(out_path, index=False)
    print('%s\tSUCCESS EXPORT FILE: \n%s' %
          (time.asctime(time.localtime(time.time())), out_path))
# Example #8
# 0
def process_train_woe(infile_path=None,outfile_path=None,rst_path=None):
    print 'run into process_train_woe: \n',time.asctime(time.localtime(time.time()))
    config_path = 'E:\\Code\\Python_ML_Code\\cs_model\\config\\config_cs_model_pos_m2.csv'
    data_path = infile_path
    cfg = config.config()
    cfg.load_file(config_path,data_path)
    bin_var_list = [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]

    for var in bin_var_list:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1

    # change feature dtypes
    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)
    rst = []

    # process woe transformation of continuous variables
    print 'process woe transformation of continuous variables: \n',time.asctime(time.localtime(time.time()))
    print 'cfg.global_bt',cfg.global_bt
    print 'cfg.global_gt', cfg.global_gt

    for var in bin_var_list:
        rst.append(fp.proc_woe_continuous(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05))

    # process woe transformation of discrete variables
    print 'process woe transformation of discrete variables: \n',time.asctime(time.localtime(time.time()))
    for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 'missing'
        rst.append(fp.proc_woe_discrete(cfg.dataset_train,var,cfg.global_bt,cfg.global_gt,cfg.min_sample,alpha=0.05))

    feature_detail = eval.eval_feature_detail(rst, outfile_path)

    print 'save woe transformation rule into pickle: \n',time.asctime(time.localtime(time.time()))
    output = open(rst_path, 'wb')
    pickle.dump(rst,output)
    output.close()

    return feature_detail,rst
# Example #9
# 0
# File: HereWeGo.py  Project: qian2729/woe
import pandas as pd
import woe.config as config
import woe.feature_process as fp
import woe.eval as eval

# Example script: load the bundled example config and data set, then fit
# WOE rules for the continuous variables.
# NOTE(review): `os` is used here but is not among the imports visible
# above - confirm it is imported elsewhere in the file.
# Both paths are built relative to the current working directory.
config_path = os.getcwd() + '\\woe\\examples\\config.csv'
data_path = os.getcwd() + '\\woe\\examples\\UCI_Credit_Card.csv'
cfg = config.config()
cfg.load_file(config_path, data_path)

for var in cfg.bin_var_list:
    # Replace nulls of each variable in bin_var_list with 0.
    cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0

# change feature dtypes
fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

# Collects one result of fp.proc_woe_continuous per variable.
rst = []

# process woe transformation of continuous variables
for var in cfg.bin_var_list:
    rst.append(
        fp.proc_woe_continuous(cfg.dataset_train,
                               var,
                               cfg.global_bt,
                               cfg.global_gt,
                               cfg.min_sample,
                               alpha=0.05))

# process woe transformation of discrete variables
for var in cfg.discrete_var_list:
def process_train_woe(infile_path=None, outfile_path=None, rst_path=None):
    """Resume WOE rule fitting, checkpointing the rule list after each variable.

    The rule list already stored at ``rst_path`` is loaded first; the file is
    opened for reading unconditionally, so it must already exist. Variables
    whose name appears in that stored list are skipped. After every newly
    processed variable the whole list is pickled back to ``rst_path``.

    :param infile_path: data file path handed to ``cfg.load_file``
    :param outfile_path: second argument of ``eval.eval_feature_detail``
        (presumably where the feature detail is written - confirm in eval)
    :param rst_path: pickle file used both as input (existing rules) and as
        the checkpoint that is rewritten after each variable
    :return: tuple ``(feature_detail, rst)``
    """
    print 'run into process_train_woe: \n', time.asctime(
        time.localtime(time.time()))
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_daily_model_lr.csv'
    data_path = infile_path
    cfg = config.config()
    cfg.load_file(config_path, data_path)

    # Start from the rules that a previous run already stored.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    output = open(rst_path, 'rb')
    rst = pickle.load(output)
    output.close()

    # Names of the variables that already have a stored rule. This list is
    # computed once and is not updated while new rules are appended below.
    exists_var_list = [rst[i].var_name for i in range(rst.__len__())]
    bin_var_list = [
        tmp for tmp in cfg.bin_var_list if
        tmp in list(cfg.dataset_train.columns) and tmp not in exists_var_list
    ]

    for var in bin_var_list:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = -1

    # change feature dtypes
    fp.change_feature_dtype(cfg.dataset_train, cfg.variable_type)

    # process woe transformation of continuous variables
    print 'process woe transformation of continuous variables: \n', time.asctime(
        time.localtime(time.time()))
    print 'cfg.global_bt', cfg.global_bt
    print 'cfg.global_gt', cfg.global_gt

    for var in bin_var_list:
        print var
        # When the in-memory list is non-empty it is re-read from the
        # checkpoint file before the new rule is appended.
        if rst.__len__() == 0:
            pass
        else:
            output = open(rst_path, 'rb')
            rst = pickle.load(output)
            output.close()
            print 'load'
        rst.append(
            fp.proc_woe_continuous(cfg.dataset_train,
                                   var,
                                   cfg.global_bt,
                                   cfg.global_gt,
                                   cfg.min_sample,
                                   alpha=0.05))
        # Checkpoint: rewrite the whole rule list after every variable.
        output = open(rst_path, 'wb')
        pickle.dump(rst, output)
        output.close()
        print 'dump'

    # process woe transformation of discrete variables
    print 'process woe transformation of discrete variables: \n', time.asctime(
        time.localtime(time.time()))
    for var in [
            tmp for tmp in cfg.discrete_var_list
            if tmp in list(cfg.dataset_train.columns)
            and tmp not in exists_var_list
    ]:
        print var
        # Discrete nulls are filled here, after the dtype change above.
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(),
                              (var)] = 'missing'
        # Same reload-before-append step as in the continuous loop.
        if rst.__len__() == 0:
            pass
        else:
            output = open(rst_path, 'rb')
            rst = pickle.load(output)
            output.close()
            print 'load'
        rst.append(
            fp.proc_woe_discrete(cfg.dataset_train,
                                 var,
                                 cfg.global_bt,
                                 cfg.global_gt,
                                 cfg.min_sample,
                                 alpha=0.05))
        # Checkpoint: rewrite the whole rule list after every variable.
        output = open(rst_path, 'wb')
        pickle.dump(rst, output)
        output.close()
        print 'dump'

    feature_detail = eval.eval_feature_detail(rst, outfile_path)
    return feature_detail, rst
    :return: null value,replace null value inplace
    """
    for var in [tmp for tmp in bin_var_list if tmp in list(dataset.columns)]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = continuous_filler

    for var in [
            tmp for tmp in discrete_var_list if tmp in list(dataset.columns)
    ]:
        # fill null
        dataset.loc[dataset[var].isnull(), (var)] = discrete_filler


# Fill nulls in both data sets in place; no filler values are passed, so
# fillna's own defaults apply.
fillna(df_train, cfg.bin_var_list, cfg.discrete_var_list)
fillna(df_validation, cfg.bin_var_list, cfg.discrete_var_list)
# Coerce the column dtypes of both data sets according to cfg.variable_type.
fp.change_feature_dtype(df_train, cfg.variable_type)
fp.change_feature_dtype(df_validation, cfg.variable_type)

# Variable names handed to eval.eval_feature_stability below.
candidate_var_list = [
    'avg_days', 'bptp_ratio', 'city', 'due_periods_ratio',
    'finish_periods_ratio', 'intime_pay', 'kptp_ratio', 'most_contact_3m',
    'person_app_age', 'person_sex', 'recent_contact_day', 'rej_count',
    'rpy_cn', 'seq_delay_days', 'state_sagroup', 'tot_credit_amount',
    'value_balance_ratio', 'value_income_ratio'
]

reload(eval)
eval.eval_feature_stability(civ_list,
                            df_train,
                            df_validation,
                            candidate_var_list,