Example #1
    def AllWoeCals(self, vrs=None):
        """
        Compute WOE/IV for every registered feature.

        `vrs` is a version tag; when given, the results are persisted as
        versioned JSON files.
        """
        ivs = {}
        # tqdm's context manager closes the progress bar on normal exit and on
        # KeyboardInterrupt, so no explicit t.close() is needed.
        with tqdm(self.ftrs.keys()) as t:
            for i in t:
                self.setTgt(self.data[[i, 'label']])
                if self.ftrs[i] == 'str':
                    self.strWoe_cal()
                elif self.ftrs[i] in ['int', 'float']:
                    self.woe_cal()
                elif isinstance(self.ftrs[i], dict):
                    # Map raw string values to ordinals before the numeric WOE pass.
                    self._setStrValue(self.ftrs[i], ifraise=False)
                    self.woeDetail[i]['str2orders'] = self.ftrs[i]
                    self.woe_cal()

                ivs[i] = self.getIVinfo()

        if vrs is not None:

            tools.putFile(self.path + '/feature_process_methods/IVstat',
                          'IVs_' + vrs + '.json', ivs)
            tools.putFile(self.path + '/feature_process_methods/IVstat',
                          'woeDetail_' + vrs + '.json', self.woeDetail)
Example #2
            'type_1,type_2': 0
        }
    }
    smy['woeCal']['ft_lbs_dis_label'] = {
        'type_info': {
            'd0': 1,
            'd1_300': 2,
            'd301_800': 3,
            'd801_2500': 4,
            'd2501_8000': 5,
            'd8001_20000': 6,
            'd20000_': 7
        }
    }
    #js_smy = json.dumps(smy)
    tools.putFile(path + '/' + version, 'process_methods.json', smy)

if ifselect:
    #suggestion: run a logistic regression on non-WOE-encoded features to check model performance
    #set the logistic-regression parameters
    #keep a held-out test set
    prc_methods = tools.getJson(path + '/' + version + '/process_methods.json')
    data, oot, data_lb, oot_lb = train_test_split(raw_data.drop('label',
                                                                axis=1),
                                                  raw_data['label'],
                                                  test_size=0.2,
                                                  random_state=rnd_seed)
    params = {'ifconst': True, 'ifnull': True}
    print(
        '------------------------------------IV calculation---------------------------------------'
    )
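
    # Sketch (assumption, not the omitted original code): fit the logistic
    # regression described above with statsmodels; 'ifconst' suggests adding an
    # intercept and 'ifnull' some missing-value handling (here a plain fillna).
    import statsmodels.api as sm
    from sklearn.metrics import roc_auc_score

    X_train = sm.add_constant(data.fillna(0))   # assumes numeric features only
    X_oot = sm.add_constant(oot.fillna(0))
    lr = sm.Logit(data_lb, X_train).fit(disp=0)
    print('OOT AUC:', roc_auc_score(oot_lb, lr.predict(X_oot)))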
Example #3
        elif k in to_drop_list:
            smy['toDrop'].append({k: 'no feature'})
        elif js[k]['dist'] <= 1:
            smy['toDrop'].append({k: 'unique_value'})
        elif js[k]['type'] == 'str' and js[k]['dist'] > 30:
            smy['toDrop'].append({k: 'too much chars'})
        elif raw_data[k].isnull().sum() / total_data > 0.98:
            smy['toDrop'].append({k: 'too much missing'})
        elif js[k]['type'] == 'str':
            smy['str_col'].append(k)
        elif js[k]['type'] == 'int':
            smy['int_col'].append(k)
        elif js[k]['type'] == 'float':
            smy['float_col'].append(k)

    tools.putFile(path, 'summary.json', smy)

if ftr_stat:
    #read in the raw data
    print("Reading related information...")
    smy = tools.getJson(path + '/summary.json')
    toDrop = smy.get('toDrop')
    toDropList = [
        list(a.keys())[0] for a in toDrop
        if list(a.values())[0] != 'no feature'
    ]
    #ids = smy.get('ids')
    str_col = smy.get('str_col')
    int_col = smy.get('int_col')
    float_col = smy.get('float_col')
    #toOneHot = smy.get('toOneHot')
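
    # Illustrative sketch (not in the original): consume the lists just loaded.
    # Note toDropList keeps only data-quality rejects, because entries whose
    # reason is 'no feature' are filtered out above. Assumes raw_data is loaded.
    kept = raw_data.drop(columns=[c for c in toDropList if c in raw_data.columns])
    print(len(str_col), 'str /', len(int_col), 'int /', len(float_col), 'float columns kept')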
Example #4
def ft_type_check(path, output, header=True, size_c=1000):
    """
    Infer the type of each feature in a sample file.
    """
    fsize = os.path.getsize(path) / 1024**2
    if fsize > size_c:
        # For files larger than size_c MB, only the first ~2000 rows are read.
        warnings.warn('sample data is used to assess data type')
        with open(path, 'r') as f:
            txt = [
                re.sub(r'[\t,|]', ',', a).strip().split(',')
                for a in f.readlines()[:2000 + int(header)]
            ]
        if header:
            hds = txt[0]
            vls = txt[1:]
        else:
            hds = ['f' + str(a) for a in range(len(txt[0]))]
            vls = txt
        all_data = pd.DataFrame(vls, columns=hds)
        all_data = all_data.replace({'': None})
    else:
        if header:
            all_data = pd.read_csv(path, sep=',', header=0, engine='python')
        else:
            all_data = pd.read_csv(path, sep=',', header=None, engine='python')
            all_data.columns = [
                'f' + str(a) for a in list(all_data.columns.values)
            ]

    if all_data.shape[1] == 1:
        warnings.warn('only one column parsed; the delimiter may be wrong')
    hds = list(all_data.columns.values)

    rlts = {}
    for i in hds:
        tmp = all_data[i].dropna()
        try:
            # Numeric-castable columns: treat as float when there are many
            # distinct values, otherwise as int-like categorical.
            tmp = tmp.astype(float)
            unq = len(tmp.unique())
            if unq > 100:
                rlts[i] = {'type': 'float', 'dist': len(all_data[i].unique())}
            else:
                rlts[i] = {'type': 'int', 'dist': len(all_data[i].unique())}
        except (ValueError, TypeError):
            rlts[i] = {'type': 'str', 'dist': len(all_data[i].unique())}

    if fsize > size_c:
        nm = 'type_info_sample.json'
    else:
        nm = 'type_info.json'

    tools.putFile(output, nm, rlts)

    return rlts
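
# Usage sketch (hypothetical file names): files above size_c MB are sampled
# (first ~2000 rows) and the result lands in type_info_sample.json instead of
# type_info.json.
rlts = ft_type_check('./raw_data.csv', './output', header=True, size_c=1000)
print(list(rlts.items())[:3])   # e.g. [('age', {'type': 'int', 'dist': 57}), ...]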
    
Example #5
if if_gnrt_smy:
    smy = {'undo': [], 'fill': {}, 'cap': {}, 'var2char': {}, 'onehot': {}, 'woeCal': {}}
    for i in int_col + float_col:
        smy['undo'] += [i]

    """
    Special handling for string-valued features
    """
    smy['var2char']['ft_tag_age'] = {'0-17': 1, '18-24': 2, '25-34': 3, '35-44': 4, '45+': 5}
    smy['var2char']['ft_gz_grey_list'] = {
        'micro_loan_5_': 2, 'micro_loan_3_4': 1,
        'micro_loan_5_,type_1': 2, 'micro_loan_5_,type_2': 2,
        'micro_loan_3_4,type_1': 1, 'micro_loan_3_4,type_2': 1,
        'type_1': 0, 'type_2': 0, 'type_1,type_2': 0
    }
    smy['var2char']['ft_lbs_dis_label'] = {'d0': 1, 'd1_300': 2, 'd301_800': 3, 'd801_2500': 4,
                                           'd2501_8000': 5, 'd8001_20000': 6, 'd20000_': 7}
    #js_smy = json.dumps(smy)
    tools.putFile(path + '/' + version, 'process_methods.json', smy)

if ifselect:
    #Weixin (维信) method
    #random feature selection, evaluated mainly by gain
    #uses xgboost
    #features generally get no special treatment, apart from capping
    #computed by randomly drawing feature subsets
    #an OOT (out-of-time) set must be held out
    model_params = {
        #'booster': 'gbtree',
        'objective': 'binary:logistic',  # binary classification objective
        #'eval_metric': 'auc',
        #'num_class': 10,  # number of classes, used together with multi:softmax
        'gamma': 0.2,  # post-pruning control; larger is more conservative, typically 0.1 or 0.2
        'max_depth': 2,  # tree depth; deeper trees overfit more easily
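    }  # remaining model_params elided in the excerpt; closed here for the sketch

    # Sketch (assumption, not the original): score one random feature subset
    # mainly by gain, as the comments above outline. X_train / y_train are
    # hypothetical numeric training data with at least 20 columns.
    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    cols = list(rng.choice(list(X_train.columns), size=20, replace=False))
    dtrain = xgb.DMatrix(X_train[cols], label=y_train)
    bst = xgb.train(model_params, dtrain, num_boost_round=50)
    gain = bst.get_score(importance_type='gain')   # gain-based importance
    print(sorted(gain.items(), key=lambda kv: -kv[1])[:10])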
Example #6
        'int_col': [],
        'float_col': [],
        'str_col': [],
        'toDrop': []
    }
    total_data = len(raw_data)
    try:
        js = tools.getJson(path + '/' + 'type_info.json')
    except Exception:
        # fall back to the sampled type info if the full file is missing
        js = tools.getJson(path + '/' + 'type_info_sample.json')
    print('generating summary')
    for k, v in tqdm(js.items()):
        if k == smy['label'] or k == smy['dayno']:
            continue
        elif k in to_drop_list:
            smy['toDrop'].append({k: 'no feature'})
        elif js[k]['dist'] <= 1:
            smy['toDrop'].append({k: 'unique_value'})
        elif js[k]['type'] == 'str' and js[k]['dist'] > 30:
            smy['toDrop'].append({k: 'too much chars'})
        elif raw_data[k].isnull().sum() / total_data > 0.98:
            smy['toDrop'].append({k: 'too much missing'})
        elif js[k]['type'] == 'str':
            smy['str_col'].append(k)
        elif js[k]['type'] == 'int':
            smy['int_col'].append(k)
        elif js[k]['type'] == 'float':
            smy['float_col'].append(k)

    tools.putFile(path, 'summary.json', smy)
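
    # For reference (illustrative values), the resulting summary.json looks like:
    #     {"label": "label", "dayno": "dayno",
    #      "int_col": ["ft_a"], "float_col": ["ft_b"], "str_col": ["ft_c"],
    #      "toDrop": [{"ft_d": "unique_value"}, {"ft_e": "too much missing"}]}
    # ('label' and 'dayno' are read at the top of the loop, so they are assumed
    # to be preset in smy above this excerpt).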
Example #7
                    tmp['bf'] = spurs.getIVinfo()   # IV before modification
                    spurs.setTgt(df_af)
                    spurs.woe_cal()
                    tmp['af'] = spurs.getIVinfo()   # IV after modification
                    ivs[i] = tmp

                except Exception:
                    # feature could not be WOE-encoded; record empty IVs
                    ivs[i] = {'bf': None, 'af': None}

except KeyboardInterrupt:
    t.close()
    raise
t.close()
#write out the features that are unsuitable for WOE calculation
#print(spurs.allInvalid)
tools.putFile(path + '/feature_stat', 'invalidIV_mdf.json', spurs.getInvalid())
tools.putFile(path + '/feature_stat', 'misStat_mdf.json', mis)
tools.putFile(path + '/feature_stat', 'ivStat_mdf.json', ivs)
tools.putFile(path + '/feature_stat', 'ksStat_mdf.json', kses)
tools.putFile(path + '/feature_stat', 'tvStat_mdf.json', tvs)
tools.putFile(path + '/feature_stat', 'psiStat_mdf.json', psis)

final = pd.DataFrame(mis).T
tmp = pd.DataFrame(ivs).T
tmp.columns = ['iv_' + str(a) for a in tmp.columns]
final = pd.merge(left=final,
                 right=tmp,
                 left_index=True,
                 right_index=True,
                 how='outer')
tmp = pd.DataFrame(kses).T