Example #1
def Select_Feature(feature_name, threshold, train_source, data):
    # Keep only the features whose KS statistic against the label reaches
    # the threshold. (train_source is accepted but unused in this snippet.)
    keep_feature = []
    for feature in feature_name:
        ks_value = ks(data['label'], data[feature])['ks']
        if ks_value >= threshold:
            keep_feature.append(feature)
    return keep_feature
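
Every example on this page calls a project-local ks() helper that the page itself never shows. As a rough stand-in, here is a minimal sketch assuming the helper only needs to return a dict whose 'ks' key holds the two-sample Kolmogorov-Smirnov statistic between the score distributions of the two label classes; the real helper evidently returns more keys ('overdue_ratio' and 'span_list' in Example #5), so treat this purely as an illustration:

import numpy as np
from scipy.stats import ks_2samp

def ks(label, score):
    # Hypothetical stand-in for the project's ks() helper: KS distance
    # between scores of positive (label 1) and negative (label 0) samples.
    label = np.asarray(label)
    score = np.asarray(score)
    return {'ks': ks_2samp(score[label == 1], score[label == 0]).statistic}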
Example #2
def get_feature_nan_and_ks(feature_X, Y):
    # For every column, record its missing ratio and the KS statistic
    # computed on the non-missing rows only.
    feature_all = feature_X.columns
    ks_list = []
    miss_ratio_list = []
    for feature in feature_all:
        miss_ratio_list.append(
            round(feature_X[feature].isnull().sum() * 1.0 / len(feature_X), 5))
        # Pair the feature with the label and drop rows where either is missing.
        X_y = pd.DataFrame(list(zip(feature_X[feature], Y))).dropna().values
        ks_list.append(round(ks(X_y[:, 1], X_y[:, 0])['ks'], 5))
    nan_dict = dict(zip(feature_all, miss_ratio_list))
    ks_dict = dict(zip(feature_all, ks_list))
    return nan_dict, ks_dict
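
A possible way to use the two dicts together (train_df, feature_cols, and the 0.5/0.05 cut-offs below are illustrative assumptions, not from the source): drop columns that are mostly missing or carry almost no KS signal.

nan_dict, ks_dict = get_feature_nan_and_ks(train_df[feature_cols], train_df['label'])
keep = [f for f in feature_cols
        if nan_dict[f] < 0.5 and ks_dict[f] >= 0.05]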
Example #3
def feature_summary(data, feature_use, split_source=False):
    # Build three per-feature tables: missing ratio, zero ratio, and KS,
    # computed over all data and, optionally, per source.
    feature_use_nan = pd.DataFrame(feature_use)
    feature_zero = pd.DataFrame(feature_use)
    feature_ks = pd.DataFrame(feature_use)

    data_length = len(data)
    feature_use_nan_list = []
    feature_zero_list = []
    feature_ks_list = []
    for f in feature_use:
        feature_use_nan_list.append(data[f].isnull().sum() * 1.0 / data_length)
        feature_zero_list.append((data[f] == 0).sum() * 1.0 / data_length)
        ks_dict = ks(data['label'], data[f])
        feature_ks_list.append(ks_dict['ks'])
        plot_ks(ks_dict, f, 'all')
    feature_use_nan['all'] = feature_use_nan_list
    feature_zero['all'] = feature_zero_list
    feature_ks['all'] = feature_ks_list

    if split_source:
        source = data.source.unique()
        for s in source:
            data_s = data[data.source == s]
            data_s_length = len(data_s)
            feature_use_nan_list = []
            feature_zero_list = []
            feature_ks_list = []
            for f in feature_use:
                feature_use_nan_list.append(
                    data_s[f].isnull().sum() * 1.0 / data_s_length)
                feature_zero_list.append(
                    (data_s[f] == 0).sum() * 1.0 / data_s_length)
                ks_dict = ks(data_s['label'], data_s[f])
                feature_ks_list.append(ks_dict['ks'])
                plot_ks(ks_dict, f, s)
            feature_use_nan[s] = feature_use_nan_list
            feature_zero[s] = feature_zero_list
            feature_ks[s] = feature_ks_list

    return feature_use_nan, feature_zero, feature_ks
Example #4
def evaluate_cv(X_train, y_train, model, pars, fold_num=5, to_balance=False, num_round=100):
    # Accept either pandas objects or plain numpy arrays.
    try:
        X_train = X_train.values
        y_train = y_train.values
    except AttributeError:
        pass
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=310)
    ks_value_list = []
    auc_value_list = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        train_x = X_train[train_index]
        train_y = y_train[train_index]
        if to_balance:
            train_x, train_y = balance_data(train_x, train_y)
        test_x = X_train[test_index]
        test_y = y_train[test_index]
        if model == 'gbdt':
            gbdt = GBDT_Fit(train_x, train_y, pars)
            test_y_predict = GBDT_Predict(gbdt, test_x)
        elif model == 'xgb':
            xgb_model = XGBoost_Fit(train_x, train_y, pars, num_round=num_round, X_val=test_x, y_val=test_y)
            test_y_predict = XGBoost_Predict(xgb_model, test_x)
        elif model == 'lgb':
            lgb_model = LightGBM_Fit(train_x, train_y, pars, num_round=num_round, X_val=test_x, y_val=test_y)
            test_y_predict = LightGBM_Predict(lgb_model, test_x)
        elif model == 'gbdt_lr':
            gbdt_lr, model_onehot, gbdt = GBDTLR_Fit(train_x, train_y, pars)
            test_y_predict = GBDTLR_Predict(gbdt_lr, model_onehot, gbdt, test_x)
        else:
            raise ValueError('unknown model: %s' % model)

        ks_value = ks(test_y, test_y_predict)['ks']
        auc_value = roc_auc_score(test_y, test_y_predict)
        ks_value_list.append(ks_value)
        auc_value_list.append(auc_value)
        print('now fold %d , all %d folds , ks : %.3f , auc : %.3f' % (i + 1, fold_num, ks_value, auc_value))

    ks_mean = np.mean(ks_value_list)
    ks_std = np.std(ks_value_list)
    auc_mean = np.mean(auc_value_list)
    auc_std = np.std(auc_value_list)

    cv_result = 'cv | ks mean : %.3f , ks std : %.3f , auc mean : %.4f , auc std : %.4f' % (ks_mean, ks_std, auc_mean, auc_std)
    return cv_result
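
evaluate_cv dispatches to fit/predict wrapper pairs (GBDT_Fit/GBDT_Predict and so on) that this page does not define. A minimal sketch of what the 'xgb' pair might look like, assuming pars is a native xgboost parameter dict; the project's real wrappers may well differ:

import xgboost as xgb

def XGBoost_Fit(X, y, pars, num_round=100, X_val=None, y_val=None):
    # Hypothetical wrapper: train a booster, optionally watching a validation set.
    dtrain = xgb.DMatrix(X, label=y)
    evals = [(xgb.DMatrix(X_val, label=y_val), 'val')] if X_val is not None else []
    return xgb.train(pars, dtrain, num_boost_round=num_round, evals=evals)

def XGBoost_Predict(model, X):
    # Hypothetical wrapper: returns a 1-D array of scores.
    return model.predict(xgb.DMatrix(X))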
Example #5
def fill_nan(X_train, y_train, way):
    fill_nan_val = []
    for feature in X_train.columns:
        index_null = pd.isnull(X_train[feature])
        if index_null.sum() > 0:

            if way == 'dis':
                # Fill with the midpoint of the score bin whose overdue ratio
                # is closest to the overdue ratio observed on the missing rows.
                label_miss = y_train[index_null]
                miss_overdue_ratio = sum(label_miss) * 100 / (
                    float(len(label_miss)) + 10e-8)
                ks_info = ks(y_train[~index_null],
                             X_train[feature][~index_null], 20)
                delta_list = abs(ks_info['overdue_ratio'] - miss_overdue_ratio)
                span = ks_info['span_list'][delta_list.argmin()]
                # Spans look like '(a, b]' or '[a, b]'; parse both endpoints.
                try:
                    val1 = float(span.strip().split(',')[0].split('(')[1])
                except IndexError:
                    val1 = float(span.strip().split(',')[0].split('[')[1])
                val2 = float(span.strip().split(',')[1].split(']')[0])
                val = (val1 + val2) / 2.0

            elif way == 'avg':
                val = X_train[feature].mean()
            elif way == 'mid':
                val = X_train[feature].median()
            elif isinstance(way, (int, float)):
                val = way
            else:
                print('invalid input, try again')
                return None, None
        else:
            val = None

        # fillna(value=None) raises in pandas, so skip complete columns.
        if val is not None:
            X_train[feature] = X_train[feature].fillna(value=val)
        fill_nan_val.append(val)

    fill_nan_dict = dict(zip(X_train.columns, fill_nan_val))

    return X_train, fill_nan_dict
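
The returned fill_nan_dict makes it possible to reuse the training-set fill values on unseen data. A short sketch (X_test is an assumed frame with the same columns as X_train):

X_train, fill_nan_dict = fill_nan(X_train, y_train, way='avg')
# DataFrame.fillna accepts a column -> value mapping; skip columns
# that had nothing to fill (their stored value is None).
X_test = X_test.fillna(value={k: v for k, v in fill_nan_dict.items()
                              if v is not None})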
Example #6
def evaluate_stack_cv(X_train, y_train, gbdt_pars, xgb_pars, lgb_pars, stacking_model, to_balance=False, fold_num=5, stack_fold=2):
    # Accept either pandas objects or plain numpy arrays.
    try:
        X_train = X_train.values
        y_train = y_train.values
    except AttributeError:
        pass
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=310)
    ks_value_list = []
    auc_value_list = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        train_x = X_train[train_index]
        train_y = y_train[train_index]
        if to_balance:
            train_x, train_y = balance_data(train_x, train_y)
        test_x = X_train[test_index]
        test_y = y_train[test_index]

        gbdt_model_list, xgb_model_list, lgb_model_list, stacking_model = StackModel_Fit(
            train_x, train_y, gbdt_pars, xgb_pars, lgb_pars, stacking_model, stack_fold=stack_fold)

        test_y_predict = StackModel_Predict(gbdt_model_list, xgb_model_list, lgb_model_list, stacking_model, test_x)

        ks_value = ks(test_y, test_y_predict)['ks']
        auc_value = roc_auc_score(test_y, test_y_predict)
        ks_value_list.append(ks_value)
        auc_value_list.append(auc_value)
        print('now fold %d , all %d folds , ks : %.3f , auc : %.3f' % (i + 1, fold_num, ks_value, auc_value))

    ks_mean = np.mean(ks_value_list)
    ks_std = np.std(ks_value_list)
    auc_mean = np.mean(auc_value_list)
    auc_std = np.std(auc_value_list)

    cv_result = 'cv | ks mean : %.3f , ks std : %.3f , auc mean : %.4f , auc std : %.4f' % (ks_mean, ks_std, auc_mean, auc_std)
    return cv_result
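
This is invoked the same way evaluate_cv is in Example #7; a hedged sketch with the parameter dicts assumed to exist and a plain logistic regression as the stacker:

from sklearn.linear_model import LogisticRegression

cv_result = evaluate_stack_cv(X_train, y_train,
                              gbdt_pars, xgb_pars, lgb_pars,
                              stacking_model=LogisticRegression(),
                              fold_num=5, stack_fold=2)
print(cv_result)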
Example #7
cv_result = evaluate_cv(X_train,
                        y_train,
                        model='xgb',
                        pars=xgb_best_pars,
                        fold_num=5,
                        to_balance=False,
                        num_round=60)
print(cv_result)
xgb_model = XGBoost_Fit(X_train, y_train, xgb_best_pars, num_round=60)
ks_all = {}
for test_source in source_list:
    print(test_source)
    X_test, y_test = get_X_y(data, feature_use, test_source)
    pred = XGBoost_Predict(xgb_model, X_test)
    ks_result = ks(y_test, pred)
    ks_all[test_source] = ks_result

for source, ks_value in ks_all.items():
    print(source)
    print(ks_value['ks'])

gc.collect()

# GBDT
print('-------------------------GBDT Result-------------------------')
gbdt_best_pars = {
    'learning_rate': 0.2,
    'max_depth': 3,
    'n_estimators': 100,
    'subsample': 0.8,
Example #8
            feature_use_nan[s] = feature_use_nan_list
            feature_zero[s] = feature_zero_list
            feature_ks[s] = feature_ks_list

    return feature_use_nan, feature_zero, feature_ks


feature_use_nan, feature_zero, feature_ks = feature_summary(data,
                                                            feature_use,
                                                            split_source=True)

feature_use_nan.to_csv('../Data/feature_use_nan.csv')
feature_zero.to_csv('../Data/feature_zero.csv')
feature_ks.to_csv('../Data/feature_ks.csv')

# Write per-feature KS detail to a text file; the handle is named
# feature_ks_file so it no longer shadows the feature_ks DataFrame above.
feature_ks_file = open('../Data/Feature_KS_Detail.txt', 'w')
for feature in feature_use:
    ks_dict = ks(data['label'], data[feature])
    line = print_ks(ks_dict, feature)
    feature_ks_file.write(line)
    feature_ks_file.write(
        '---------------------------------------------------------------------\n'
    )
feature_ks_file.close()
'''
import seaborn as sns
import matplotlib.pyplot as plt
for i,s in enumerate(source_list):
    sns.kdeplot(pred_list[i],label=s)
plt.legend()
'''
Example #9
for s in source_list:
    # Describe each test source, then score it with the trained xgb model.
    print('----- source : %s   cnt : %d   overdue_rate : %.3f   start_day : %s   end_day : %s -----' % (s,
                                             test_data_describe_select[test_data_describe_select.source == s].sample_cnt.values[0],
                                             test_data_describe_select[test_data_describe_select.source == s].overdue_rate.values[0],
                                             str(test_data_describe_select[test_data_describe_select.source == s].t_min.values[0])[:10],
                                             str(test_data_describe_select[test_data_describe_select.source == s].t_max.values[0])[:10]
                                             ))
    log_text = log_text + '----- source : %s   cnt : %d   overdue_rate : %.3f   start_day : %s   end_day : %s -----' % (s,
                                             test_data_describe_select[test_data_describe_select.source == s].sample_cnt.values[0],
                                             test_data_describe_select[test_data_describe_select.source == s].overdue_rate.values[0],
                                             str(test_data_describe_select[test_data_describe_select.source == s].t_min.values[0])[:10],
                                             str(test_data_describe_select[test_data_describe_select.source == s].t_max.values[0])[:10]
                                             ) + '\n'


    data_temp = data_test_select[(data_test_select.source == s)]
    X_test,y_test = get_X_y(data_temp)
    #self_cv = evaluate_cv(X_test,y_test,model='xgb',pars=xgb_pars,fold_num = 5,num_round=60)
    #print 'self cv :'
    #log_text = log_text + 'self cv : \n'
    #print self_cv
    #log_text = log_text + str(self_cv) + '\n'
    pred_xgb = XGBoost_Predict(xgb_model,X_test)
    pred_list.append(pred_xgb)
    ks_result = ks(y_test,pred_xgb)
    auc_value = roc_auc_score(y_test, pred_xgb)
    print('cross sample :')
    print('ks : %.3f   auc : %.3f ' % (ks_result['ks'], auc_value))
    log_text = log_text + 'cross sample : \n'
    log_text = log_text + 'ks : %.3f   auc : %.3f ' % (ks_result['ks'], auc_value) + '\n'
    print(print_ks(ks_result))
np.save(pred_path, pred_list)
with open(log_path, 'w') as f:
    f.write(log_text)