Exemplo n.º 1
0
def xgb_online(train, cv, test):
    """Train an XGBoost model with 5-fold CV on train+cv and predict test.

    The train and cv frames are concatenated into a single labelled pool;
    each fold's model predicts the test set and the per-fold predictions
    are combined with a median, then re-calibrated for the negative
    downsampling `rate`.  Relies on module-level `params`, `n_round`,
    `rate`, `ceate_feature_map`, `cal_log_loss` and `submmit_result`.

    Returns:
        (feat_imp, predict_test): normalized feature-importance frame of
        the final fold's model and the calibrated test predictions.
    """
    # Work on copies so the caller's frames are not mutated.
    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()

    # Online setting: use every labelled row (train + cv) for training.
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    # FIX: size the per-fold test matrix from `folds` (was a hard-coded 5).
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        test_feat = xgb.DMatrix(test_data.values)
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]

        clf = xgb.train(params=params, dtrain=train_feat,
                        num_boost_round=n_round, evals=watchlist,
                        early_stopping_rounds=7, verbose_eval=False)

        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)

        # Each row lands in the training split (folds - 1) times and in
        # the validation split exactly once.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(clf.best_iteration)
        print(clf.best_score)
        print('   训练损失:', cal_log_loss(predict_train,
                                       train_Y.loc[train_index]))
        print('   测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))

    # Feature importance of the last fold's model.  Hoisted out of the
    # loop: earlier iterations' results were overwritten anyway.
    features = train_data.columns
    ceate_feature_map(features)
    importance = clf.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    feat_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
    feat_imp['fscore'] = feat_imp['fscore'] / feat_imp['fscore'].sum()

    # Median across folds, then undo the downsampling bias with `rate`.
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'XGB')
    return feat_imp, predict_test
Exemplo n.º 2
0
def lgb_online(train, cv, test):
    """Train a LightGBM model with 5-fold CV on train+cv and predict test.

    Mirrors `xgb_online`: folds are trained on the concatenated labelled
    pool, per-fold test predictions are median-combined and re-calibrated
    for the negative downsampling `rate`.  Relies on module-level
    `params`, `rate`, `cal_log_loss` and `submmit_result`.

    Returns:
        (feat_imp, predict_test): feature-importance Series of the final
        fold's booster and the calibrated test predictions.
    """
    # Work on copies so the caller's frames are not mutated.
    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()

    # Online setting: use every labelled row (train + cv) for training.
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    # FIX: size the per-fold test matrix from `folds` (was a hard-coded 5).
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=50)
        # Feature importance of this fold's booster (the original computed
        # this twice per iteration; once is enough).
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)

        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(test_data.values)

        # Each row lands in the training split (folds - 1) times and in
        # the validation split exactly once.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv

        print(gbm.best_iteration)
        print(gbm.best_score)
        print('   训练损失:', cal_log_loss(predict_train,
                                       train_Y.loc[train_index]))
        print('   测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))

    # Median across folds, then undo the downsampling bias with `rate`.
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'LGB')
    return feat_imp, predict_test
Exemplo n.º 3
0
def LR_online(train_data, cv_data, test_data):
    """Train logistic-regression folds in parallel and submit test preds.

    The five folds are dispatched to a multiprocessing pool via the
    module-level `_LR_train` worker; per-fold test predictions are
    median-combined.  Relies on `cal_log_loss` and `submmit_result`.
    """
    print('on line')
    # Online setting: use every labelled row (train + cv) for training.
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], fold))

    # Build the worker payloads and the index pairs in ONE pass over the
    # splitter (the original iterated kf twice with two comprehensions).
    data_in_process = []
    index_all = []
    for train_index, cv_index in kf:
        data_in_process.append((train_data.loc[train_index],
                                train_data.loc[cv_index], test_data))
        index_all.append((train_index, cv_index))
    # NOTE(review): 'is_trade' is never dropped here, so every payload
    # still carries the label column — confirm _LR_train removes it.
    with multiprocessing.Pool(fold) as p:
        k_val_list = p.map(_LR_train, data_in_process)
    for i, index, val in zip(range(fold), index_all, k_val_list):
        print('no %d train' % (i))
        train_index, cv_index = index
        predict_train, predict_cv, predict_test = val
        # Each row lands in the training split (fold - 1) times and in
        # the validation split exactly once.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print('  训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    print('mean:', np.mean(predict_test))
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    submmit_result(predict_test, 'LR')
Exemplo n.º 4
0
def LR_online(train_data, cv_data, test_data):
    """Train a logistic-regression model with 5-fold CV and submit.

    Builds a (possibly resampled) training pool via `build_train_dataset`,
    trains one LR per fold, median-combines the per-fold test predictions
    and re-calibrates them for the downsampling `rate`.  Relies on
    module-level `rate`, `cal_log_loss` and `submmit_result`.
    """
    # Online setting: use every labelled row (train + cv) for training.
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], fold))
    for i, (train_index, cv_index) in enumerate(kf):

        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        clf = LogisticRegression(C=1.2, fit_intercept=True, max_iter=3000,
                                 class_weight={0: 0.5, 1: 0.5})
        clf.fit(X=train_feat.values, y=train_Y[train_index])

        predict_train = clf.predict_proba(train_feat.values)[:, 1]
        predict_cv = clf.predict_proba(cv_feat.values)[:, 1]
        predict_test = clf.predict_proba(test_data.values)[:, 1]
        # Each row lands in the training split (fold - 1) times and in
        # the validation split exactly once.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test

        print('  训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # Median across folds, then undo the downsampling bias with `rate`.
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    # FIX: average by (fold - 1) instead of the hard-coded 4.
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    submmit_result(predict_test, 'LR')
Exemplo n.º 5
0
def online(train_data, cv_data, test_data):
    """5-fold LightGBM plus a per-old-user LR correction, then submit.

    Stage 1 trains LightGBM on the concatenated train+cv pool; stage 2
    re-scores "old users" (rows whose smoothed historic CVR is present,
    i.e. user_id_cvr_smooth != -1) with a logistic regression fitted on
    their history features plus the stage-1 score.  Relies on
    module-level `params`, `cal_log_loss` and `submmit_result`.

    Returns:
        (test_preds, feat_imp): final test predictions and the last
        fold's feature-importance Series.
    """
    # Shift cv row labels so the concatenated frame has unique indices.
    # NOTE(review): this mutates the caller's cv_data index in place —
    # confirm callers do not reuse cv_data afterwards.
    cv_data.index += len(train_data)
    train_data = pd.concat([train_data, cv_data], axis=0)
    # Keep the old-user history columns aside before they are used below.
    history_cols = ['user_id_cvr_smooth', 'user_id_buy_count']
    old_user_data_train = train_data[history_cols]
    old_user_data_test = test_data[history_cols]

    # Stage 1: LightGBM K-fold training on the full pool.
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):

        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        print('第{}次训练...'.format(i))
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=200)
        # Feature importance of this fold's booster (the original computed
        # this twice per iteration; once is enough).
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)

        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)

        test_preds[:, i] = gbm.predict(test_data.values)
        # Each row lands in the training split (folds - 1) times and in
        # the validation split exactly once.
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv

        print(gbm.best_iteration)
        print(gbm.best_score)
        print('   训练损失:', cal_log_loss(predict_train,
                                       train_Y.loc[train_index]))
        print('   测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    test_preds = np.median(test_preds, axis=1)
    print(params)
    # FIX: average by (folds - 1) instead of the hard-coded 4.
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))

    # Stage 2: split out old users (rows with observed smoothed CVR).
    train_old_user_index = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].index
    test_old_user_index = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].index

    train_old_score = cv_preds[train_old_user_index]
    test_old_score = test_preds[test_old_user_index]

    # .copy() so the added 'y' column writes to an owned frame instead of
    # a view (silences pandas SettingWithCopyWarning; same values).
    new_train_data = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].copy()
    new_test_data = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].copy()
    new_train_data['y'] = train_old_score
    new_test_data['y'] = test_old_score
    new_train_Y = train_Y[train_old_user_index]

    # Train a dedicated LR for old users on history + stage-1 score.
    clf = LogisticRegression(C=12,
                             fit_intercept=True,
                             max_iter=3000,
                             class_weight={
                                 0: 0.5,
                                 1: 0.5
                             })
    clf.fit(X=new_train_data.values, y=new_train_Y)

    train_LR_score = clf.predict_proba(new_train_data.values)[:, 1]
    test_LR_score = clf.predict_proba(new_test_data.values)[:, 1]

    # Overwrite the old-user rows with the corrected scores.
    cv_preds[train_old_user_index] = train_LR_score
    test_preds[test_old_user_index] = test_LR_score
    # Old-user loss, then overall loss after the correction is spliced in.
    print('LR train:', cal_log_loss(train_LR_score, new_train_Y))
    print('All train:', cal_log_loss(cv_preds, train_Y))

    submmit_result(test_preds, 'old_and_new')
    return test_preds, feat_imp
Exemplo n.º 6
0
    # Fragment (the enclosing def lies outside this view): second-stage
    # blender — fit a linear model on the first-stage day-7 predictions.
    clf = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    clf.fit(X=predict_day7, y=train_Y)

    predict_train_2 = clf.predict(predict_day7)
    predict_cv_2 = clf.predict(predict_cv)

    # NOTE(review): LinearRegression can emit values outside [0, 1];
    # presumably cal_log_loss clips its input — confirm.
    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    print('train mean:', np.mean(predict_train_2))
    print('cv mean:', np.mean(predict_cv_2))

    # Return the blended cv-set predictions only.
    return predict_cv_2


if __name__ == '__main__':

    start = time.time()

    # Load the pre-split day-7 / cv / test frames from the tree-model cache.
    train7 = _load_splited_df(path=cache_pkl_path + 'Tree_day/train_7')
    valid = _load_splited_df(path=cache_pkl_path + 'Tree_day/cv')
    holdout = _load_splited_df(path=cache_pkl_path + 'Tree_day/test')

    # Offline run: evaluate the second-stage model against the cv split.
    print('off line')
    _ = XGB_model_second(train7, valid)

    # Online run: score the real test split and submit.
    print('on line')
    #    day7_data = pd.concat([day7_data, cv_data],axis=0)
    stage2_preds = XGB_model_second(train7, holdout)
    submmit_result(stage2_preds, 'XGB_LR')
Exemplo n.º 7
0
    # Fragment (the enclosing def lies outside this view): GBDT + LR stack.
    # Stage 1: fit the gradient-boosting classifier on the raw features.
    gbc.fit(train_data.values, train_Y)
    predict_train = gbc.predict_proba(train_data.values)[:,1]
    predict_cv = gbc.predict_proba(cv_data.values)[:,1]
    predict_test = gbc.predict_proba(test_data.values)[:,1]

#    print(gbc.get_params)
    print('训练损失:',cal_log_loss(predict_train, train_Y))
    print('测试损失:',cal_log_loss(predict_cv, cv_Y))
    t1 = time.time()
    print('训练用时:',t1-t0)

    # Stage 2 inputs: features derived from the fitted GBDT.
    # NOTE(review): presumably gen_gbdt_feature encodes leaf indices into
    # LR-ready matrices — confirm against its definition.
    new_train, new_cv, new_test = gen_gbdt_feature(gbc, train_data, cv_data, test_data)
    print('train shap:',new_train.shape)
    print('cv shape', new_cv.shape)
    print('test shape', new_test.shape)

    # LR prediction on the GBDT-derived features.
    clf = LogisticRegression(C=0.8, fit_intercept=True, max_iter=3000,class_weight={0:0.5, 1:0.5})
    clf.fit(X=new_train, y=np.squeeze(train_Y))

    predict_train = clf.predict_proba(new_train)[:,1]
    predict_cv = clf.predict_proba(new_cv)[:,1]
    predict_test = clf.predict_proba(new_test)[:,1]

    print('训练损失:',cal_log_loss(predict_train, train_Y))
    print('测试损失:',cal_log_loss(predict_cv, cv_Y))
    t1 = time.time()
    print('训练用时:',t1-t0)

    submmit_result(predict_test,'GBDT_LR')
Exemplo n.º 8
0
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 16 22:13:45 2018

@author: weiqing
"""

import pandas as pd
import numpy as np
from utils import load_pickle, raw_data_path, feature_data_path, cache_pkl_path, result_path, model_path, submmit_result

if __name__ == '__main__':

    # Load the three single-model submission files (space-separated).
    XGB = pd.read_csv('../result/XGB_20180421_211412.txt', sep=' ')
    LGB = pd.read_csv('../result/LGB_20180421_172434.txt', sep=' ')
    FFM = pd.read_csv('../result/FFM_20180421_215653.txt', sep=' ')

    # Element-wise median across the three models' predicted scores.
    stacked = np.column_stack([XGB['predicted_score'].values,
                               LGB['predicted_score'].values,
                               FFM['predicted_score'].values])
    median = np.median(stacked, axis=1)

    submmit_result(median, 'median')
Exemplo n.º 9
0
def change_to_result():
    """Repackage the raw FFM output file into a standard submission."""
    # The file holds a single unnamed column of scores; squeeze to 1-D.
    raw = pd.read_csv('../result/ffm_online_result.csv', header=None)
    submmit_result(np.squeeze(raw.values), 'FFM')
Exemplo n.º 10
0

if __name__ == '__main__':

    start = time.time()

    # Load the pre-split day-7 / cv / test frames from the LR cache.
    day7 = _load_splited_df(path=cache_pkl_path + 'LR_day/train_7')
    valid = _load_splited_df(path=cache_pkl_path + 'LR_day/cv')
    holdout = _load_splited_df(path=cache_pkl_path + 'LR_day/test')

    # Stage 1: per-day logistic-regression predictions.
    stage1_train, stage1_cv, stage1_test = LR_model_first(
        day7, valid, holdout)

    # Stage 2: blend the stage-1 scores with a linear model.
    y_train = day7['is_trade'].values
    y_cv = valid['is_trade'].values

    blender = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    blender.fit(X=stage1_train, y=y_train)

    stage2_train = blender.predict(stage1_train)
    stage2_cv = blender.predict(stage1_cv)
    stage2_test = blender.predict(stage1_test)

    print('train:', cal_log_loss(stage2_train, y_train))
    print('test:', cal_log_loss(stage2_cv, y_cv))
    print('train mean:', np.mean(stage2_train))
    print('cv mean:', np.mean(stage2_cv))
    print('test mean:', np.mean(stage2_test))
    submmit_result(stage2_test, 'LR_LR')