def xgb_offline(train_data, cv_data):
    """Offline 5-fold XGBoost evaluation.

    Builds a (down-sampled) training set, trains one XGBoost model per
    fold with early stopping, accumulates out-of-fold predictions, and
    reports log-loss on train / out-of-fold / hold-out (cv_data) sets.

    Parameters
    ----------
    train_data : DataFrame containing an 'is_trade' label column.
    cv_data    : hold-out DataFrame, also with 'is_trade'.

    Returns
    -------
    (df, clf) : normalized feature-importance table and the model of the
                last fold.

    NOTE(review): mutates both input frames in place (drops 'is_trade');
    relies on module-level globals: build_train_dataset, rate, params,
    n_round, cal_log_loss, ceate_feature_map, KFold (old sklearn
    cross_validation API), xgb, np, pd, operator.
    """
    n_folds = 5
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    print('train shape:', train_data.shape)  # fixed typo: was 'train shap:'
    print('cv shape', cv_data.shape)
    kf = KFold(len(train_data), n_folds=n_folds, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], n_folds))
    # Loop-invariant: build the hold-out DMatrix once instead of per fold.
    test_feat = xgb.DMatrix(cv_data.values)
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.iloc[train_index]
        cv_feat = train_data.iloc[cv_index]
        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]
        clf = xgb.train(params=params, dtrain=train_feat,
                        num_boost_round=n_round, evals=watchlist,
                        early_stopping_rounds=7, verbose_eval=False)
        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(clf.best_iteration)
        print(clf.best_score)
        print('  训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # Feature importance of the last fold's model, normalized to sum to 1.
    features = train_data.columns
    ceate_feature_map(features)
    importance = clf.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    predict_test = np.median(test_preds, axis=1)
    # Undo the positive-class sampling-rate calibration introduced by
    # build_train_dataset (presumably negative down-sampling — TODO confirm).
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    # Each sample lands in the training split of exactly n_folds-1 folds,
    # so average by n_folds-1 (was a magic '/ 4').
    print('训练损失:', cal_log_loss(train_preds / (n_folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(predict_test, cv_Y))
    return df, clf
def lgb_offline(train_data, cv_data):
    """Offline 5-fold LightGBM evaluation.

    Same scheme as xgb_offline: per-fold training with early stopping,
    out-of-fold prediction accumulation, median-blended hold-out
    prediction with sampling-rate calibration.

    Parameters
    ----------
    train_data : DataFrame containing an 'is_trade' label column.
    cv_data    : hold-out DataFrame, also with 'is_trade'.

    Returns
    -------
    (gbm, feat_imp) : the last fold's booster and its feature-importance
                      Series (descending).

    NOTE(review): mutates both input frames in place; relies on globals
    build_train_dataset, rate, params, cal_log_loss, KFold, lgb, np, pd.
    """
    n_folds = 5
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    kf = KFold(len(train_data), n_folds=n_folds, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], n_folds))
    for i, (train_index, cv_index) in enumerate(kf):
        # .loc is safe here only because the index was reset to 0..n-1 above.
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        print('第{}次训练...'.format(i))
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params, train_set=lgb_train,
                        num_boost_round=6000, valid_sets=lgb_cv,
                        verbose_eval=False, early_stopping_rounds=100)
        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(cv_data.values)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        # Feature importance of this fold's booster (the original computed
        # this twice per iteration with identical results — deduplicated).
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        print(gbm.best_iteration)
        print(gbm.best_score)
        print('  训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the sampling-rate calibration applied by build_train_dataset.
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    # Each sample is in the training split of n_folds-1 folds (was '/ 4').
    print('训练损失:', cal_log_loss(train_preds / (n_folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(predict_test, cv_Y))
    return gbm, feat_imp
def LR_online(train_data, cv_data, test_data):
    """Online logistic-regression submission pipeline.

    Concatenates train+cv, resamples via build_train_dataset, trains one
    LogisticRegression per fold, and submits the calibrated median of the
    per-fold test predictions via submmit_result.

    Parameters
    ----------
    train_data, cv_data : labelled DataFrames ('is_trade' column).
    test_data           : submission DataFrame; must also carry the
                          'is_trade' column (it is dropped here).

    Returns
    -------
    None — the result is written out through submmit_result.

    NOTE(review): mutates test_data in place; relies on globals
    build_train_dataset, rate, cal_log_loss, submmit_result, KFold,
    LogisticRegression, np, pd.
    """
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], fold))
    for i, (train_index, cv_index) in enumerate(kf):
        # .loc works because the index was reset to 0..n-1 above.
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        clf = LogisticRegression(C=1.2, fit_intercept=True, max_iter=3000,
                                 class_weight={0: 0.5, 1: 0.5})
        clf.fit(X=train_feat.values, y=train_Y[train_index])
        predict_train = clf.predict_proba(train_feat.values)[:, 1]
        predict_cv = clf.predict_proba(cv_feat.values)[:, 1]
        predict_test = clf.predict_proba(test_data.values)[:, 1]
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print('  训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the sampling-rate calibration applied by build_train_dataset.
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    # Each sample is in the training split of fold-1 folds; the original
    # hardcoded '/ 4', silently wrong if 'fold' is ever changed.
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    submmit_result(predict_test, 'LR')