    'eta': 0.1,
    'eval_metric': 'auc',
    'max_depth': 4,
    'objective': 'binary:logistic',
    'silent': 1,
    'tree_method': 'hist',
    'nthread': 64,
    'seed': SEED
}

#dtrain = xgb.DMatrix(X, y)
#col = X.columns
#del X, y; gc.collect()

gc.collect()

# 5-fold CV stacking; nround is capped at 9999 and early stopping (esr=30)
# picks the effective number of rounds
yhat, imp, ret = ex.stacking(X, y, param, 9999, nfold=5, esr=30)
imp.to_csv('imp_{}.csv'.format(datetime.today().date()), index=False)

# =============================================================================
# cv
# =============================================================================
#model = xgb.train(param, dbuild, NROUND, watchlist, verbose_eval=10,
#                  early_stopping_rounds=50)
#
#imp = ex.getImp(model)
#imp.to_csv('imp.csv', index=False)
#==============================================================================

utils.end(__file__)
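# -----------------------------------------------------------------------------
# ex.stacking is a project-local helper whose internals are not shown in this
# file. Below is a minimal sketch of what such a helper typically does --
# out-of-fold (OOF) prediction with xgboost plus early stopping. The name
# stacking_sketch, the fold logic, and the returned dict are illustrative
# assumptions that only mirror the call sites above, not the project's actual
# implementation.
# -----------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold


def stacking_sketch(X, y, param, nround, nfold=5, esr=30, seed=0):
    """Return OOF predictions, gain-based importance, and fold metadata."""
    yhat = np.zeros(len(X))
    gain = {}
    best_iters = []
    kf = KFold(n_splits=nfold, shuffle=True, random_state=seed)
    for trn_idx, val_idx in kf.split(X):
        dtrn = xgb.DMatrix(X.iloc[trn_idx], y.iloc[trn_idx])
        dval = xgb.DMatrix(X.iloc[val_idx], y.iloc[val_idx])
        model = xgb.train(param, dtrn, nround, evals=[(dval, 'valid')],
                          early_stopping_rounds=esr, verbose_eval=False)
        # predict with the best iteration found by early stopping
        yhat[val_idx] = model.predict(dval, ntree_limit=model.best_ntree_limit)
        best_iters.append(model.best_iteration)
        # accumulate gain importance across folds
        for f, g in model.get_score(importance_type='gain').items():
            gain[f] = gain.get(f, 0.0) + g
    imp = pd.DataFrame(sorted(gain.items(), key=lambda t: -t[1]),
                       columns=['feature', 'gain'])
    return yhat, imp, {'best_iter': int(np.mean(best_iters))}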
    ])
    return df


X = pd.concat([question_id]
              + [utils.read_pickles(f) for f in train_files]
              + [vec], axis=1)
#X = pd.concat([question_id] + [subsample(f) for f in train_files], axis=1)

# log1p-transform the target; RMSE on the log scale then behaves like RMSLE
y = utils.read_pickles('../data/label')['answer_score'].map(np.log1p)

# =============================================================================
# xgb
# =============================================================================
params = {
    'max_depth': 5,
    'eta': 0.1,
    'colsample_bytree': 0.7,
    'silent': 1,
    'eval_metric': 'rmse',
    'objective': 'reg:linear',
    'tree_method': 'hist'
}

# CV grouped by question_id so rows sharing a question stay in one fold
yhat, imp, ret = ex.stacking(X, y, params, 9999, esr=50,
                             by='question_id', seed=SEED)
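# -----------------------------------------------------------------------------
# The by='question_id' argument above suggests fold assignment is grouped so
# that all answers to the same question land in the same fold; otherwise OOF
# scores leak between rows that share a question. A minimal sketch of that
# grouping with scikit-learn's GroupKFold -- an assumption about the helper's
# internals, not its actual code:
# -----------------------------------------------------------------------------
from sklearn.model_selection import GroupKFold


def group_fold_indices(X, groups, nfold=5):
    """Yield (train_idx, valid_idx) pairs with no group split across folds."""
    for trn_idx, val_idx in GroupKFold(n_splits=nfold).split(X, groups=groups):
        yield trn_idx, val_idx

# Since y is log1p(answer_score), invert OOF predictions with np.expm1 before
# interpreting them on the original score scale.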
}

y = X.is_duplicate
col = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']

lls = []
iters = []
for i in range(loop):
    # down-sample to the target positive rate p, then run 5-fold CV
    X_, y_ = utils.down_sampling(X, y, p)
    sub = X_[col].copy()
    X_.drop(col, axis=1, inplace=True)
    yhat, imp, ret = ex.stacking(X_, y_, param, nround, esr=30, nfold=5)
    ll = log_loss(y_, yhat)
    iter_ = ret['best_iter']
    print('998-1_cv LOGLOSS:', ll)
    print('998-1_cv best iter:', iter_)
    lls.append(ll)
    iters.append(iter_)

print(np.mean(lls), np.mean(iters))

# inspect the worst predictions from the last iteration: largest error first
sub['yhat'] = yhat
sub['d'] = abs(sub.is_duplicate - sub.yhat)
sub.sort_values('d', ascending=False, inplace=True)
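# -----------------------------------------------------------------------------
# utils.down_sampling(X, y, p) is a project helper; a reasonable reading is
# that it drops negatives at random until positives make up fraction p of the
# sample (the usual trick for matching a test set's class prior). A minimal
# sketch under that assumption -- the helper's real behavior may differ:
# -----------------------------------------------------------------------------
import numpy as np


def down_sampling_sketch(X, y, p, seed=0):
    """Randomly drop negatives so positives form fraction p of the result."""
    rng = np.random.RandomState(seed)
    pos_idx = np.where(y == 1)[0]
    neg_idx = np.where(y == 0)[0]
    # solve P / (P + n_neg) = p for the number of negatives to keep
    n_neg = int(len(pos_idx) * (1 - p) / p)
    keep = np.concatenate([pos_idx, rng.choice(neg_idx, n_neg, replace=False)])
    keep.sort()
    return (X.iloc[keep].reset_index(drop=True),
            y.iloc[keep].reset_index(drop=True))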
test_sub = test[['test_id']].copy()

train = pd.merge(train, utils.load_010([0, 1, 2], 1), on='id', how='left')
test = pd.merge(test, utils.load_010([0, 1, 2], 0), on='test_id', how='left')

# drop feature names xgboost cannot handle
col = get_valid_col(train.columns)
train.drop(col, axis=1, inplace=True)
test.drop(col, axis=1, inplace=True)

yhat, imp, ret = ex.stacking(train.drop('id', axis=1), y_train, param, nround,
                             esr=30, test=test.drop('test_id', axis=1))
gc.collect()
print('log_loss:', log_loss(y_train, yhat))

train_sub['yhat'] = yhat
test_sub['yhat'] = ret.get('test')
train_sub.drop('is_duplicate', axis=1).to_csv('../feature/train_f009_xgb.csv.gz',
                                              index=False, compression='gzip')
test_sub.to_csv('../feature/test_f009_xgb.csv.gz',
                index=False, compression='gzip')
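# -----------------------------------------------------------------------------
# The two gzip files written above are OOF meta-features intended for a later
# stacking layer. A hypothetical downstream merge (paths as written above;
# merge keys assumed from the columns each frame carries):
# -----------------------------------------------------------------------------
import pandas as pd

train_meta = pd.read_csv('../feature/train_f009_xgb.csv.gz')  # columns: id, yhat
test_meta = pd.read_csv('../feature/test_f009_xgb.csv.gz')    # columns: test_id, yhat
#train = pd.merge(train, train_meta, on='id', how='left')
#test = pd.merge(test, test_meta, on='test_id', how='left')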
y_train = X.is_duplicate
sub = X[['id', 'is_duplicate']].copy()
X.drop(col, axis=1, inplace=True)


def get_valid_col(col):
    # return feature names xgboost rejects (they contain ',', '[', ']' or '>')
    return [c for c in col
            if c.count(',') > 0
            or c.count('[') > 0
            or c.count(']') > 0
            or c.count('>') > 0]


col = get_valid_col(X.columns)
X.drop(col, axis=1, inplace=True)

yhat, imp, ret = ex.stacking(X, y_train, param, nround, esr=30)

sub['yhat'] = yhat
sub[['id', 'yhat']].to_csv('../output/cv.csv.gz', index=False, compression='gzip')
imp.to_csv('../output/imp.csv.gz', index=False, compression='gzip')
print(log_loss(y_train, yhat))

"""
eta: 0.02    CV: 0.317928303359 ==> LB: 0.28018
eta: 0.2     CV: 0.363839232148
"""
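# -----------------------------------------------------------------------------
# get_valid_col drops offending features entirely, since older xgboost
# versions reject feature names containing '[', ']', '<', '>' or ','. An
# alternative sketch that keeps the features by renaming them instead -- a
# swapped-in technique, not what this script does:
# -----------------------------------------------------------------------------
import re


def sanitize_columns(df):
    """Replace characters xgboost rejects in feature names with '_'."""
    df = df.copy()
    df.columns = [re.sub(r'[\[\]<>,]', '_', str(c)) for c in df.columns]
    return df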