def xgb_evaluate(self, **Tunparams):
    """Objective for Bayesian optimisation: negative out-of-fold RMSE.

    The proposed hyper-parameters are sanitised, merged over ``self.param``
    and used to build a fresh estimator of the same class as ``self.clf``.
    That estimator is trained on a 2-fold split of ``self.x_train`` against
    ``log1p(self.y_train)``.  ``self.clf`` itself is left untouched.

    Side effects: warnings are silenced globally and a ``fullVisitorId``
    column, when present, is dropped in place from ``self.x_train``,
    ``self.x_test`` and ``self.y_train``.

    Args:
        **Tunparams: candidate hyper-parameters.  Must contain
            ``max_depth``, ``min_child_weight``, ``colsample_bytree``,
            ``subsample``, ``gamma`` and ``alpha``; any other keys are
            forwarded to the estimator unchanged.

    Returns:
        The negated RMSE between ``log1p(self.y_train)`` and the
        out-of-fold predictions (negated because the optimiser maximises).

    Raises:
        KeyError: if one of the required keys is missing.
    """
    warnings.filterwarnings("ignore")

    # The optimiser proposes floats; these two are cast to integers.
    Tunparams['max_depth'] = int(Tunparams['max_depth'])
    Tunparams['min_child_weight'] = int(Tunparams['min_child_weight'])
    # Clamp sampling ratios into [0, 1] and penalties to be non-negative.
    # (The original stored the clamped ratio under a misspelled key.)
    Tunparams['colsample_bytree'] = max(
        min(Tunparams['colsample_bytree'], 1), 0)
    Tunparams['subsample'] = max(min(Tunparams['subsample'], 1), 0)
    Tunparams['gamma'] = max(Tunparams['gamma'], 0)
    Tunparams['alpha'] = max(Tunparams['alpha'], 0)

    # Folds are requested before the id column is dropped, as before.
    folds = data_prepare.get_folds(
        df=self.x_train[['totals.pageviews']].reset_index(), n_splits=2)

    for frame in (self.x_train, self.x_test, self.y_train):
        if 'fullVisitorId' in frame.columns:
            frame.drop('fullVisitorId', axis=1, inplace=True)

    # Build the candidate model the same way __init__ builds self.clf, so
    # the score actually reflects the parameters under evaluation.
    model = type(self.clf)(**dict(self.param, **Tunparams))

    oof_preds = np.zeros(self.x_train.shape[0])

    for n_fold, (train_idx, valid_idx) in enumerate(folds):
        train_x, train_y = (self.x_train.iloc[train_idx],
                            self.y_train.iloc[train_idx])
        valid_x, valid_y = (self.x_train.iloc[valid_idx],
                            self.y_train.iloc[valid_idx])

        model.fit(train_x,
                  np.log1p(train_y),
                  eval_set=[(valid_x, np.log1p(valid_y))],
                  eval_metric='rmse',
                  verbose=100,
                  early_stopping_rounds=200)

        oof_preds[valid_idx] = model.predict(
            valid_x, ntree_limit=model.best_iteration)

        # Clip negative predictions to zero.
        oof_preds[oof_preds < 0] = 0

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    rmse = mean_squared_error(np.log1p(self.y_train), oof_preds) ** .5
    return -rmse
def cv(self, nfolds=5, submission=True):
    """Run K-fold cross-validation with ``LGBMRegressor(**self.param)``.

    One model is trained per fold on ``log1p(self.y_train)``.  Out-of-fold
    and test predictions are collected, per-fold feature importances are
    accumulated, and progress is written to ``self.logfile``.  The fitted
    models are stored in ``self.regressors``.

    Side effects: a ``fullVisitorId`` column, when present, is dropped in
    place from ``self.x_train`` and ``self.x_test`` (``self.y_train`` is
    left as is); two ``.npy`` files are written under ``../output/feats/``;
    ``self.display_importances`` is called.

    Args:
        nfolds: number of folds requested from ``data_prepare.get_folds``.
        submission: when true, folds are built from the
            ``totals.pageviews`` column with the index reset, and the
            averaged test predictions are written to
            ``../output/submission/<name>.csv``.  When false, folds are
            built from the whole of ``self.x_train`` and no CSV is written.

    Returns:
        ``(feature_importance_df, full_rmse, oof_preds, preds)`` where
        ``preds`` is the mean of the per-fold test predictions.
    """
    self.regressors.clear()
    self.feature_importance_df = pd.DataFrame()

    if not submission:
        folds = data_prepare.get_folds(df=self.x_train, n_splits=nfolds)
    else:
        folds = data_prepare.get_folds(
            df=self.x_train[['totals.pageviews']].reset_index(),
            n_splits=nfolds)

    for frame in (self.x_train, self.x_test):
        if 'fullVisitorId' in frame.columns:
            frame.drop('fullVisitorId', axis=1, inplace=True)

    oof_preds = np.zeros(self.x_train.shape[0])
    preds_test = np.empty((nfolds, self.x_test.shape[0]))

    self.logfile.write('param: {}\n'.format(self.param))
    self.logfile.write('fold: {}\n'.format(nfolds))
    self.logfile.write('data shape: {}\n'.format(self.x_train.shape))
    self.logfile.write('features: {}\n'.format(
        self.x_train.columns.tolist()))
    if self.comment is not None:
        self.logfile.write('comment: {}\n'.format(self.comment))
    self.logfile.write('output: ../output/{}.csv\n'.format(self.name))
    self.logfile.flush()

    for n_fold, (train_idx, valid_idx) in enumerate(folds):
        fstart = time.time()
        train_x, train_y = (self.x_train.iloc[train_idx],
                            self.y_train.iloc[train_idx])
        valid_x, valid_y = (self.x_train.iloc[valid_idx],
                            self.y_train.iloc[valid_idx])

        # A fresh model per fold so the fitted models can be kept.
        clf = LGBMRegressor(**self.param)
        clf.fit(train_x,
                np.log1p(train_y),
                eval_set=[(valid_x, np.log1p(valid_y))],
                eval_metric='rmse',
                verbose=100,
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict(
            valid_x, num_iteration=clf.best_iteration_)
        preds_test[n_fold, :] = clf.predict(
            self.x_test, num_iteration=clf.best_iteration_)

        # Clip negative predictions to zero.
        oof_preds[oof_preds < 0] = 0
        preds_test[preds_test < 0] = 0

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = self.x_train.columns.tolist()
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        self.feature_importance_df = pd.concat(
            [self.feature_importance_df, fold_importance_df], axis=0)

        strlog = '[{}][{:.1f} sec] Fold {} RMSE : {:.6f}'.format(
            str(datetime.now()),
            time.time() - fstart, n_fold + 1,
            mean_squared_error(np.log1p(valid_y),
                               oof_preds[valid_idx]) ** .5)
        print(strlog)
        self.logfile.write(strlog + '\n')
        self.logfile.flush()

        self.regressors.append(clf)
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    full_rmse = mean_squared_error(np.log1p(self.y_train), oof_preds) ** .5
    strlog = 'Full RMSE score {:.6f}'.format(full_rmse)
    print(strlog)
    self.logfile.write(strlog + '\n')

    preds = preds_test.mean(axis=0)
    if submission:
        # ``preds`` is a NumPy array and has no ``to_csv``; wrap it in a
        # frame that carries the test-set index so the CSV is keyed by row.
        submission_df = pd.DataFrame({'PredictedLogRevenue': preds},
                                     index=self.x_test.index)
        submission_df.to_csv(
            '../output/submission/{}.csv'.format(self.name), index=True)

    cols = self.feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:100].index
    self.logfile.write('top features:\n')
    for c in cols:
        self.logfile.write('{}\n'.format(c))
    self.logfile.flush()

    self.display_importances(self.feature_importance_df, self.name)

    # Saved for stacking.
    np.save('../output/feats/{}_trn_prd_feats'.format(self.name), oof_preds)
    np.save('../output/feats/{}_tes_prd_feats'.format(self.name), preds)

    return self.feature_importance_df, full_rmse, oof_preds, preds
def __init__(self,
             name,
             clf,
             comment=None,
             remove_columns=None,
             param=None,
             lm_seed=None,
             log=None,
             predict_feats=False,
             debug=True,
             BASE_X_PATH=None,
             TRN_PRED_FEATS=None,
             TES_PRED_FEATS=None,
             nfolds=5,
             scale=False):
    """Load the data, build the estimator and prepare train/test frames.

    Args:
        name: run name; used for the log file name when ``log`` is None.
        clf: estimator class (not instance); called as ``clf(**param)``.
        comment: optional free text stored on the instance.
        remove_columns: optional collection of column names to drop from
            the loaded frame.
        param: estimator keyword arguments; defaults to ``{'alpha': 0.5}``.
            The mapping is copied, so the caller's dict is never modified.
        lm_seed: when given, stored in the parameters as ``random_state``.
        log: optional log file base name overriding ``name``.
        predict_feats: when true, ``add_pred_feats`` is applied to the
            sampled frames; otherwise they are averaged per
            ``fullVisitorId``.
        debug: when true, only a 30 % sample of the rows is kept.
        BASE_X_PATH: feather file holding the base frame.  The rest of the
            constructor reads ``self.x``, which is only assigned when this
            is given.
        TRN_PRED_FEATS, TES_PRED_FEATS: ``.npy`` files with prediction
            features; read only when given, used when ``predict_feats``.
        nfolds: number of folds stored in ``self.folds``.
        scale: when true, features are min-max scaled to ``[0, 1]``.
    """
    self.name = name
    self.comment = comment

    # NOTE: the handle stays open for the lifetime of the object; nothing
    # in this method closes it.
    log_name = name if log is None else log
    self.logfile = open('../output/log/{}.txt'.format(log_name), 'w')

    # Copy so that seeding here (and later updates of self.param) cannot
    # leak into a dict the caller still owns.
    self.param = {'alpha': 0.5} if param is None else dict(param)
    if lm_seed is not None:
        self.param['random_state'] = lm_seed

    self.feature_importance_df = None
    self.Tunparams = None

    if BASE_X_PATH is not None:
        self.x = feather.read_dataframe(BASE_X_PATH)
    if TRN_PRED_FEATS is not None:
        self.trn_preds_feats = np.load(TRN_PRED_FEATS)
    if TES_PRED_FEATS is not None:
        self.tes_preds_feats = np.load(TES_PRED_FEATS)

    if remove_columns is not None:
        drop_features = [
            _f for _f in self.x.columns if _f in remove_columns
        ]
        self.x.drop(drop_features, axis=1, inplace=True)
        del drop_features
        gc.collect()

    self.clf = clf(**self.param)

    print('read & prepare datasets shape: {}'.format(self.x.shape))

    # Split into train and test sets.
    self.x_train, self.y_train, self.x_test, self.y_train_ag = prep_and_split(
        self.x)

    # Debug mode works on a 30 % sample; otherwise all rows are kept
    # (frac=1 still goes through ``sample``, as in the original code).
    frac = 0.3 if debug else 1
    x_train_s = self.x_train.sample(frac=frac)
    x_test_s = self.x_test.sample(frac=frac)
    y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
        x_train_s.index)]

    if predict_feats:
        self.x_train, trn_feats = add_pred_feats(x_train_s,
                                                 self.trn_preds_feats,
                                                 None)
        self.x_test, _ = add_pred_feats(x_test_s, self.tes_preds_feats,
                                        trn_feats)
    else:
        self.x_train = x_train_s.groupby('fullVisitorId').mean()
        self.x_test = x_test_s.groupby('fullVisitorId').mean()
    self.y_train = y_train_s.groupby('fullVisitorId').sum()

    # Replace missing values with 0 and infinities with +/-99999.
    for frame in (self.x_train, self.x_test):
        frame.fillna(0, inplace=True)
        frame.replace(np.inf, 99999, inplace=True)
        frame.replace(-np.inf, -99999, inplace=True)

    del x_train_s, x_test_s, y_train_s
    gc.collect()

    self.folds = data_prepare.get_folds(
        df=self.x_train[['totals.pageviews']].reset_index(),
        n_splits=nfolds)

    if scale:
        print("scale...")
        scaler = MinMaxScaler(feature_range=(0, 1))
        feature_cols = self.x_train.columns
        self.x_train[feature_cols] = scaler.fit_transform(
            self.x_train[feature_cols])
        # Reuse the ranges learned on the training set; re-fitting on the
        # test set would map the two sets onto different scales.
        self.x_test[feature_cols] = scaler.transform(
            self.x_test[feature_cols])
def cv(self, nfolds=5, submission=True, tune=True, nthread=16):
    """Optionally tune ``self.clf``, then run K-fold cross-validation.

    When ``tune`` is true a Bayesian optimisation over ``self.Tunparams``
    is run with ``self.xgb_evaluate`` as objective.  The best parameters
    are merged into ``self.param`` and ``self.clf`` is rebuilt from them,
    so the cross-validation below uses the tuned model.

    Side effects: a ``fullVisitorId`` column, when present, is dropped in
    place from ``self.x_train``, ``self.x_test`` and ``self.y_train``;
    progress is written to ``self.logfile``; two ``.npy`` files are written
    under ``../output/log/``; ``self.display_importances`` is called.

    Args:
        nfolds: number of folds requested from ``data_prepare.get_folds``.
        submission: when true, the averaged test predictions are written
            to ``../output/submission/<name>.csv``.
        tune: run the hyper-parameter search before cross-validating.
        nthread: accepted for compatibility; not used by this method.

    Returns:
        ``(feature_importance_df, full_rmse, oof_preds, preds)`` where
        ``preds`` is the mean of the per-fold test predictions.
    """
    self.feature_importance_df = pd.DataFrame()
    # Search space: (lower, upper) bounds per hyper-parameter.
    self.Tunparams = {
        'colsample_bytree': (0.5, 1),
        'learning_rate': (.01, .05),
        'subsample': (0.8, 1),
        'max_depth': (1, 10),
        'alpha': (.02, .10),
        'lambda': (.02, .08),
        'gamma': (0, 10),
        'min_child_weight': (1, 40)
    }

    folds = data_prepare.get_folds(
        df=self.x_train[['totals.pageviews']].reset_index(),
        n_splits=nfolds)

    for frame in (self.x_train, self.x_test, self.y_train):
        if 'fullVisitorId' in frame.columns:
            frame.drop('fullVisitorId', axis=1, inplace=True)

    print('y_train columns:', self.y_train.columns)
    print('y_train shape drop id?:', self.y_train.shape)

    oof_preds = np.zeros(self.x_train.shape[0])
    preds_test = np.empty((nfolds, self.x_test.shape[0]))

    if tune:
        bo = BayesianOptimization(self.xgb_evaluate, self.Tunparams)
        # NOTE(review): 'rnd' is not one of the acquisition names
        # ('ucb', 'ei', 'poi') that bayes_opt is known to document --
        # confirm the installed version accepts it.
        bo.maximize(init_points=5, n_iter=50, acq='rnd')

        best_params = bo.res['max']['max_params']
        # Only parameters that are in the search space can be cast;
        # 'num_leaves' is not among them and used to raise KeyError here.
        for int_key in ('max_depth', 'min_child_weight'):
            if int_key in best_params:
                best_params[int_key] = int(best_params[int_key])

        self.logfile.write('best param process: {}\n'.format(
            bo.res['max']['max_val']))
        self.param.update(best_params)
        # Rebuild the estimator so the tuned parameters are actually used.
        self.clf = type(self.clf)(**self.param)
        self.logfile.write('best param: {}\n'.format(self.param))

    self.logfile.write('param: {}\n'.format(self.param))
    self.logfile.write('fold: {}\n'.format(nfolds))
    self.logfile.write('data shape: {}\n'.format(self.x_train.shape))
    self.logfile.write('features: {}\n'.format(
        self.x_train.columns.tolist()))
    if self.comment is not None:
        self.logfile.write('comment: {}\n'.format(self.comment))
    self.logfile.write('output: ../output/{}.csv\n'.format(self.name))
    self.logfile.flush()

    for n_fold, (train_idx, valid_idx) in enumerate(folds):
        fstart = time.time()
        train_x, train_y = (self.x_train.iloc[train_idx],
                            self.y_train.iloc[train_idx])
        valid_x, valid_y = (self.x_train.iloc[valid_idx],
                            self.y_train.iloc[valid_idx])

        self.clf.fit(train_x,
                     np.log1p(train_y),
                     eval_set=[(valid_x, np.log1p(valid_y))],
                     eval_metric='rmse',
                     verbose=100,
                     early_stopping_rounds=200)

        oof_preds[valid_idx] = self.clf.predict(
            valid_x, ntree_limit=self.clf.best_iteration)
        # Select the training columns explicitly before predicting.
        preds_test[n_fold, :] = self.clf.predict(
            self.x_test[self.x_train.columns],
            ntree_limit=self.clf.best_iteration)

        # Clip negative predictions to zero.
        oof_preds[oof_preds < 0] = 0
        preds_test[preds_test < 0] = 0

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = self.x_train.columns.tolist()
        fold_importance_df["importance"] = self.clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        self.feature_importance_df = pd.concat(
            [self.feature_importance_df, fold_importance_df], axis=0)

        strlog = '[{}][{:.1f} sec] Fold {} RMSE : {:.6f}'.format(
            str(datetime.now()),
            time.time() - fstart, n_fold + 1,
            mean_squared_error(np.log1p(valid_y),
                               oof_preds[valid_idx]) ** .5)
        print(strlog)
        self.logfile.write(strlog + '\n')
        self.logfile.flush()

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    full_rmse = mean_squared_error(np.log1p(self.y_train), oof_preds) ** .5
    strlog = 'Full RMSE score {:.6f}'.format(full_rmse)
    print(strlog)
    self.logfile.write(strlog + '\n')

    preds = preds_test.mean(axis=0)
    if submission:
        # Write a standalone frame instead of adding a prediction column
        # to self.x_test, which would leave the feature frame modified.
        submission_df = pd.DataFrame({'PredictedLogRevenue': preds},
                                     index=self.x_test.index)
        submission_df.to_csv(
            '../output/submission/{}.csv'.format(self.name), index=True)

    cols = self.feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:20].index
    self.logfile.write('top features:\n')
    for c in cols:
        self.logfile.write('{}\n'.format(c))
    self.logfile.flush()

    self.display_importances(self.feature_importance_df, self.name)

    # Saved for stacking.
    np.save('../output/log/{}_oof_train'.format(self.name), oof_preds)
    np.save('../output/log/{}_oof_test'.format(self.name), preds)

    return self.feature_importance_df, full_rmse, oof_preds, preds
def __init__(self,
             name,
             clf,
             comment=None,
             remove_columns=None,
             param=None,
             lgb_seed=None,
             n_estimators=1000,
             log=None,
             predict_feats=False,
             debug=True,
             BASE_X_PATH=None,
             TRN_PRED_FEATS=None,
             TES_PRED_FEATS=None,
             nfolds=5):
    """Load the data, build the estimator and prepare train/test frames.

    Args:
        name: run name; used for the log file name when ``log`` is None.
        clf: estimator class (not instance); called as ``clf(**param)``.
        comment: optional free text stored on the instance.
        remove_columns: optional collection of column names to drop from
            the loaded frame.
        param: estimator keyword arguments; a default parameter set is
            used when omitted.  The mapping is copied, so the caller's
            dict is never modified.
        lgb_seed: when given, stored in the parameters as ``seed``.
        n_estimators: always written into the parameters, overriding any
            ``n_estimators`` already present.
        log: optional log file base name overriding ``name``.
        predict_feats: when true, ``add_pred_feats`` is applied to the
            sampled frames and the target is summed per ``fullVisitorId``.
            When false, the frames returned by ``prep_and_split`` are kept
            in full with their index reset; the sampled frames are not
            used in that case.
        debug: when true, the sample drawn is 30 % of the rows.
        BASE_X_PATH: feather file holding the base frame.  The rest of the
            constructor reads ``self.x``, which is only assigned when this
            is given.
        TRN_PRED_FEATS, TES_PRED_FEATS: ``.npy`` files with prediction
            features; read only when given, used when ``predict_feats``.
        nfolds: number of folds stored in ``self.folds``.
    """
    self.name = name
    self.comment = comment

    # NOTE: the handle stays open for the lifetime of the object; nothing
    # in this method closes it.
    log_name = name if log is None else log
    self.logfile = open('../output/log/{}.txt'.format(log_name), 'w')

    if param is None:
        self.param = {
            'objective': 'regression',
            'metric': 'rmse',
            'num_leaves': 32,
            'learning_rate': 0.03,
            'colsample_bytree': 0.9,
            'n_estimators': 1000,
            'boosting_type': 'goss'
        }
    else:
        # Copy so the seed / n_estimators writes below do not leak into a
        # dict the caller still owns.
        self.param = dict(param)
    if lgb_seed is not None:
        self.param['seed'] = lgb_seed
    self.param['n_estimators'] = n_estimators

    self.feature_importance_df = None
    self.Tunparams = None

    if BASE_X_PATH is not None:
        self.x = feather.read_dataframe(BASE_X_PATH)
    if TRN_PRED_FEATS is not None:
        self.trn_preds_feats = np.load(TRN_PRED_FEATS)
    if TES_PRED_FEATS is not None:
        self.tes_preds_feats = np.load(TES_PRED_FEATS)

    if remove_columns is not None:
        drop_features = [
            _f for _f in self.x.columns if _f in remove_columns
        ]
        self.x.drop(drop_features, axis=1, inplace=True)
        del drop_features
        gc.collect()

    self.clf = clf(**self.param)

    print('read & prepare datasets shape: {}'.format(self.x.shape))

    # Split into train and test sets.
    self.x_train, self.y_train, self.x_test, self.y_train_ag = prep_and_split(
        self.x)

    # Debug mode draws a 30 % sample; otherwise all rows are drawn
    # (frac=1 still goes through ``sample``, as in the original code).
    frac = 0.3 if debug else 1
    x_train_s = self.x_train.sample(frac=frac)
    x_test_s = self.x_test.sample(frac=frac)
    y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
        x_train_s.index)]

    if predict_feats:
        self.x_train, trn_feats = add_pred_feats(x_train_s,
                                                 self.trn_preds_feats,
                                                 None)
        self.x_test, _ = add_pred_feats(x_test_s, self.tes_preds_feats,
                                        trn_feats)
        self.y_train = y_train_s.groupby('fullVisitorId').sum()
        del x_train_s, x_test_s, y_train_s
        gc.collect()
    else:
        self.x_train.reset_index(drop=True, inplace=True)
        self.x_test.reset_index(drop=True, inplace=True)

    self.folds = data_prepare.get_folds(
        df=self.x_train[['totals.pageviews']].reset_index(),
        n_splits=nfolds)