def model_function(
    dataset,
    location_array_W,
    pred_ahead,
    target_col,
    extended_data=True,
    impute_missing=True,
    do_extract=True,
    shift_features=True,
    use_early_stopping=False,
    lgb_boosting_type="gbdt",
    lgb_num_leaves=31,
    lgb_learning_rate=0.1,
    lgb_max_depth=-1,
    split_train_pct=.7,
    split_test_pct=.85,
):
    """
    This is the main model function that prepares a dataset based on some hyperparams and trains a lightgbm model.
    It returns both the predicted validation and test dataframe in a tuple.

    :param dataset: name of the dataset, based on file names in data/kaggle-preprocessed/xxx.feather
    :param location_array_W: location array W
    :param pred_ahead: how far ahead the target variable should be shifted
    :param target_col: the name of the target column in the dataframe
    :param extended_data: (bool) whether to extend the features with the extended dataset
    :param impute_missing: (bool) impute missing features or leave them empty
    :param do_extract: (bool) use the original features or use the dimension reduction
    :param shift_features: (bool) enhance the data with shifted features
    :param use_early_stopping: (bool) whether to let the final model be the tree with the lowest error on the validation dataset
    :param lgb_boosting_type: lgb hyperparam
    :param lgb_num_leaves: lgb hyperparam
    :param lgb_learning_rate: lgb hyperparam
    :param lgb_max_depth: lgb hyperparam
    :param split_train_pct: percentage of the data where to split the train data
    :param split_test_pct: percentage of the data where to split the test data
    :return: (predicted_df_validation, predicted_df_test)
    """
    df = gather_df(dataset, extended_data)
    df = prepare_df(df, impute_missing, do_extract, shift_features, location_array_W)
    y = df[target_col].pct_change(pred_ahead).shift(-pred_ahead)

    # splitting here is some hassle since we only want our training data to consist of days
    # where the target is non-null; some of the datasets have years of missing target variables
    available_indexes = df.index[~pd.isna(y)]
    train_split = available_indexes[int(len(available_indexes) * split_train_pct)]
    val_split = available_indexes[int(len(available_indexes) * split_test_pct)]
    # add pred_ahead here because at time t, we do not know what the target variable will do in the future
    test_split_start = available_indexes[
        int(len(available_indexes) * split_test_pct) + pred_ahead
    ]

    X_train = df[(df.index <= train_split)]
    X_val = df[(df.index > train_split) & (df.index < val_split)]
    X_test = df[(df.index >= test_split_start)]

    y_train = y[(df.index <= train_split)]
    # log transform y_train to reduce long tail effects and np.clip just to be safe
    y_train = np.log(np.clip(y_train, -1 + 1e-6, 100) + 1)
    y_val = y[(df.index > train_split) & (df.index < val_split)]
    y_val = np.log(np.clip(y_val, -1 + 1e-6, 100) + 1)
    y_test = y[(df.index >= test_split_start)]

    # for training, filter out all days with missing targets
    X_train = X_train[~pd.isna(y_train)]
    y_train = y_train[~pd.isna(y_train)]

    rmodel = lgb.LGBMRegressor(
        boosting_type=lgb_boosting_type,
        num_leaves=lgb_num_leaves,
        learning_rate=lgb_learning_rate,
        max_depth=lgb_max_depth,
    )
    eval_set = None
    if use_early_stopping:
        eval_set = (X_val, y_val)
    rmodel.fit(X_train, y_train, eval_set=eval_set, verbose=0)

    dfp_test = pd.DataFrame(
        {
            "p": np.exp(np.clip(rmodel.predict(X_test), -1000, 1000)) - 1,
            "y": y_test,
            # use >= so the index lines up with X_test / y_test
            "original": df[target_col][(df.index >= test_split_start)],
        }
    )
    # since we used percentages as targets, we need to reverse this to be able to compute objective scores
    dfp_test["y"] = (dfp_test.y + 1) * dfp_test.original
    dfp_test["p"] = (dfp_test.p + 1) * dfp_test.original
    dfp_test = dfp_test[~pd.isna(dfp_test.y)]

    dfp_val = pd.DataFrame(
        {
            "p": np.exp(np.clip(rmodel.predict(X_val), -1000, 1000)) - 1,
            "y": y_val,
            "original": df[target_col][
                (df.index > train_split) & (df.index < val_split)
            ],
        }
    )
    dfp_val["y"] = (dfp_val.y + 1) * dfp_val.original
    dfp_val["p"] = (dfp_val.p + 1) * dfp_val.original
    dfp_val = dfp_val[~pd.isna(dfp_val.y)]

    return dfp_val, dfp_test, rmodel, X_test, y_test
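# A minimal usage sketch for model_function, assuming gather_df / prepare_df exist
# and that "sp500" and W are placeholders for a real dataset name and location array.
import numpy as np

dfp_val, dfp_test, rmodel, X_test, y_test = model_function(
    dataset="sp500",          # hypothetical file in data/kaggle-preprocessed/
    location_array_W=W,       # hypothetical precomputed location array
    pred_ahead=5,             # predict the 5-step-ahead percentage change
    target_col="close",       # hypothetical target column
    use_early_stopping=True,
)
print(np.sqrt(((dfp_test.p - dfp_test.y) ** 2).mean()))  # test RMSE on the original scale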
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Feature importance
# lightGBM model fit
gbm = lgb.LGBMRegressor()
gbm.fit(train, target)
gbm.booster_.feature_importance()  # importance of each attribute

fea_imp_ = pd.DataFrame({'cols': train.columns, 'fea_imp': gbm.feature_importances_})
fea_imp_.loc[fea_imp_.fea_imp > 0].sort_values(by=['fea_imp'], ascending=False)

TRAINING_SIZE = 300000
# get this month's indices
trn_idx = np.where(np.isin(train.month, validation_months, invert=True))[0]
val_idx = np.where(np.isin(train.month, validation_months, invert=False))[0]
# print(f"split primary_use: train size {len(trn_idx)} val size {len(val_idx)}")

# remove indices not in this primary_use
trn_idx = np.intersect1d(trn_idx, np.where(np.isin(train.primary_use, primary_use_group))[0])
val_idx = np.intersect1d(val_idx, np.where(np.isin(train.primary_use, primary_use_group))[0])
# print(f"split primary_use: train size {len(trn_idx)} val size {len(val_idx)}")

# initialize model
model = lgb.LGBMRegressor(random_state=seed + 9999 * args.normalize_target,
                          n_estimators=9999,
                          learning_rate=args.lr,
                          feature_fraction=args.feature_fraction,
                          subsample=args.subsample,
                          num_leaves=args.n_leaves,
                          metric="rmse",
                          silent=False)

# fit model
msg = f'Training {full_sub_model_name} - train# {len(trn_idx)} val# {len(val_idx)}'
# print(f'{datetime.now()} - Training {full_sub_model_name} - train# {len(trn_idx)} val# {len(val_idx)}')
with timer(msg):
    model.fit(train.loc[trn_idx, FEATURES], train.loc[trn_idx, "target"],
              eval_set=[(train.loc[val_idx, FEATURES], train.loc[val_idx, "target"])],
              early_stopping_rounds=50,
              verbose=50)
model.booster_.save_model(full_sub_model_name)
def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae',
                           columns=None, plot_feature_importance=False, model=None,
                           verbose=10000, early_stopping_rounds=200, n_estimators=50000,
                           mol_type=-1, fold_group=None):
    """
    A function to train a variety of regression models.
    Returns a dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups=fold_group)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            # plain positional indexing for arrays (column selection only applies to DataFrames)
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            params["objective"] = "reg:linear"
            params["eval_metric"] = metrics_dict[eval_metric]['lgb_metric_name']
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist,
                              early_stopping_rounds=200, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000,
                                      eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                      **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
        model_list += [model]

    prediction /= folds.n_splits
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)
        pass

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')
            feature_importance.to_csv(log_path / f"importance_{mol_type}.csv")

            result_dict['feature_importance'] = feature_importance

    return result_dict
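# A minimal usage sketch for train_model_regression, assuming X / X_test / y are
# prepared pandas objects with a 'molecule_name' column for grouping; the params
# are illustrative, and DATA_VERSION / TRIAL_NO / send_message failures are
# swallowed by the function's try/except.
import numpy as np
from sklearn.model_selection import GroupKFold

folds = GroupKFold(n_splits=5)
lgb_params = {'objective': 'regression', 'num_leaves': 128, 'learning_rate': 0.1}
feat_cols = [c for c in X.columns if c != 'molecule_name']
result = train_model_regression(X, X_test, y, params=lgb_params, folds=folds,
                                model_type='lgb', eval_metric='mae',
                                columns=feat_cols, fold_group=X['molecule_name'])
print(np.mean(result['scores']))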
# (fragment: these are the trailing arguments of a regressor constructor whose opening is truncated in the source)
                             learning_rate=0.05,
                             max_depth=3,
                             min_child_weight=1.7817,
                             n_estimators=1000,
                             reg_alpha=0.4640,
                             reg_lambda=0.8571,
                             subsample=0.5213,
                             silent=1,
                             random_state=7,
                             nthread=-1)

model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin=55, bagging_fraction=0.8,
                              bagging_freq=5, feature_fraction=0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf=6, min_sum_hessian_in_leaf=11)

n_folds = 5
score = rmsle_cv(lasso, train, y_train, n_folds=n_folds)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(ENet, train, y_train, n_folds=n_folds)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(KRR, train, y_train, n_folds=n_folds)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(GBoost, train, y_train, n_folds=n_folds)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
def merf(normalise=False):
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['l1', 'rmse'],
        'learning_rate': 0.001,
        'feature_fraction': 0.8,
        "max_depth": 6,
        "max_bin": 512,
        "num_leaves": 40,
        "num_iterations": 100000,
        "n_estimators": 300,
        "verbose": -1
    }
    # 'bagging_fraction': 0.7, 'bagging_freq': 10, "num_leaves": 12,

    gbm = lgb.LGBMRegressor(**hyper_params)

    ap2 = ap.fillna(method="pad")
    ap2.isna().sum().sum()
    X_train, Y_train, X_test, Y_test = preprocessing(ap2, hour2int=True, onehotencode=False)

    Z_train = np.ones((len(X_train), 1))
    clusters_train = X_train['hours']
    clusters_test = X_test['hours']
    X_train1 = X_train.drop(["hours"], axis=1)
    X_test1 = X_test.drop(["hours"], axis=1)

    if normalise:
        X_train1 = (X_train1 - X_train1.mean()) / X_train1.std()
        X_test1 = (X_test1 - X_test1.mean()) / X_test1.std()
        # we should not normalise the Y (response)
        # Y_train1 = (Y_train - Y_train.mean()) / Y_train.std()

    # my_imputer = SimpleImputer()
    # X_train1 = my_imputer.fit_transform(X_train1)  # fit missing
    # X_test1 = my_imputer.fit_transform(X_test1)

    # normalising for boosting is commonly not necessary, but for the mixed effect models
    # we actually may want to normalise. But we only normalise X (predictors)!

    # check for missing values
    print(Y_train.isnull().any().any(), X_train1.isnull().any().any(), X_test.isnull().any().any())

    merf = MERF(gbm, max_iterations=4)
    merf.fit(X_train1, Z_train, clusters_train, Y_train)

    Z_test = np.ones((len(X_test1), 1))
    y_pred = merf.predict(X_test1, Z_test, clusters_test)
    # normalising the response means the prediction would need to be rescaled back:
    # if normalise:
    #     y_pred = y_pred * Y_train.std() + Y_train.mean()

    mae = abs(y_pred - Y_test).mean()
    rmse = math.sqrt(((y_pred - Y_test) * (y_pred - Y_test)).mean())
    rrmse = rmse / Y_test.median()
    r2 = get_r2_numpy_corrcoef(Y_test, y_pred)
    return (mae, rmse, rrmse, r2)
# (fragment: the left-hand side of this assignment is truncated in the source)
             bond_type]['molecule_name']
oof = np.zeros(len(X_type))
prediction_type = np.zeros(len(X_test_type))
bond_scores = []

for fold_n, (train_idx, valid_idx) in enumerate(folds.split(X_type, groups=mol_group_type)):
    if fold_n == 1:
        # ONLY TRAIN FOR FOLD 0
        continue
    fold_start = timer()
    logger.info('Running Type {} - Fold {} of {}'.format(bond_type, fold_count, folds.n_splits))
    X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx]
    y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx]
    model = lgb.LGBMRegressor(**lgb_params, n_estimators=N_ESTIMATORS, n_jobs=N_THREADS)
    model.fit(X_train.drop('type', axis=1), y_train,
              eval_set=[
                  # (X_train.drop('type', axis=1), y_train),
                  (X_valid.drop('type', axis=1), y_valid)
              ],
              eval_metric=EVAL_METRIC,
              verbose=VERBOSE,
              early_stopping_rounds=EARLY_STOPPING_ROUNDS)
    now = timer()
    update_tracking(run_id, '{}_tr_sec_f{}'.format(bond_type, fold_n + 1),
                    (now - fold_start), integer=True)
def fit_lgb(i, X_train_cv, y_train_cv):
    model = lgb.LGBMRegressor(**lgb_param)
    model.fit(X_train_cv, y_train_cv)
    return model
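# A minimal usage sketch for fit_lgb; lgb_param is read from scope by the
# function, and the synthetic data below is illustrative only.
import numpy as np

lgb_param = {'num_leaves': 31, 'learning_rate': 0.1, 'n_estimators': 100}
X_demo = np.random.rand(200, 5)
y_demo = X_demo.sum(axis=1)
models = [fit_lgb(i, X_demo, y_demo) for i in range(3)]  # e.g. one model per CV repeat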
'''
# LGBR = GridSearchCV(model_lgb, lgb_params, cv=3, scoring='roc_auc', verbose=5, return_train_score=True)
# LGBR.fit(train_data, y_train)
# LGB_best = LGBR.best_estimator_
# print(LGB_best)
# print(LGBR.best_score_)

model_lgb = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression',
                              learning_rate=0.01, n_estimators=3000,
                              reg_alpha=0.1, reg_lambda=0.01,
                              max_bin=200, num_leaves=150, max_depth=-1,
                              subsample_freq=5,
                              colsample_bytree=0.6, subsample=0.8,
                              min_child_samples=50, min_split_gain=0,
                              random_state=1024, n_jobs=-1)  # lgb_1

auc = rmsle_cv(model_lgb, train_data, y_train)
print('model_lgb AUC : ', auc)

model_lgb.fit(train_data, y_train)
test_y_prob = model_lgb.predict(test_data)
test_y_prob = scale(test_y_prob)
test_y_cat = [int(item > 0.25) for item in list(test_y_prob)]
test_result = pd.DataFrame(EID.values, columns=["EID"])
test_result['FORTARGET'] = test_y_cat
def gbm_model():
    model_gbm = gbm.LGBMRegressor()
    return model_gbm
# In[8]:

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, mutual_info_regression

pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('regressor', lgb.LGBMRegressor(
        objective='regression',
        num_leaves=31,
        learning_rate=0.01,
        silent=False
    ))
])

N_FEATURES_OPTIONS = [50, 100, 300]

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regressor__boosting_type': ['gbdt', 'dart'],  # 'goss', 'rf'],
        'regressor__n_estimators': [50, 100, 500]
    },
    {
        'reduce_dim': [SelectKBest(mutual_info_regression)],
        # (the remainder of this grid entry is truncated in the source)
def light_gbm_regression(self):
    model = lgb.LGBMRegressor()
    return self.fiting_model(model)
"bagging_fraction": 0.75, "bagging_seed": 11, "metric": 'mae', "verbosity": -1, 'reg_alpha': 0.1302650970728192, 'reg_lambda': 0.3603427518866501 } for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train_scaled)): print(fold_n) X_train, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[ valid_index] y_train, y_valid = y_tr.iloc[train_index], y_tr.iloc[valid_index] model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1) model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae', verbose=10000, early_stopping_rounds=200) y_pred_valid = model.predict(X_valid) y_pred = model.predict(X_test, num_iteration=model.best_iteration_) oof[valid_index] = y_pred_valid scores.extend(mean_absolute_error(y_valid.values.reshape(-1), y_pred_valid)) prediction += y_pred prediction /= n_fold
df, mean, std = lib.normalize(df)
(x_train_lstm, y_train_lstm), (x_test_lstm, y_test_lstm), columns = lib.train_test_split_lstm(
    df["price"].values, df.index, PAST_HISTORY, TRAIN_RATIO)
(x_train, y_train), (x_test, y_test), columns = lib.train_test_split(
    df["price"].values, df.index, PAST_HISTORY, TRAIN_RATIO)

# define the models
lstm = create_model(x_train_lstm.shape[-2:])
rfr = RandomForestRegressor(max_depth=5, random_state=RANDOM_STATE, n_estimators=100)
xgb = xgboost.XGBRegressor(n_estimators=100, random_state=RANDOM_STATE)
lgbm = lightgbm.LGBMRegressor(n_estimators=100, random_state=RANDOM_STATE)
svm = SVR()

# train the models
# lstm.fit(x_train_lstm, y_train_lstm, batch_size=BATCH_SIZE, epochs=EPOCHS,
#          verbose=1, validation_data=(x_test_lstm, y_test_lstm))
rfr.fit(x_train, y_train)
xgb.fit(x_train, y_train)
lgbm.fit(x_train, y_train)
svm.fit(x_train, y_train)

# predict with the models
# y_pred_lstm = lstm.predict(x_test_lstm)
y_pred_rfr = rfr.predict(x_test)
y_pred_xgb = xgb.predict(x_test)
y_pred_lgbm = lgbm.predict(x_test)
y_pred_svm = svm.predict(x_test)
def hold_out_lgb_validation(X, y, params, eval_metric='mae', columns=None,
                            plot_feature_importance=False, verbose=10000,
                            early_stopping_rounds=200, n_estimators=50000):
    columns = X.columns if columns is None else columns

    # to set up scoring parameters
    metrics_dict = {
        'mae': {'lgb_metric_name': 'mae',
                'catboost_metric_name': 'MAE',
                'sklearn_scoring_function': metrics.mean_absolute_error},
        'group_mae': {'lgb_metric_name': 'mae',
                      'catboost_metric_name': 'MAE',
                      'scoring_function': group_mean_log_mae},
        'mse': {'lgb_metric_name': 'mse',
                'catboost_metric_name': 'MSE',
                'sklearn_scoring_function': metrics.mean_squared_error}
    }

    result_dict = {}
    X_train, X_valid, y_train, y_valid = train_test_split(X[columns], y, test_size=0.1, random_state=42)

    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model: lgb.LGBMRegressor = lgb.LGBMRegressor(**params, n_estimators=n_estimators,
                                                 n_jobs=-1, importance_type='gain')
    print(model)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose, early_stopping_rounds=early_stopping_rounds,
              callbacks=callbacks)
    y_pred_valid = model.predict(X_valid)

    if eval_metric != 'group_mae':
        score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
    else:
        score = metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type'])

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + f' HOLD_OUT score: {score:.4f} .'
        print(cv_score_msg)
        if not DEBUG and LINE_MSG:
            send_message(cv_score_msg)
    except Exception as e:
        print(e)
        pass

    result_dict["model"] = model
    result_dict['y_pred_valid'] = pd.DataFrame(y_pred_valid, index=X_valid.index,
                                               columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance
    result_dict["eval_result"] = eval_result
    result_dict["best_iteration"] = model.best_iteration_
    return result_dict
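# A minimal usage sketch for hold_out_lgb_validation, assuming X / y are prepared
# and the DATA_VERSION / TRIAL_NO / DEBUG / LINE_MSG globals exist (the try/except
# swallows the messaging step if they do not); params are illustrative, not tuned.
params = {'objective': 'regression', 'learning_rate': 0.1, 'num_leaves': 128}
res = hold_out_lgb_validation(X, y, params, eval_metric='mae',
                              plot_feature_importance=True, verbose=500)
print(res['score'], res['best_iteration'])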
"rougher.input.floatbank11_copper_sulfate", "rougher.input.feed_sol", "rougher.input.feed_size" ]) # # COLS_TO_DIFF_TOP20 = [ # # ] level0_models_rougher = {} obj = 'mae' level0_models_rougher['LGBM_rougher_base_a'] = lgb.LGBMRegressor( objective=obj, learning_rate=0.05, n_estimators=500, random_state=91, **{ 'max_depth': 5, 'num_leaves': 100, 'feature_fraction': '0.363', 'bagging_fraction': '0.262' }) level0_models_rougher['LGBM_rougher_base_b'] = lgb.LGBMRegressor( objective=obj, learning_rate=0.05, n_estimators=500, random_state=92, **{ 'max_depth': 4, 'num_leaves': 110, 'feature_fraction': '0.448', 'bagging_fraction': '0.445'
# sklearn.cross_validation was removed; use the model_selection equivalent
from sklearn.model_selection import StratifiedKFold

seed_ls = []
# 5-fold cross-validation training: build five models
skf = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=1024).split(X, y))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',
                              num_leaves=125,
                              learning_rate=0.05,
                              n_estimators=2500,
                              boosting_type="gbdt",
                              max_depth=-1,
                              seed=2018,
                              num_thread=-1,
                              max_bin=425,
                              bagging_fraction=0.8,
                              colsample_bytree=0.9,
                              subsample=0.8,
                              lambda_l2=0.20)
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names=['train', 'valid'],
                          eval_metric='rmse',
                          eval_set=[(X[train_index], y[train_index]),
                                    (X[test_index], y[test_index])],
                          early_stopping_rounds=100)
def model(): """ 处理过程,在示例中,使用随机方法生成结果,并将结果文件存储到预测结果路径下。 :return: """ # import xgboost as xgb # dtrain=xgb.DMatrix(X_train, label=y_train) # dtest=xgb.DMatrix(X_test, label=y_test) # dval = xgb.DMatrix(X_val) # param = {'learning_rate' : 0.1, 'n_estimators': 1000, 'max_depth': 3, # 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8, # 'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'reg:linear'} # num_round = 283 # param['nthread'] = 4 # param['eval_metric'] = "auc" # param.update({'eval_metric': 'logloss'}) # plst = param.items() # evallist = [(dtest, 'eval'), (dtrain, 'train')] # xgbr=xgb.train(plst, dtrain, num_round, evallist) # from sklearn.model_selection import GridSearchCV ## import xgboost as xgb # from xgboost.sklearn import XGBRegressor # # cv_params = { 'max_depth':list(range(10,2,-1)),'min_child_weight':list(range(6,1,-1)} # other_params = {'learning_rate': 0.1, 'seed': 0, 'n_estimators': 500,'subsample': 0.8, # 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1} # # model = XGBRegressor(**other_params) # xgbr = GridSearchCV(estimator=model, param_grid=cv_params, cv=5, verbose=1, n_jobs=4) # xgbr.fit(X_train, y_train) # print('每轮迭代运行结果:{0}'.format(xgbr.grid_scores_)) # print('参数的最佳取值:{0}'.format(xgbr.best_params_)) # print('最佳模型得分:{0}'.format(xgbr.best_score_)) # from sklearn.ensemble import RandomForestRegressor # xgbr = RandomForestRegressor() # xgbr.fit(X_train, y_train) from xgboost import XGBRegressor xgbr = XGBRegressor(max_depth=4) print(xgbr) xgbr.fit(X_train, y_train) import lightgbm as lgb lgbr = lgb.LGBMRegressor(max_depth=6) print(lgbr) lgbr.fit(X_train, y_train) # xgbr = XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0, # learning_rate=0.1, max_delta_step=0, max_depth=5, # min_child_weight=1, missing=None, n_estimators=100, nthread=-1, # objective='reg:linear', reg_alpha=0, reg_lambda=1, # scale_pos_weight=1, seed=0, silent=True, subsample=1) # from xgboost import plot_importance # from matplotlib import pyplot # plot_importance(xgbr,importance_type = 'cover') # pyplot.show() # from sklearn import preprocessing # Pred = preprocessing.scale(xgbr.predict(X_val)) # Pred = xgbr.predict(X_val) # Pred=(xgbr.predict(X_val)-xgbr.predict(X_val).min())/((xgbr.predict(X_val).max()-xgbr.predict(X_val).min())) # prep_1=np.log(xgbr.predict(X_val)) Id_pred = pd.DataFrame() Id_pred['Id'] = X_val_df.index Id_pred['pred_1'] = xgbr.predict(X_val) Id_pred['pred_2'] = lgbr.predict(X_val) # Id_pred['Pred']=prep_1 Id_pred['Pred'] = 0.6 * Id_pred['pred_1'].rank( ) + 0.4 * Id_pred['pred_2'].rank() del Id_pred['pred_1'], Id_pred['pred_2'] Id_pred.to_csv(path_test_out + "test.csv", index=None) # print(Id_pred['Pred'])#.value_counts().sort_values() from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error print('The score is:', xgbr.score(X_test, y_test)) print('The r2_score is:', r2_score(y_test, xgbr.predict(X_test))) print('The mean_squared_error is:', mean_squared_error(y_test, xgbr.predict(X_test))) print('The mean_absolute_error is:', mean_absolute_error(y_test, xgbr.predict(X_test)))
def to_local(self):
    model = lightgbm.LGBMRegressor(**self.get_params())
    self._copy_extra_params(self, model)
    return model
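# A minimal usage sketch for to_local, assuming the surrounding class is a
# distributed LightGBM wrapper (dask-lightgbm style); dist_model and X_small
# are hypothetical names.
local_model = dist_model.to_local()     # a plain lightgbm.LGBMRegressor with the same params
y_local = local_model.predict(X_small)  # predict without the distributed backend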
def identify_zero_importance(self, task, eval_metric=None, n_iterations=10, early_stopping=True):
    """
    Identify the features with zero importance according to a gradient boosting machine.
    The gbm can be trained with early stopping using a validation set to prevent overfitting.
    The feature importances are averaged over `n_iterations` to reduce variance.

    Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html)

    Parameters
    --------
    eval_metric : string
        Evaluation metric to use for the gradient boosting machine for early stopping.
        Must be provided if `early_stopping` is True

    task : string
        The machine learning task, either 'classification' or 'regression'

    n_iterations : int, default = 10
        Number of iterations to train the gradient boosting machine

    early_stopping : boolean, default = True
        Whether or not to use early stopping with a validation set when training

    Notes
    --------
    - Features are one-hot encoded to handle the categorical variables before training.
    - The gbm is not optimized for any particular task and might need some hyperparameter tuning
    - Feature importances, including zero importance features, can change across runs
    """
    if early_stopping and eval_metric is None:
        raise ValueError("""eval metric must be provided with early stopping. Examples include "auc" for classification or "l2" for regression.""")

    if self.labels is None:
        raise ValueError("No training labels provided.")

    # One hot encoding
    features = pd.get_dummies(self.data)
    self.one_hot_features = [column for column in features.columns
                             if column not in self.base_features]

    # Add one hot encoded data to original data
    self.data_all = pd.concat([features[self.one_hot_features], self.data], axis=1)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np array
    features = np.array(features)
    labels = np.array(self.labels).reshape((-1, ))

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    print('Training Gradient Boosting Model\n')

    # Iterate through each fold
    for _ in range(n_iterations):
        if task == 'classification':
            model = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, verbose=-1)
        elif task == 'regression':
            model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, verbose=-1)
        else:
            raise ValueError('Task must be either "classification" or "regression"')

        # If training using early stopping, we need a validation set
        if early_stopping:
            train_features, valid_features, train_labels, valid_labels = train_test_split(
                features, labels, test_size=0.15, stratify=labels)

            # Train the model with early stopping
            model.fit(train_features, train_labels, eval_metric=eval_metric,
                      eval_set=[(valid_features, valid_labels)],
                      early_stopping_rounds=100, verbose=-1)

            # Clean up memory
            gc.enable()
            del train_features, train_labels, valid_features, valid_labels
            gc.collect()
        else:
            model.fit(features, labels)

        # Record the feature importances
        feature_importance_values += model.feature_importances_ / n_iterations

    feature_importances = pd.DataFrame({'feature': feature_names,
                                        'importance': feature_importance_values})

    # Sort features according to importance
    feature_importances = feature_importances.sort_values('importance',
                                                          ascending=False).reset_index(drop=True)

    # Normalize the feature importances to add up to one
    feature_importances['normalized_importance'] = feature_importances['importance'] / feature_importances['importance'].sum()
    feature_importances['cumulative_importance'] = np.cumsum(feature_importances['normalized_importance'])

    # Extract the features with zero importance
    record_zero_importance = feature_importances[feature_importances['importance'] == 0.0]

    to_drop = list(record_zero_importance['feature'])

    self.feature_importances = feature_importances
    self.record_zero_importance = record_zero_importance
    self.ops['zero_importance'] = to_drop

    print('\n%d features with zero importance after one-hot encoding.\n' % len(self.ops['zero_importance']))
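# A minimal usage sketch, assuming this method lives on a FeatureSelector-style
# class constructed with data and labels (as in the feature-selector library);
# train_df is a hypothetical frame with a 'target' column.
fs = FeatureSelector(data=train_df.drop(columns=['target']), labels=train_df['target'])
fs.identify_zero_importance(task='regression', eval_metric='l2',
                            n_iterations=5, early_stopping=True)
print(fs.ops['zero_importance'][:10])  # first few zero-importance features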
    # (fragment: this line continues a truncated expression from earlier in the source)
        np.where(rmse_oob_all == np.min(rmse_oob_all))[0][0]]
    regression_model = RandomForestRegressor(
        n_estimators=random_forest_number_of_trees,
        max_features=int(max(math.ceil(autoscaled_x_train.shape[1] * optimal_random_forest_x_variables_rate), 1)),
        oob_score=True)
elif method == 'gp':  # Gaussian process
    regression_model = GaussianProcessRegressor(ConstantKernel() * RBF() + WhiteKernel(), alpha=0)
elif method == 'lgb':  # LightGBM
    import lightgbm as lgb
    regression_model = lgb.LGBMRegressor()
elif method == 'xgb':  # XGBoost
    import xgboost as xgb
    regression_model = xgb.XGBRegressor()
elif method == 'gbdt':  # scikit-learn
    from sklearn.ensemble import GradientBoostingRegressor
    regression_model = GradientBoostingRegressor()

regression_model.fit(autoscaled_x_train, autoscaled_y_train)

# calculate y
calculated_ytrain = np.ndarray.flatten(regression_model.predict(autoscaled_x_train))
if do_autoscaling:
    calculated_ytrain = calculated_ytrain * y_train.std(
def train_lgb_regression_alldata(X, X_test, y, params, eval_metric='mae', columns=None,
                                 plot_feature_importance=False, model=None,
                                 verbose=10000, n_estimators=50000, mol_type=-1):
    """
    A function to train a variety of regression models.
    Returns a dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    X_train, y_train = X[columns], y

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose)

    result_dict['prediction'] = model.predict(X_test)

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
        result_dict['feature_importance'] = feature_importance

    return result_dict
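# A minimal usage sketch for train_lgb_regression_alldata (no validation split:
# it trains on all rows and evaluates on the train set); params are illustrative,
# and submission is a hypothetical output frame.
params = {'objective': 'regression', 'learning_rate': 0.05, 'num_leaves': 64}
res = train_lgb_regression_alldata(X, X_test, y, params, eval_metric='mae',
                                   n_estimators=2000, plot_feature_importance=True)
submission['target'] = res['prediction']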
def make_regressor(iterations=6000, clf=None):
    if clf == 'cat':
        clf = CatBoostRegressor(
            loss_function='RMSE',
            # eval_metric="WKappa",
            task_type="CPU",
            # learning_rate=0.01,
            iterations=iterations,
            od_type="Iter",
            # depth=4,
            early_stopping_rounds=500,
            # l2_leaf_reg=10,
            # border_count=96,
            random_seed=SEED,
            # use_best_model=True
        )
    if clf == 'xgb':
        clf = xgb.XGBRegressor(
            n_estimators=5000,
            max_depth=10,
            min_child_weight=3,
            gamma=0.25,
            n_jobs=-1,
            # verbosity=3,
            random_state=SEED
        )
    if clf == 'lgb':
        clf = lgb.LGBMRegressor(
            learning_rate=0.01,
            n_estimators=2000,
            max_depth=15,
            # reg_alpha=1,
            # reg_lambda=1,
            random_state=SEED
        )
    if clf == 'ngb':
        clf = NGBRegressor(
            Dist=Normal,
            Score=MLE,
            Base=default_tree_learner,
            natural_gradient=True,
            n_estimators=2000,
            learning_rate=0.01,
            minibatch_frac=0.6,
            verbose=True,
            verbose_eval=50
        )
    if clf == 'ext':
        clf = ExtraTreesRegressor(
            # learning_rate=0.01,
            n_estimators=2000,
            max_depth=15,
            # reg_alpha=1,
            # reg_lambda=1,
            random_state=SEED,
            verbose=3,
            # n_jobs=-1
        )
    return clf
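# A minimal usage sketch for make_regressor; SEED is read from scope, and the
# training arrays are assumed to be prepared elsewhere.
SEED = 42
reg = make_regressor(clf='lgb')
reg.fit(X_train, y_train)
preds = reg.predict(X_test)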
def optimize(self):
    dataset = self._get_data()
    remove_columns = ['id', 'score']
    x_columns = [column for column in dataset.columns if column not in remove_columns]
    x_data = dataset[x_columns]
    y_data = dataset['score']

    """n_estimators: best: 239  best_score: 14.847823004441079"""
    # params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'mae',
    #     'learning_rate': 0.1,
    #     'num_leaves': 50,
    #     'max_depth': 6,
    #     'subsample': 0.8,
    #     'colsample_bytree': 0.8,
    # }
    # dtrain = lgb.Dataset(x_data, y_data)
    # cv_results = lgb.cv(params, dtrain, num_boost_round=1000, nfold=5, stratified=False, shuffle=True,
    #                     metrics='mae', early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=2018)
    # print('best n_estimators:', len(cv_results['l1-mean']))
    # print('best cv score:', cv_results['l1-mean'][-1])

    """max_depth: 6  num_leaves: 31  best_score: 14.803535507027162"""
    # params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'mae',
    #     'n_estimators': 239,
    #     'metric': 'mae',
    #     'learning_rate': 0.1,
    #     'num_leaves': 50,
    #     'max_depth': 6,
    #     'subsample': 0.8,
    #     'colsample_bytree': 0.8,
    # }
    # grid_params = {
    #     'max_depth': [6],
    #     'num_leaves': [28, 29, 30, 31, 32, 33, 34, 35]
    # }
    # gbm = lgb.LGBMRegressor(**params)
    # grid_search = GridSearchCV(gbm, param_grid=grid_params, scoring='neg_mean_absolute_error', cv=5, verbose=1,
    #                            n_jobs=5)
    # grid_search.fit(x_data, y_data)
    # print(f'best params: {grid_search.best_params_}')
    # print(f'best score: {grid_search.best_score_}')

    """min_child_samples: 43  min_child_weight: 0  best_score: 14.783911433202508"""
    # params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'mae',
    #     'n_estimators': 239,
    #     'metric': 'mae',
    #     'learning_rate': 0.1,
    #     'num_leaves': 31,
    #     'max_depth': 6,
    #     'subsample': 0.8,
    #     'colsample_bytree': 0.8,
    # }
    # grid_params = {
    #     'min_child_samples': [43],
    #     'min_child_weight': [0, 0.001]
    # }
    # gbm = lgb.LGBMRegressor(**params)
    # grid_search = GridSearchCV(gbm, param_grid=grid_params, scoring='neg_mean_absolute_error', cv=5, verbose=1,
    #                            n_jobs=5)
    # grid_search.fit(x_data, y_data)
    # print(f'best params: {grid_search.best_params_}')
    # print(f'best score: {grid_search.best_score_}')

    """subsample: 0.32  colsample_bytree: 0.36  best_score: 14.771928920921576"""
    # params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'mae',
    #     'n_estimators': 239,
    #     'metric': 'mae',
    #     'learning_rate': 0.1,
    #     'min_child_samples': 43,
    #     'min_child_weight': 0,
    #     'num_leaves': 65,
    #     'max_depth': 6,
    #     'subsample': 0.8,
    #     'colsample_bytree': 0.8,
    # }
    # grid_params = {
    #     'subsample': [0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5],
    #     'colsample_bytree': [0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5]
    # }
    # gbm = lgb.LGBMRegressor(**params)
    # grid_search = GridSearchCV(gbm, param_grid=grid_params, scoring='neg_mean_absolute_error', cv=5, verbose=1,
    #                            n_jobs=5)
    # grid_search.fit(x_data, y_data)
    # print(f'best params: {grid_search.best_params_}')
    # print(f'best score: {grid_search.best_score_}')

    """reg_alpha: 2  reg_lambda: 0.1  best_score: 14.75515862949816"""
    # params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'mae',
    #     'n_estimators': 239,
    #     'metric': 'mae',
    #     'learning_rate': 0.1,
    #     'min_child_samples': 43,
    #     'min_child_weight': 0,
    #     'num_leaves': 65,
    #     'max_depth': 6,
    #     'subsample': 0.32,
    #     'colsample_bytree': 0.36,
    # }
    # grid_params = {
    #     'reg_alpha': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 3],
    #     'reg_lambda': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 3],
    # }
    # gbm = lgb.LGBMRegressor(**params)
    # grid_search = GridSearchCV(gbm, param_grid=grid_params, scoring='neg_mean_absolute_error', cv=5, verbose=1,
    #                            n_jobs=5)
    # grid_search.fit(x_data, y_data)
    # print(f'best params: {grid_search.best_params_}')
    # print(f'best score: {grid_search.best_score_}')

    """learning_rate: 0.1  best_score: 14.778696016248404"""
    # params = {
    #     'boosting_type': 'gbdt',
    #     'objective': 'mae',
    #     'n_estimators': 239,
    #     'metric': 'mae',
    #     'learning_rate': 0.1,
    #     'min_child_samples': 43,
    #     'min_child_weight': 0,
    #     'num_leaves': 65,
    #     'max_depth': 6,
    #     'subsample': 0.32,
    #     'colsample_bytree': 0.36,
    #     'reg_alpha': 2,
    #     'reg_lambda': 0.1,
    # }
    # grid_params = {
    #     'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    # }
    # gbm = lgb.LGBMRegressor(**params)
    # grid_search = GridSearchCV(gbm, param_grid=grid_params, scoring='neg_mean_absolute_error', cv=5, verbose=1,
    #                            n_jobs=5)
    # grid_search.fit(x_data, y_data)
    # print(f'best params: {grid_search.best_params_}')
    # print(f'best score: {grid_search.best_score_}')

    params = {
        'boosting_type': 'gbdt',
        'objective': 'mae',
        'n_estimators': 10000,
        'metric': 'mae',
        'learning_rate': 0.01,
        'min_child_samples': 46,
        'min_child_weight': 0.01,
        'subsample_freq': 1,
        'num_leaves': 40,
        'max_depth': 7,
        'subsample': 0.42,
        'colsample_bytree': 0.48,
        'reg_alpha': 2,
        'reg_lambda': 0.1,
        'verbose': -1,
        'seed': 4590
    }
    grid_params = {
        'subsample': [0.45, 0.5, 0.55],
        'colsample_bytree': [0.85, 0.9, 0.95]
    }
    gbm = lgb.LGBMRegressor(**params)
    grid_search = GridSearchCV(gbm, param_grid=grid_params, scoring='neg_mean_absolute_error', cv=5, verbose=1,
                               n_jobs=5)
    grid_search.fit(x_data, y_data)
    print(f'best params: {grid_search.best_params_}')
    print(f'best score: {grid_search.best_score_}')
d_train = pd.concat([y_train, X_train], ignore_index=True, axis=1)
print("X_train={}, y_train={} d_train={}".format(X_train.shape, y_train.shape, d_train.shape))
np.savetxt("D:/LightGBM-master/examples/regression/geo_test.csv", d_train, delimiter='\t')

if model_type == 'mort':
    model = LiteMORT(params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    # y_pred_valid = model.predict(X_valid)
    # y_pred = model.predict(X_test)
if model_type == 'lgb':
    model = lgb.LGBMRegressor(**params, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='auc', verbose=5)
    model.booster_.save_model('geo_test_.model')
    # y_pred_valid = model.predict(X_valid)
    # y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
break  # (fragment: this break belongs to an enclosing loop truncated in the source)

input("loss is {} time={:.3g} model={}...".format(0, time.time() - t0, model_type))
sys.exit(-1)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
print('Feature importances:', list(gbm.feature_importances_))
def main(sum_of_logs=False, nrows=None):
    try:
        trn_users, trn_y, indexers = get_user_data(file_name=TRN_PATH, cat_indexers=None,
                                                   nrows=nrows, sum_of_logs=sum_of_logs)
        sub_users, _, _ = get_user_data(file_name=SUB_PATH, cat_indexers=indexers, nrows=nrows)

        folds = KFold(n_splits=5, shuffle=True, random_state=7956112)

        sub_preds = np.zeros(sub_users.shape[0])
        oof_preds = np.zeros(trn_users.shape[0])
        oof_scores = []
        lgb_params = {
            'learning_rate': 0.03,
            'n_estimators': 2000,
            'num_leaves': 128,
            'subsample': 0.2217,
            'colsample_bytree': 0.6810,
            'min_split_gain': np.power(10.0, -4.9380),
            'reg_alpha': np.power(10.0, -3.2454),
            'reg_lambda': np.power(10.0, -4.8571),
            'min_child_weight': np.power(10.0, 2),
            'silent': True
        }

        for fold_, (trn_, val_) in enumerate(folds.split(trn_users)):
            model = lgb.LGBMRegressor(**lgb_params)
            model.fit(trn_users.iloc[trn_], trn_y.iloc[trn_],
                      eval_set=[(trn_users.iloc[trn_], trn_y.iloc[trn_]),
                                (trn_users.iloc[val_], trn_y.iloc[val_])],
                      eval_metric='rmse',
                      early_stopping_rounds=100,
                      verbose=0)

            oof_preds[val_] = model.predict(trn_users.iloc[val_])
            curr_sub_preds = model.predict(sub_users)
            curr_sub_preds[curr_sub_preds < 0] = 0
            sub_preds += curr_sub_preds / folds.n_splits
            # preds[preds < 0] = 0

            logger.info('Fold %d RMSE (raw output) : %.5f'
                        % (fold_ + 1, rmse(trn_y.iloc[val_], oof_preds[val_])))
            oof_preds[oof_preds < 0] = 0
            oof_scores.append(rmse(trn_y.iloc[val_], oof_preds[val_]))
            logger.info('Fold %d RMSE : %.5f' % (fold_ + 1, oof_scores[-1]))

        logger.info('Full OOF RMSE (zero clipped): %.5f +/- %.5f'
                    % (rmse(trn_y, oof_preds), float(np.std(oof_scores))))

        # Stay in logs for submission
        sub_users['PredictedLogRevenue'] = sub_preds
        sub_users[['PredictedLogRevenue']].to_csv("simple_lgb.csv", index=True)
        logger.info('Submission data shape : {}'.format(sub_users[['PredictedLogRevenue']].shape))

        hist, bin_edges = np.histogram(np.hstack((oof_preds, sub_preds)), bins=25)
        plt.figure(figsize=(12, 7))
        plt.title('Distributions of OOF and TEST predictions', fontsize=15, fontweight='bold')
        plt.hist(oof_preds, label='OOF predictions', alpha=.6, bins=bin_edges, density=True, log=True)
        plt.hist(sub_preds, label='TEST predictions', alpha=.6, bins=bin_edges, density=True, log=True)
        plt.legend()
        plt.savefig('distributions.png')

    except Exception as err:
        logger.exception("Unexpected error")
def run_hyperopt(self, param_space, X_vars, model_params, fmin_max_evals,
                 algo='tpe', metric='balanced_accuracy_score',
                 trials_obj=None, model_type='indicator'):
    '''
    Function to run Bayesian or Random Search hyperparameter optimization
    '''
    # Builds the model object to conduct hyperparameter tuning on
    if model_type == 'indicator':
        hyperopt_model = lightgbm.LGBMModel(**model_params, importance_type='gain')
    elif model_type == 'regressor':
        hyperopt_model = lightgbm.LGBMRegressor(**model_params, importance_type='gain')

    eval_set = [(self.df_tune[X_vars], self.df_tune[self.target])]
    hyperopt_model.fit(X=self.df_train[X_vars], y=self.df_train[self.target],
                       eval_set=eval_set, verbose=False)
    data = self.df_tune

    def evaluate_metric(params):
        hyperopt_model.set_params(**params, bagging_freq=1).fit(
            X=self.df_train[X_vars], y=self.df_train[self.target],
            eval_set=eval_set, verbose=False)

        eval_x = data[X_vars]
        y_true = data[self.target]
        y_score = hyperopt_model.predict(eval_x)
        y_pred = [np.argmax(i) for i in y_score]

        if isinstance(metric, str):
            sk_scorer = getattr(metrics, metric, None)
            if sk_scorer is None:
                print(f"Specified metric {metric} does not exist in sklearn")
        score = sk_scorer(y_true=y_true, y_pred=y_pred)

        return {'loss': -score, 'params': params, 'status': STATUS_OK}

    if trials_obj is None:
        self.trials = Trials()
    else:
        self.trials = trials_obj

    if algo == 'tpe':
        algo = tpe.suggest
    elif algo == 'random':
        algo = rand.suggest

    best_params = fmin(evaluate_metric,
                       space=param_space,
                       algo=algo,
                       max_evals=fmin_max_evals,
                       rstate=np.random.RandomState(self.seed),
                       trials=self.trials)

    return best_params, self.trials
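# A minimal usage sketch for run_hyperopt, assuming hyperopt's hp space and an
# instance (tuner) whose df_train / df_tune / target / seed attributes are set;
# feature_cols is a hypothetical list of predictor column names.
from hyperopt import hp

space = {
    'num_leaves': hp.choice('num_leaves', [31, 63, 127]),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
}
best_params, trials = tuner.run_hyperopt(space, X_vars=feature_cols,
                                         model_params={'n_estimators': 500},
                                         fmin_max_evals=25)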
model_random_state = 5
params = {
    'max_depth': range(6, 15),
    'num_leaves': range(20, 100, 10),
    # 'min_child_samples': [18, 19, 20, 21, 22],
    # 'min_child_weight': [0.001, 0.002],
    'feature_fraction': [0.5, 0.6, 0.7, 0.8],
    'bagging_fraction': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    'reg_alpha': [0.6, 0.7, 0.8, 0.9, 1],
    'reg_lambda': [0.6, 0.7, 0.8, 0.9, 1]
}
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                              learning_rate=0.1, n_estimators=40, max_depth=6,
                              metric='rmse', bagging_fraction=0.5, feature_fraction=0.4)
rc = RandomizedSearchCV(model_lgb, params, cv=5, random_state=model_random_state)
rc.fit(train_X_PM25, train_y_PM25)

#%%
print('best params: ', rc.best_params_)
print('best score: ', rc.best_score_)
best_position = rc.best_index_
print('best train score:', rc.cv_results_['mean_train_score'][best_position])
print('best train std:', rc.cv_results_['std_train_score'][best_position])
print('best test score:', rc.cv_results_['mean_test_score'][best_position])
print('best test std:', rc.cv_results_['std_test_score'][best_position])

# best params: {'reg_lambda': 0.9, 'reg_alpha': 0.6, 'num_leaves': 60, 'max_depth': 14, 'feature_fraction': 0.5, 'bagging_fraction': 0.3}
# best score: 0.8011678911600456
# best train score: 0.9083334290707444
def train_model(X, X_test, y, folds, params=None, model_type='lgb', plot_feature_importance=False, model=None):
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=1000, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric='mae',
                      verbose=100, early_stopping_rounds=200)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist,
                              early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000, eval_metric='MAE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(mean_absolute_error(y_valid, y_pred_valid))

        prediction += y_pred

        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_fold

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.show()

            return oof, prediction, feature_importance
        return oof, prediction, scores
    else:
        return oof, prediction, scores
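# A minimal usage sketch for train_model; note that the function reads n_fold
# from the enclosing scope, so it must be defined before calling. X / X_test / y
# are assumed to be prepared pandas objects.
from sklearn.model_selection import KFold

n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
lgb_params = {'objective': 'regression', 'num_leaves': 31, 'learning_rate': 0.05}
oof, prediction, scores = train_model(X, X_test, y, folds,
                                      params=lgb_params, model_type='lgb')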