def grid_search_para(train_data, label, best_para=None, grid_param=None,
                     is_search_estimator=False, search_lr=0.1,
                     scoring='roc_auc', search_estimators=100,
                     iid=False, cv=skfold):
    if not is_search_estimator:
        print("searching other parameters")
        # Set the objective and thread count *before* building the estimator,
        # so GridSearchCV clones it with those settings (the original set them
        # after construction, where they had no effect on the search).
        best_para['objective'] = 'binary:logistic'
        best_para['nthread'] = 8
        xgb_ = XGBRegressor(**best_para)
        grid_search = GridSearchCV(estimator=xgb_, param_grid=grid_param,
                                   scoring=scoring, iid=iid, cv=cv)
        grid_search.fit(train_data, label)
        best_para.update(grid_search.best_params_)
    else:
        print("searching n_estimators parameter")
        xgb_ = XGBRegressor(booster="dart")
        if best_para is None:
            best_para = xgb_.get_params()
        best_para['n_estimators'] = search_estimators
        best_para['learning_rate'] = search_lr
        xgb_ = XGBRegressor(**best_para)
        # xgb_cv (defined elsewhere) returns the best boosting-round count.
        best_estimator = xgb_cv(xgb_, train_data, label)
        best_para['n_estimators'] = best_estimator
    return best_para
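# Usage sketch for grid_search_para (my illustration, not from the source):
# stage 1 fixes n_estimators via the external xgb_cv helper, stage 2 tunes
# further parameters on top of it. `skfold` and `xgb_cv` are assumed to be
# defined elsewhere in the project, as the function above requires.
best_para = grid_search_para(train_data, label,
                             is_search_estimator=True,
                             search_lr=0.1, search_estimators=500)
best_para = grid_search_para(train_data, label, best_para=best_para,
                             grid_param={'max_depth': [3, 5, 7],
                                         'min_child_weight': [1, 3, 5]})
print(best_para)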
def grid_search(parameters, X_train_res, y_train_res, X_test, y_test, useTrainCV=False):
    xgbmodel = XGBRegressor()
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    grid_search_xg = GridSearchCV(xgbmodel, parameters, scoring='roc_auc',
                                  n_jobs=1, cv=kfold, verbose=1)
    result_gcv_xgb = grid_search_xg.fit(X_train_res, y_train_res)
    best_params = result_gcv_xgb.best_params_
    print("Best params: %s" % best_params)

    # Rebuild the model using the best parameters found by the grid search.
    xg_reg = XGBRegressor(objective=best_params['objective'],
                          learning_rate=best_params['learning_rate'],
                          max_depth=best_params['max_depth'],
                          n_estimators=best_params['n_estimators'],
                          min_child_weight=best_params['min_child_weight'],
                          gamma=best_params['gamma'],
                          colsample_bytree=best_params['colsample_bytree'],
                          subsample=best_params['subsample'],
                          reg_alpha=best_params['reg_alpha'])

    if useTrainCV:
        # Use xgboost's native CV with early stopping to trim n_estimators.
        xgb_param = xg_reg.get_xgb_params()
        xgtrain = DMatrix(X_train_res, label=y_train_res)
        cvresult = cv(xgb_param, xgtrain,
                      num_boost_round=xg_reg.get_params()['n_estimators'],
                      folds=kfold, metrics='auc', early_stopping_rounds=20)
        xg_reg.set_params(n_estimators=cvresult.shape[0])
        print("Best number of estimators: %i" % cvresult.shape[0])

    eval_set = [(X_test, y_test)]
    xg_reg.fit(X_train_res, y_train_res, eval_metric="error",
               eval_set=eval_set, verbose=False)

    y_pred_train = xg_reg.predict(X_train_res)
    # print("Accuracy train: %f" % accuracy_score(y_train_res, y_pred_train))
    # print("Recall train: %f" % recall_score(y_train_res, y_pred_train))
    # print("Precision train: %f" % precision_score(y_train_res, y_pred_train))
    print("AUC train: %f" % roc_auc_score(y_train_res, y_pred_train))

    y_pred = xg_reg.predict(X_test)
    # print("Accuracy test: %f" % accuracy_score(y_test, y_pred))
    # print("Recall test: %f" % recall_score(y_test, y_pred))
    # print("Precision test: %f" % precision_score(y_test, y_pred))
    print("AUC test: %f" % roc_auc_score(y_test, y_pred))
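# Hypothetical invocation of grid_search. The rebuild step above indexes
# best_params_ by name, so the grid must include every one of those keys;
# the values below are illustrative assumptions:
parameters = {'objective': ['binary:logistic'],
              'learning_rate': [0.05, 0.1],
              'max_depth': [3, 5],
              'n_estimators': [100, 200],
              'min_child_weight': [1],
              'gamma': [0],
              'colsample_bytree': [0.8],
              'subsample': [0.8],
              'reg_alpha': [0]}
grid_search(parameters, X_train_res, y_train_res, X_test, y_test, useTrainCV=True)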
def get_params(self, deep=True):
    '''
    A hack to make get_params() work through the XGBoost code, which looks at
    base_class[0] of the estimator to retrieve its constructor parameters.
    Since base_class[0] is overridden here as OnehotEncodingClassifierMixin,
    temporarily swap the class back to XGBRegressor while the parameters are
    read, then restore the original bases.
    '''
    orig_bases = copy.deepcopy(self.__class__.__bases__)
    self.__class__.__bases__ = (XGBRegressor, )
    self.__class__ = XGBRegressor
    params = XGBRegressor.get_params(self, deep=deep)
    self.__class__ = MyXGBRegressor
    self.__class__.__bases__ = orig_bases
    return params
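# Why the hack matters: sklearn utilities such as clone() and GridSearchCV
# reconstruct an estimator from get_params(), so it must round-trip the
# constructor arguments. A minimal check (a sketch, assuming MyXGBRegressor
# accepts standard XGBRegressor keyword arguments):
from sklearn.base import clone

est = MyXGBRegressor(n_estimators=50, max_depth=4)
assert clone(est).get_params()['n_estimators'] == 50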
class XGBaseline(BaseEstimator, RegressorMixin):
    """Point predictions plus a single, constant uncertainty estimate."""

    def __init__(self, **kwargs):
        self.xgb_mean = XGBRegressor(**kwargs)

    def fit(self, X, y):
        self.xgb_mean.fit(X, y)
        # Use the standard deviation of the training residuals as a
        # homoscedastic (constant) uncertainty estimate.
        errors = y - self.xgb_mean.predict(X)
        self.std = np.std(errors)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)
        pred_std = self.std * np.ones(len(pred_mean))
        return pred_mean, pred_std

    def get_params(self, deep=True):
        return self.xgb_mean.get_params()

    def set_params(self, **params):
        self.xgb_mean.set_params(**params)
        return self
class XGBLogLikelihood(BaseEstimator, RegressorMixin):
    """Heteroscedastic model: one XGB for the mean, one for the log-variance."""

    def __init__(self, **kwargs):
        self.xgb_mean = XGBRegressor(**kwargs)
        kwargs["objective"] = ll_objective
        self.xgb_log_var = XGBRegressor(**kwargs)

    def fit(self, X, y):
        self.xgb_mean.fit(X, y)
        # Fit the second model on the residuals with the custom log-likelihood
        # objective so it learns a per-sample log-variance.
        errors = y - self.xgb_mean.predict(X)
        self.xgb_log_var.fit(X, errors)
        return self

    def predict(self, X, y=None):
        pred_mean = self.xgb_mean.predict(X)
        pred_std = np.exp(self.xgb_log_var.predict(X) / 2)
        return pred_mean, pred_std

    def get_params(self, deep=True):
        return self.xgb_mean.get_params()

    def set_params(self, **params):
        self.xgb_mean.set_params(**params)
        self.xgb_log_var.set_params(**params)
        return self
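# A minimal sketch contrasting the two uncertainty models on synthetic data
# whose noise grows with x (the data and names here are my assumptions; the
# custom ll_objective from the source must be defined for XGBLogLikelihood):
rng = np.random.RandomState(0)
X_demo = rng.uniform(0, 10, size=(1000, 1))
y_demo = np.sin(X_demo[:, 0]) + rng.normal(scale=0.05 + 0.05 * X_demo[:, 0])

mean_b, std_b = XGBaseline(n_estimators=100).fit(X_demo, y_demo).predict(X_demo)
mean_l, std_l = XGBLogLikelihood(n_estimators=100).fit(X_demo, y_demo).predict(X_demo)
# std_b is constant everywhere; std_l should grow with x, tracking the noise.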
model = XGBRegressor(n_estimators=70,
                     learning_rate=0.15,
                     reg_alpha=10,
                     max_depth=3,
                     missing=np.nan,
                     subsample=0.7,
                     reg_lambda=100,
                     n_jobs=-1,
                     gamma=2,
                     min_child_weight=1,
                     # nthread=-1,
                     seed=555)
model.fit(np.array(x), np.array(y))
print('model fitted!')
print(model.get_params())

# x_test, y_test = make_val_set(test_rdd)
# y_pred = model.predict(np.array(x_test))
# rmse = np.sqrt(mean_squared_error(y_pred, y_test))
# print('oob rmse is : ', rmse)

y_pred = model.predict(np.array(x_test))
to_save = list(map(lambda x: (x[0][0], x[0][1], x[1]),
                   zip(test_rdd.collect(), y_pred)))
write_csv(to_save, output_path)

# y_pred = model.predict(np.array(x_train))
# rmse = np.sqrt(mean_squared_error(y_pred, y_train))
# print('in sample rmse is : ', rmse)
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3,
                          learning_rate=0.1, n_estimators=100, silent=True,
                          objective='reg:linear', booster='gbtree', n_jobs=1,
                          nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1,
                          colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                          scale_pos_weight=1, base_score=0.5, random_state=0,
                          seed=None, missing=None, sample_weight=None,
                          eval_set=None, eval_metric=None,
                          early_stopping_rounds=None, verbose=True,
                          xgb_model=None, sample_weight_eval_set=None):
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Pass parameters by keyword (the original passed them positionally, and
    # also spelled the objective parameter "objectibe") so they cannot
    # silently bind to the wrong constructor argument.
    regressor = XGBRegressor(max_depth=max_depth, learning_rate=learning_rate,
                             n_estimators=n_estimators, silent=silent,
                             objective=objective, booster=booster,
                             n_jobs=n_jobs, nthread=nthread, gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=random_state,
                             seed=seed, missing=missing)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight, eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose, xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    # plt.rcdefaults()
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    # out_model['plot_tree_UT'] = fig_plot_tree_UT
    # out_model['plot_tree_LR'] = fig_plot_tree_LR
    # out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = []
    get_param_list.append(['feature_cols', feature_cols])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        get_param_list.append([key, value])
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()

    return {'model': out_model}
    pred_array[1, 3] = month_map[mon_map[(temp + 2) % 12]]  # set month
    pred_array[1, 2] = df.iloc[-1]['Year'] + 1              # set year
else:
    pred_array[1, 3] = df.iloc[-1]['Month'] + 2  # set month
    pred_array[1, 2] = df.iloc[-1]['Year']       # set year

if temp + 3 > 12:
    pred_array[2, 3] = month_map[mon_map[(temp + 3) % 12]]  # set month
    pred_array[2, 2] = df.iloc[-1]['Year'] + 1              # set year
else:
    pred_array[2, 3] = df.iloc[-1]['Month'] + 3  # set month
    pred_array[2, 2] = df.iloc[-1]['Year']       # set year

pred_array[0, 4] = df.iloc[-1]['date'] + 1  # set date
pred_array[1, 4] = df.iloc[-1]['date'] + 2
pred_array[2, 4] = df.iloc[-1]['date'] + 3

df1 = df[df['APMC'] == int(apmc)]  # to get the district name
dname = df1.iloc[0]['district_name']
pred_array[:, 5] = dname

op = np.array([[0, 0, 0, 0, 0, 0]])
for i in range(0, 3):
    op[0] = pred_array[i]
    # y = y.reshape(-1, len(x))
    print('Input for prediction: ', np.array(op))
    result_array = clf.predict(np.array(op))
    print('Output for prediction for future ', i, ' month: ', result_array)
print(clf.get_params())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=123)

# MinMaxScaler gives better results than StandardScaler here.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

model = XGBRegressor()
model.fit(train_scaled, y_train)

# Note: score() on a regressor returns R^2, not classification accuracy.
print("R^2 on train data: ", round(model.score(train_scaled, y_train) * 100, 2), "%")
print("R^2 on test data: ", round(model.score(test_scaled, y_test) * 100, 2), "%")
print("Parameters: ", model.get_params())
print("MAE: ", mean_absolute_error(y_test, model.predict(test_scaled)))

# TODO: the search grid can still be improved
gridParams = {"n_estimators": np.arange(1100, 1500)}
grid = GridSearchCV(model, gridParams, verbose=1, cv=3, n_jobs=5)
grid.fit(train_scaled, y_train)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)
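# One possible way to act on the TODO above (my assumption, not the author's
# final grid): search a coarse, stepped range first, then refine around the
# winner instead of fitting all 400 consecutive values of n_estimators.
coarse = GridSearchCV(model, {"n_estimators": np.arange(1100, 1501, 100)},
                      cv=3, n_jobs=5)
coarse.fit(train_scaled, y_train)
best_n = coarse.best_params_["n_estimators"]
fine = GridSearchCV(model, {"n_estimators": np.arange(best_n - 50, best_n + 51, 25)},
                    cv=3, n_jobs=5)
fine.fit(train_scaled, y_train)
print("Refined best params:", fine.best_params_)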
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3,
                          learning_rate=0.1, n_estimators=100, silent=True,
                          objective='reg:linear', booster='gbtree', n_jobs=1,
                          nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1,
                          colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                          scale_pos_weight=1, base_score=0.5, random_state=None,
                          seed=None, missing=None, sample_weight=None,
                          eval_set=None, eval_metric=None,
                          early_stopping_rounds=None, verbose=True,
                          xgb_model=None, sample_weight_eval_set=None,
                          importance_type='gain'):
    if random_state is None:
        random_state = randint(-2**31, 2**31 - 1)

    # The objective parameter was misspelled "objectibe" in the original.
    regressor = XGBRegressor(max_depth=max_depth, learning_rate=learning_rate,
                             n_estimators=n_estimators, silent=silent,
                             objective=objective, booster=booster,
                             n_jobs=n_jobs, nthread=nthread, gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score, random_state=random_state,
                             seed=seed, missing=missing,
                             importance_type=importance_type)

    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]
    regressor.fit(features, label, sample_weight=sample_weight,
                  eval_set=eval_set, eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose, xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    # plt.rcdefaults()
    # plot_importance(regressor)
    # plt.tight_layout()
    # fig_plot_importance = plt2MD(plt)
    fig_plot_importance = _plot_feature_importances(feature_cols, regressor)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(regressor, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance
    # out_model['plot_tree_UT'] = fig_plot_tree_UT
    # out_model['plot_tree_LR'] = fig_plot_tree_LR
    # out_model['to_graphviz'] = md_to_graphviz

    # report
    get_param_list = []
    get_param_list.append(['feature_cols', feature_names])
    get_param_list.append(['label_col', label_col])
    for key, value in get_param.items():
        get_param_list.append([key, value])
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_names).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Feature Importance
    | {image_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()

    feature_importance_table = pd.DataFrame(
        [[feature_cols[i], feature_importance[i]] for i in range(len(feature_cols))],
        columns=['feature_name', 'importance'])
    out_model['feature_importance_table'] = feature_importance_table

    return {'model': out_model}
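# Hypothetical call into _xgb_regression_train; the Brightics-style `table`
# input and the column names below are placeholders of my own:
res = _xgb_regression_train(table, feature_cols=['f0', 'f1'], label_col='y',
                            n_estimators=200, max_depth=4)
trained = res['model']
print(trained['parameters']['n_estimators'])
print(trained['feature_importance_table'])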
class RaceStrategyModel(object):
    def __init__(self, year: int, verbose=False, n_cores=1):
        print("XGB using {} threads".format(n_cores))
        self.regular_model = XGBRegressor(n_jobs=n_cores)
        self.pit_model = XGBRegressor(n_jobs=n_cores)
        self.safety_model = XGBRegressor(n_jobs=n_cores)
        self.test_race = None
        self.scaler = None
        self.test_race_pit_model = None
        self.dummy_columns = None
        self.n_cores = n_cores
        # self.start_lap = start_lap

        # Map the calendar year to the dummy column used in the dataset.
        if year == 2014:
            year = "year_1"
        elif year == 2015:
            year = "year_2"
        elif year == 2016:
            year = "year_3"
        elif year == 2017:
            year = "year_4"
        elif year == 2018:
            year = "year_5"
        elif year == 2019:
            year = "year_6"
        else:
            raise ValueError("No race available for year " + str(year))
        self.year = year
        self.verbose = verbose

    def split_train_test(self, df: pd.DataFrame, split_fraction: float):
        """ Split the dataset randomly but keep whole races together """
        test_data = pd.DataFrame(columns=df.columns)
        races = df[df[self.year] == 1]['raceId'].unique()
        if split_fraction != 0:
            split_size = int(round(split_fraction * len(races)))
        else:
            # Leave only one race out from the training
            split_size = 1
        test_races = np.random.choice(races, size=split_size)
        for race in test_races:
            race_laps = df.loc[df['raceId'] == race]
            test_data = test_data.append(race_laps)
            df = df[df.raceId != race]
        return df, test_data

    def normalize_dataset(self, df):
        """ Normalize integer-valued columns of the dataset """
        data = df.copy()
        # print(df.columns)

        # Remove columns not to be normalized
        zero_one = ['battle', 'drs',
                    "circuitId_1", "circuitId_2", "circuitId_3", "circuitId_4",
                    "circuitId_6", "circuitId_7", "circuitId_9", "circuitId_10",
                    "circuitId_11", "circuitId_13", "circuitId_14", "circuitId_15",
                    "circuitId_17", "circuitId_18", "circuitId_22", "circuitId_24",
                    "circuitId_32", "circuitId_34", "circuitId_69", "circuitId_70",
                    "circuitId_71", "circuitId_73",
                    "tyre_1", "tyre_2", "tyre_3", "tyre_4", "tyre_5", "tyre_6",
                    "year_1", "year_2", "year_3", "year_4", "year_5", "year_6",
                    "nextLap", 'pit', 'safety', "unnorm_lap"]
                    # 'milliseconds', 'cumulative', 'unnorm_lap']
        temp_df = data[zero_one].copy()
        data.drop(zero_one, axis=1, inplace=True)

        # if self.columns is not None and len(data.columns) != len(self.columns):
        #     print(set(data.columns).difference(set(self.columns)))
        #     exit(-1)

        if not self.scaler:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(data)
            scaled = data
        else:
            scaled = self.scaler.transform(data)
        data.loc[:, :] = scaled
        data = data.join(temp_df)
        del temp_df
        return data

    def __process_dataset(self, dataset):
        """ Pre-process the dataset to obtain training data and its labels """
        # Discard wet and suspended races
        old_races = len(dataset['raceId'].unique())
        dataset = discard_wet(dataset)
        dataset = discard_suspended_races(dataset)
        new_races = len(dataset['raceId'].unique())
        if self.verbose:
            print("{} wet and suspended races were discarded".format(old_races - new_races))

        # Eliminate the last lap from the training data, as it has 0 target
        dataset = dataset[dataset['nextLap'] > 0]

        # Express the next-lap target as a delta to the pole lap
        dataset['nextLap'] = (dataset['nextLap'] - dataset['pole'])

        # Duplicate columns to use them after normalization
        dataset['base'] = dataset['pole'].astype(int)
        dataset['true'] = dataset['milliseconds'].astype(int)
        dataset['true_cumulative'] = dataset['cumulative'].astype(int)

        # Normalize the dataset, but normalize the lap time and cumulative time
        # individually, in order to be able to normalize them at runtime

        # Remove the duplicated unnormalized columns from the train data
        dataset = dataset.drop(columns=['base', 'true', 'true_cumulative'])
        dataset = self.normalize_dataset(dataset)
        _, self.test_race = self.split_train_test(dataset, split_fraction=0)
        self.__compute_pitstop_model(dataset)
        self.dummy_columns = dataset.columns
        train_data = self.normalize_dataset(dataset)
        # train_data = train_data[train_data['unnorm_lap'] > self.start_lap]  # Take laps after a threshold

        # Remove columns used only to identify the laps in testing
        train_data = train_data.drop(columns=['unnorm_lap', "raceId", "driverId", "race_length"])

        # Split the dataset into three separate datasets, one per each model to be trained
        train_pit = deepcopy(train_data.loc[train_data['pit'] != 0])
        train_safety = deepcopy(train_data.loc[(train_data['safety'] != 0) & (train_data['pit'] == 0)])
        train_regular = deepcopy(train_data.loc[(train_data['pit'] == 0) & (train_data['safety'] == 0)])

        # Remove features related to pit and safety in the "regular" laps model
        train_regular = train_regular.drop(columns=['safety', 'pit', 'pit-cost', 'pitstop-milliseconds'])

        # Extract the target labels
        labels_pit = train_pit.pop('nextLap')
        labels_safety = train_safety.pop('nextLap')
        labels_regular = train_regular.pop('nextLap')

        train_data = {'regular': train_regular, 'safety': train_safety, 'pit': train_pit}
        labels = {'regular': labels_regular, 'safety': labels_safety, 'pit': labels_pit}
        return train_data, labels

    def __compute_pitstop_model(self, full_dataset: pd.DataFrame):
        """ Compute a normal distribution's parameters for each driver's pit-stop times """
        circuit = get_current_circuit(self.test_race)
        pits = []
        pits_safety = []
        stop_laps = full_dataset[(full_dataset['pitstop-milliseconds'] > 0) &
                                 (full_dataset[circuit] == 1)].sort_values('lap')
        pit_times = stop_laps[stop_laps['safety'] == 0]['pitstop-milliseconds'].values
        pit_safety_times = stop_laps[stop_laps['safety'] > 0]['pitstop-milliseconds'].values
        pits.extend(pit_times.tolist())
        pits_safety.extend(pit_safety_times.tolist())
        safety_mean = np.mean(pit_safety_times) if len(pit_safety_times) > 0 else 0
        safety_std = np.std(pit_safety_times) if len(pit_safety_times) > 0 else 0
        mean = np.mean(pit_times) if len(pit_times) > 0 else 0
        std = np.std(pit_times) if len(pit_times) > 0 else 0
        self.test_race_pit_model = {'regular': (mean, std),
                                    'safety': (safety_mean, safety_std)}

    def train(self):
        """ Train the regression models """
        if self.verbose:
            print('Training models...')
        self.scaler = None
        if self.verbose:
            print("Model uses {} cores".format(self.n_cores))
        # self.regular_model = XGBRegressor(n_jobs=self.n_cores)
        # self.pit_model = XGBRegressor(n_jobs=self.n_cores)
        # self.safety_model = XGBRegressor(n_jobs=self.n_cores)
        dataset = load_dataset()
        datasets, labels = self.__process_dataset(dataset)
        self.regular_model.fit(datasets['regular'], labels['regular'])
        self.pit_model.fit(datasets['pit'], labels['pit'])
        self.safety_model.fit(datasets['safety'], labels['safety'])
        if self.verbose:
            print('Done!\n')

    def resplit(self):
        # TODO fix the invalidation of scaler to avoid the normalization of test races
        self.scaler = None
        dataset = load_dataset()
        self.__process_dataset(dataset)
        self._test_race = fix_data_types(self.test_race)
        self.laps_database = defaultdict(lambda: None)
        self.race_id = self.test_race["raceId"].values[0]
        for i in range(self.test_race["lap"].count()):
            row = self.test_race.iloc[[i]]
            self.laps_database[(row["driverId"].values[0], row["lap"].values[0])] = row

    def load(self):
        """ Restore prediction models from previously pickled files to avoid retraining """
""" if self.verbose: print("Loading prediction models from pickled files...") if not os.path.isfile( "./envs/race_strategy_model/pickled_models/regular.model"): print("ERROR: regular.model is missing") exit(-1) else: self.regular_model.load_model( './envs/race_strategy_model/pickled_models/regular.model') if not os.path.isfile( "./envs/race_strategy_model/pickled_models/safety.model"): print("ERROR: safety.model is missing") exit(-1) else: self.safety_model.load_model( './envs/race_strategy_model/pickled_models/safety.model') if not os.path.isfile( "./envs/race_strategy_model/pickled_models/pit.model"): print("ERROR: pit.model is missing") exit(-1) else: self.pit_model.load_model( './envs/race_strategy_model/pickled_models/pit.model') if not os.path.isfile( "./envs/race_strategy_model/pickled_models/scaler.pickle"): print("ERROR: scaler.pickle is missing") exit(-1) else: with open( './envs/race_strategy_model/pickled_models/scaler.pickle', 'rb') as scaler_file: self.scaler = pickle.load(scaler_file) scaler_file.close() # if not os.path.isfile("pickled_models/test_race.pickle"): # print("ERROR: test_race.pickle is missing") # exit(-1) # else: # with open('pickled_models/test_race.pickle', 'rb') as pit_file: # self.pit_model = pickle.load(pit_file) # pit_file.close() if self.verbose: print("Done!\n") # self.regular_model.set_params(**{"n_jobs": self.n_cores}) # self.safety_model.set_params(**{"n_jobs": self.n_cores}) # self.pit_model.set_params(**{"n_jobs": self.n_cores}) print(self.regular_model.get_params()) def save(self): """ Pickle the model objects to avoid retraining """ for model, name in zip( [self.regular_model, self.safety_model, self.pit_model], ['regular', 'safety', 'pit']): model.save_model( './envs/race_strategy_model/pickled_models/{}.model'.format( name)) with open('./envs/race_strategy_model/pickled_models/scaler.pickle', 'wb') as savefile: pickle.dump(self.scaler, savefile) savefile.close() #self.test_race.to_csv(".envs/race_strategy_model/dataset/test_race.csv") def predict(self, state, lap_type): if lap_type == 'regular': state.drop( columns=['safety', 'pit', 'pit-cost', 'pitstop-milliseconds']) return self.regular_model.predict(state) elif lap_type == 'pit': return self.regular_model.predict(state) else: return self.safety_model.predict(state) def get_prediction_model(self, state: str): if state == 'regular': return self.regular_model if state == 'safety': return self.safety_model if state == 'pit': return self.pit_model else: raise ValueError( "The specified state is not valid, allowed model states are 'regular', 'safety' and 'pit'" )
from GradienBoosting import format_output
from read_data import x_train_split, x_val, y_train_split, y_val, x_test, x_train_aug, x_test_aug
from sklearn.metrics import mean_squared_error  # was missing in the original
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Note: tree parameters such as max_depth are ignored by the gblinear booster.
model = XGBRegressor(max_depth=5, learning_rate=0.02, objective='reg:linear',
                     n_estimators=300, booster="gblinear")
print(model.get_params().keys())

eval_set = [(x_val, y_val)]
model.fit(x_train_split, y_train_split, eval_metric="rmse",
          eval_set=eval_set, verbose=True)
print(model.feature_importances_)

y_pred = model.predict(x_val)
print(mean_squared_error(y_val, y_pred))

y_test = model.predict(x_test)
y_test = format_output(y_test)
y_test.to_csv("submission/result_xgboost.csv")
# The fragment begins mid-call; the assignment to `gb` is reconstructed from
# the later gb.get_xgb_params() usage.
gb = XGBRegressor(
    n_estimators=750,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:gamma',
    nthread=4,
    scale_pos_weight=1,
    seed=27)

# Trim n_estimators with xgboost's native CV and early stopping.
xgb_param = gb.get_xgb_params()
xgtrain = xgb.DMatrix(df[features].values, label=df['SPEED_AVG'].values)
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=gb.get_params()['n_estimators'],
                  nfold=10, metrics='mae', early_stopping_rounds=50)
gb.set_params(n_estimators=cvresult.shape[0])
gb.fit(x_train, y_train, eval_metric='mae')


def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


predictions = gb.predict(x_train)
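# The fragment ends right after computing in-sample predictions; presumably a
# MAPE evaluation followed. A hedged completion:
print('Train MAPE: %.2f%%' % mean_absolute_percentage_error(y_train, predictions))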
def runXGBRegressorTuning(X_train, X_test, y_train, y_test,
                          scoring='neg_mean_squared_error', cv=5,
                          initial_max_depth=[3, 5, 7, 9],
                          initial_min_child_weight=[1, 3, 5],
                          objective='reg:linear', learning_rate=0.1,
                          n_estimators=140, max_depth=5, min_child_weight=1,
                          reg_alpha=0, reg_lambda=0, gamma=0,
                          subsample=0.8, colsample_bytree=0.8):
    # Tune max_depth and min_child_weight first - strongest bearing on model tuning
    best_score = 1000000000
    xgb_param_dict = dict(learning_rate=learning_rate,
                          n_estimators=n_estimators,
                          max_depth=max_depth,
                          min_child_weight=min_child_weight,
                          reg_alpha=reg_alpha,
                          gamma=gamma,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          objective=objective,
                          reg_lambda=reg_lambda,
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
    xgb_model = XGBRegressor(**xgb_param_dict)

    param_test1 = {'max_depth': initial_max_depth,
                   'min_child_weight': initial_min_child_weight}
    gsearch = GridSearchCV(estimator=XGBRegressor(**xgb_param_dict),
                           param_grid=param_test1, scoring=scoring,
                           n_jobs=4, iid=False, cv=cv)
    gsearch.fit(X_train, y_train)
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['max_depth'] = gsearch.best_params_['max_depth']
    # Fixed: the original read a non-existent 'min_child_depth' key here.
    xgb_param_dict['min_child_weight'] = gsearch.best_params_['min_child_weight']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # Widen the search range if the optimum sits at the edge of the initial range
    if gsearch.best_params_['max_depth'] == max(initial_max_depth):
        print('Best max_depth at max limit of initial range...')
        new_initial_max_depth = range(max(initial_max_depth),
                                      max(initial_max_depth) + 6, 2)
    elif gsearch.best_params_['max_depth'] == min(initial_max_depth):
        print('Best max_depth at min limit of initial range...')
        new_initial_max_depth = range(min(initial_max_depth) - 6,
                                      min(initial_max_depth), 2)
    else:
        new_initial_max_depth = initial_max_depth

    if gsearch.best_params_['min_child_weight'] == max(initial_min_child_weight):
        print('Best min_child_weight at max limit of initial range...')
        new_initial_min_child_weight = range(max(initial_min_child_weight),
                                             max(initial_min_child_weight) + 6, 2)
    elif gsearch.best_params_['min_child_weight'] == min(initial_min_child_weight):
        print('Best min_child_weight at min limit of initial range...')
        new_initial_min_child_weight = range(min(initial_min_child_weight) - 6,
                                             min(initial_min_child_weight), 2)
    else:
        new_initial_min_child_weight = initial_min_child_weight

    # Re-run the search if either range was widened. (Fixed: the original
    # compared max_depth's new range against min_child_weight's initial list.)
    if (new_initial_min_child_weight != initial_min_child_weight
            or new_initial_max_depth != initial_max_depth):
        param_test = {'max_depth': new_initial_max_depth,
                      'min_child_weight': new_initial_min_child_weight}
        gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_test,
                               scoring=scoring, n_jobs=4, iid=False, cv=cv)
        gsearch.fit(X_train, y_train)
        print('Best params: {}'.format(gsearch.best_params_))
        print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
        best_score = np.sqrt(-gsearch.best_score_)
        xgb_param_dict['max_depth'] = gsearch.best_params_['max_depth']
        xgb_param_dict['min_child_weight'] = gsearch.best_params_['min_child_weight']
        xgb_model = XGBRegressor(**xgb_param_dict)
    else:
        # Check either side of the current best values
        param_test = {'max_depth': [xgb_param_dict['max_depth'] - 1,
                                    xgb_param_dict['max_depth'],
                                    xgb_param_dict['max_depth'] + 1],
                      'min_child_weight': [xgb_param_dict['min_child_weight'] - 1,
                                           xgb_param_dict['min_child_weight'],
                                           xgb_param_dict['min_child_weight'] + 1]}
        gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_test,
                               scoring=scoring, n_jobs=4, iid=False, cv=cv)
        gsearch.fit(X_train, y_train)
        # Fine-tuned max_depth and min_child_weight parameters
        print('Fine-tuned max_depth and min_child_weight parameters...\n')
        print('Best params: {}'.format(gsearch.best_params_))
        print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
        best_score = np.sqrt(-gsearch.best_score_)
        xgb_param_dict['max_depth'] = gsearch.best_params_['max_depth']
        xgb_param_dict['min_child_weight'] = gsearch.best_params_['min_child_weight']
        xgb_model = XGBRegressor(**xgb_param_dict)

    warnings = {}

    # Tune gamma
    param_test3 = {'gamma': [i / 10.0 for i in range(0, 5)]}
    gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_test3,
                           scoring=scoring, n_jobs=4, iid=False, cv=cv)
    gsearch.fit(X_train, y_train)
    print('Fine-tuned gamma parameter...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['gamma'] = gsearch.best_params_['gamma']
    xgb_model = XGBRegressor(**xgb_param_dict)
    if xgb_param_dict['gamma'] == max(param_test3['gamma']):
        warnings['gamma'] = 'gamma: Optimal parameter {} at max of search range'.format(
            xgb_param_dict['gamma'])

    # Tune subsample and colsample_bytree
    param_test4 = {'subsample': [i / 10.0 for i in range(6, 10)],
                   'colsample_bytree': [i / 10.0 for i in range(6, 10)]}
    gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_test4,
                           scoring=scoring, n_jobs=4, iid=False, cv=cv)
    gsearch.fit(X_train, y_train)
    print('Tuned subsample and colsample_bytree parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['subsample'] = gsearch.best_params_['subsample']
    xgb_param_dict['colsample_bytree'] = gsearch.best_params_['colsample_bytree']
    xgb_model = XGBRegressor(**xgb_param_dict)

    if xgb_param_dict['subsample'] == max(param_test4['subsample']):
        warnings['subsample'] = 'subsample: Optimal parameter {} at max of search range'.format(
            xgb_param_dict['subsample'])
    elif xgb_param_dict['subsample'] == min(param_test4['subsample']):
        warnings['subsample'] = 'subsample: Optimal parameter {} at min of search range'.format(
            xgb_param_dict['subsample'])
    if xgb_param_dict['colsample_bytree'] == max(param_test4['colsample_bytree']):
        warnings['colsample_bytree'] = 'colsample_bytree: Optimal parameter {} at max of search range'.format(
            xgb_param_dict['colsample_bytree'])
    # Fixed: the original tested a misspelled 'colsample_bytreee' key here.
    elif xgb_param_dict['colsample_bytree'] == min(param_test4['colsample_bytree']):
        warnings['colsample_bytree'] = 'colsample_bytree: Optimal parameter {} at min of search range'.format(
            xgb_param_dict['colsample_bytree'])

    # Tune regularisation parameters
    param_test6 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
    gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_test6,
                           scoring=scoring, n_jobs=4, iid=False, cv=cv)
    gsearch.fit(X_train, y_train)
    print('Tuned regularisation parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['reg_alpha'] = gsearch.best_params_['reg_alpha']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # Fine-tune regularisation parameters around the coarse optimum
    param_test7 = {'reg_alpha': [float(xgb_param_dict['reg_alpha']) / 10,
                                 float(xgb_param_dict['reg_alpha']) / 2,
                                 float(xgb_param_dict['reg_alpha']),
                                 float(xgb_param_dict['reg_alpha']) * 5,
                                 float(xgb_param_dict['reg_alpha']) * 2]}
    # Fixed: the original searched param_test6 again instead of param_test7.
    gsearch = GridSearchCV(estimator=xgb_model, param_grid=param_test7,
                           scoring=scoring, n_jobs=4, iid=False, cv=cv)
    gsearch.fit(X_train, y_train)
    print('Fine-tuned regularisation parameters...\n')
    print('Best params: {}'.format(gsearch.best_params_))
    print('Best score: {}'.format(np.sqrt(-gsearch.best_score_)))
    best_score = np.sqrt(-gsearch.best_score_)
    xgb_param_dict['reg_alpha'] = gsearch.best_params_['reg_alpha']
    xgb_model = XGBRegressor(**xgb_param_dict)

    # Tune the number of boosting rounds with xgboost's native CV and early
    # stopping (xgb.cv expects a DMatrix, not a raw feature array).
    cvresult = xgb.cv(xgb_model.get_params(),
                      xgb.DMatrix(X_train, label=y_train),
                      num_boost_round=xgb_model.get_params()['n_estimators'],
                      nfold=cv, metrics='rmse', early_stopping_rounds=50)
    # Set the model to the optimal number of estimators wrt the early-stopping limit
    xgb_param_dict['n_estimators'] = cvresult.shape[0]

    # Learn the final XGBoost model
    xgb_model = XGBRegressor(**xgb_param_dict)
    xgb_model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='rmse', verbose=True)
    return xgb_model, xgb_model.get_params(), xgb_model.evals_result(), warnings
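# Sketch of a call into the staged tuner above (data names are placeholders):
tuned_model, tuned_params, evals, tuning_warnings = runXGBRegressorTuning(
    X_train, X_test, y_train, y_test,
    scoring='neg_mean_squared_error', cv=5)
print(tuning_warnings)  # flags parameters that landed on a search-range edge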
if PLOTS:
    # plot feature importance
    importance = model.feature_importances_
    plot_feature_importance(importance, cols_to_use)

    # plot loss curves
    loss = model.evals_result()
    epochs = len(loss['validation_0']['rmse'])
    x_axis = range(0, epochs)
    plt.plot(x_axis, loss['validation_0']['rmse'], label='Train')
    plt.plot(x_axis, loss['validation_1']['rmse'], label='Test')
    plt.legend()
    plt.ylabel('RMSE')
    plt.show()

###################
# predictions and export
###################
score = model.best_score
features = cols_to_use
params = model.get_params()
pred_val = model.predict(X_val).clip(0, 20)
pred_test = model.predict(X_test).clip(0, 20)
ids = np.array(df.loc[df['date_block_num'] == 34, 'ID'])
submission = make_submission(ids, pred_test)

if not DEBUG:
    export_model(OUT_FOLDER, score, features, params, pred_val, pred_test, submission)
# The fragment begins mid-dict; the opening is reconstructed from the later
# XGBRegressor(**params) call.
params = {
    'gamma': 0,
    'importance_type': 'gain',
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'n_estimators': 1450,
    'n_jobs': 1,
    'nthread': None,
    'objective': 'reg:squarederror',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': None,
    'silent': None,
    'subsample': 1,
    'verbosity': 1
}

rfr = XGBRegressor(**params)
rfr.fit(X_train, y_train)
print('fitted', '--- %s seconds ---' % (time.time() - start_time))

y_pred = rfr.predict(X_test)
print('R^2=', rfr.score(X_test, y_test))
print('RFR_params:', rfr.get_params())
print('Finished', time.ctime())

# save model
joblib.dump(rfr, datadir + 'JLmodel_' +
            str(rfr.get_params()['n_estimators']) + '_' +
            str(rfr.get_params()['max_depth']) + '.json')
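# Hedged sketch of restoring the dumped estimator. Note joblib pickles the
# whole Python object; the .json suffix is only a naming choice here, the
# file is not an XGBoost-format JSON model.
rfr_loaded = joblib.load(datadir + 'JLmodel_1450_6.json')
print('reloaded R^2=', rfr_loaded.score(X_test, y_test))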
# finding number of boosting rounds and learning rate
alg = XGBRegressor(learning_rate=0.1,
                   n_estimators=1000,
                   max_depth=5,
                   min_child_weight=1,
                   gamma=0,
                   subsample=0.8,
                   colsample_bytree=0.8,
                   objective='reg:squarederror',
                   seed=27)
xgb_param = alg.get_xgb_params()
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=alg.get_params()['n_estimators'],
                  nfold=5, metrics='rmse', early_stopping_rounds=50)
n_estimators = cvresult.shape[0]

param_test1 = {'max_depth': range(3, 10, 2),
               'min_child_weight': range(1, 6, 2)}
# GridSearchCV maximizes its scorer, so MSE must be negated
# (greater_is_better=False); the bare make_scorer(mean_squared_error) in the
# original would have selected the *worst* parameters.
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,
                                               n_estimators=n_estimators,
                                               max_depth=5,
                                               min_child_weight=1,
                                               gamma=0,
                                               subsample=0.8,
                                               colsample_bytree=0.8,
                                               objective='reg:squarederror',
                                               nthread=4,
                                               seed=27),
                        param_grid=param_test1,
                        scoring=make_scorer(mean_squared_error, greater_is_better=False),
                        n_jobs=4, iid=False, cv=5)
gsearch1.fit(train_df, target)
gsearch1.best_params_, gsearch1.best_score_