def __init__(self, model, test_file):
    self.model = model
    self.test_file = cd.clean_dataframe(test_file)
    self.x_actual, self.y_actual = cd.X_Y_split(self.test_file)
    self.y_actual = self.y_actual.values.ravel()
    self.predictions = self.make_predictions()
    self.test_file["predictions"] = self.predictions
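# Hedged sketch: one way the attributes set above (y_actual, predictions)
# could be turned into summary metrics. The helper name report_metrics is an
# assumption for illustration and is not part of the original class.
def report_metrics(y_actual, predictions):
    """Print common regression metrics for a set of predictions."""
    import numpy as np
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    mse = mean_squared_error(y_actual, predictions)
    print("RMSE: %.4f" % np.sqrt(mse))
    print("MAE:  %.4f" % mean_absolute_error(y_actual, predictions))
    print("R^2:  %.4f" % r2_score(y_actual, predictions))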
def __init__(self, params, dataframe):
    self.params = params
    self.dataframe = dataframe
    self.model = xgb.XGBRegressor(**params)
    self.train_data, self.validation_data = train_test_split(
        self.dataframe, test_size=0.3, random_state=100)
    train_x, train_y = cd.X_Y_split(self.train_data)
    validation_x, validation_y = cd.X_Y_split(self.validation_data)
    self.dtrain = xgb.DMatrix(data=train_x, label=train_y,
                              feature_names=train_x.columns)
    self.dvalidation = xgb.DMatrix(data=validation_x, label=validation_y,
                                   feature_names=validation_x.columns)
    self.eval_matrix = [(self.dtrain, "train"),
                        (self.dvalidation, "validation")]
    self.eval_set = [(train_x, train_y), (validation_x, validation_y)]
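# Hedged sketch: how the attributes built above could be used to fit the
# wrapped sklearn-style model while monitoring the held-out validation split.
# "wrapper" stands for an instance of the owning class; the helper name and
# its existence are assumptions for illustration only.
def fit_with_validation(wrapper):
    """Fit wrapper.model on the training split, scoring both splits in
    wrapper.eval_set after each boosting round."""
    train_x, train_y = wrapper.eval_set[0]
    wrapper.model.fit(train_x, train_y,
                      eval_set=wrapper.eval_set,
                      verbose=False)
    return wrapper.model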
def tune_all(data, estimator, param_grid, n_iter=10, n_splits=5):
    """
    Tune several parameters at once with a randomized search over param_grid,
    using k-fold cross-validation and negative MSE as the scoring metric.
    """
    train_x, train_y = cd.X_Y_split(data)
    kfold = KFold(n_splits=n_splits)
    param_search = RandomizedSearchCV(estimator, param_grid, n_iter=n_iter,
                                      scoring="neg_mean_squared_error",
                                      cv=kfold)
    # verbose=0 is forwarded to the estimator's own fit method
    # (e.g. XGBRegressor.fit), keeping per-round output silent.
    grid_result = param_search.fit(train_x, train_y, verbose=0)
    print("Best: %f using %s" % (grid_result.best_score_,
                                 grid_result.best_params_))
    return grid_result
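# Hedged usage sketch for tune_all: the search space and n_iter below are
# illustrative values, not recommendations from the original code.
def example_tune_all(training_dataframe):
    """Run a randomized search over a small illustrative XGBoost grid."""
    param_grid = {
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.05, 0.1],
        "n_estimators": [100, 300, 500],
        "subsample": [0.7, 0.85, 1.0],
    }
    estimator = xgb.XGBRegressor(objective="reg:squarederror")
    result = tune_all(training_dataframe, estimator, param_grid,
                      n_iter=20, n_splits=5)
    return result.best_estimator_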
def add_evalset(self, dataframe):
    """
    Add an additional dataset for validation during training.
    The dataframe is cleaned with clean_dataframe and split before being
    appended to the evaluation sets.
    """
    dataframe = cd.clean_dataframe(dataframe)
    new_val_x, new_val_y = cd.X_Y_split(dataframe)
    new_val_mat = xgb.DMatrix(data=new_val_x, label=new_val_y,
                              feature_names=new_val_x.columns)
    self.eval_matrix.append((new_val_mat, "validation_2"))
    self.eval_set.append((new_val_x, new_val_y))
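# Hedged usage sketch: registering an extra held-out dataframe as a second
# validation set. "wrapper" stands for an instance of the class that owns
# add_evalset; the helper name is an assumption for illustration only.
def add_extra_validation(wrapper, extra_dataframe):
    """Append extra_dataframe as "validation_2" and return the evals list,
    which can then be passed to xgb.train(params, wrapper.dtrain,
    evals=wrapper.eval_matrix)."""
    wrapper.add_evalset(extra_dataframe)
    return wrapper.eval_matrix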
def tune_parameter(data, parameter, param_range, save_plot=False,
                   randomized=False, save_path=None, n_iter=None,
                   n_splits=5, estimator=None):
    """
    Tune a single parameter using either grid search or randomized search
    with k-fold cross-validation.

    Input:
    - data = dataset used for tuning, usually the training dataset.
    - parameter = name (string) of the parameter to be tuned
      (works with XGBoost for now).
    - param_range = parameter search space.
    - save_plot = if True, save the score-vs-parameter plot to disk.
    - randomized = if True, use RandomizedSearchCV (n_iter required)
      instead of GridSearchCV.
    - save_path = directory to save the plot in (defaults to the working
      directory).
    - n_iter = number of iterations for the randomized search.
    - n_splits = number of cross-validation folds.
    - estimator = model to be tuned if it already exists; if None, a new
      default XGBRegressor will be created.
    """
    train_x, train_y = cd.X_Y_split(data)
    param_grid = {parameter: list(param_range)}
    # shuffle=True is required for random_state to take effect in KFold.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=7)
    if not estimator:
        estimator = xgb.XGBRegressor(objective="reg:squarederror")
    if randomized:
        assert n_iter is not None, "Missing number of iterations"
        param_search = RandomizedSearchCV(estimator, param_grid,
                                          n_iter=n_iter,
                                          scoring="neg_mean_squared_error",
                                          cv=kfold)
    else:
        param_search = GridSearchCV(estimator, param_grid, verbose=0,
                                    cv=kfold,
                                    scoring="neg_mean_squared_error")
    grid_result = param_search.fit(train_x, train_y, verbose=0)
    print("Best: %f using %s" % (grid_result.best_score_,
                                 grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    # for mean, stdev, param in zip(means, stds, params):
    #     print("%f (%f) with: %r" % (mean, stdev, param))
    if randomized:
        # RandomizedSearchCV samples values in arbitrary order, so recover
        # the evaluated values from the parameter dicts for plotting.
        param_range = [list(i.values())[0] for i in params]
    # Scores are negative MSE, so negate them before plotting.
    fig, ax = plt.subplots()
    ax.errorbar(param_range, -1 * means, yerr=stds)
    ax.set_title("XGBoost %s vs MSE" % parameter)
    ax.set_xlabel('%s' % parameter)
    ax.set_ylabel('MSE')
    if save_plot:
        if save_path:
            fig.savefig("%s/%s.png" % (save_path, parameter))
        else:
            fig.savefig("%s.png" % parameter)
    return grid_result
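# Hedged usage sketch for tune_parameter: a grid search over max_depth with
# the plot saved to disk. The search range and the "plots" directory are
# illustrative assumptions, not values from the original code.
def example_tune_max_depth(training_dataframe):
    """Grid-search max_depth over a small range and return the best value."""
    result = tune_parameter(training_dataframe,
                            parameter="max_depth",
                            param_range=range(2, 11, 2),
                            save_plot=True,
                            save_path="plots")  # assumed existing directory
    return result.best_params_["max_depth"]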