import numpy as np
from sklearn import model_selection


def xgb_randomcv(reg, params, x_train, y_train, x_test,
                 n_iters=10, cv=3, random_state=0):
    # Derive two sub-seeds from random_state: one for parameter sampling,
    # one for the CV splits, so both are reproducible.
    np.random.seed(random_state)
    seed1 = np.random.randint(10000)
    seed2 = np.random.randint(10000)
    # Draw n_iters random configurations from the search space.
    param_list = list(model_selection.ParameterSampler(params, n_iters, seed1))
    y_test_pred_list = []
    y_train_pred_list = []
    mae_list = []
    ntree_list = []
    for p in param_list:
        # Apply the sampled configuration to the estimator attribute by attribute.
        for k, v in p.items():
            setattr(reg, k, v)
        # cv_predict_xgb is a project helper defined elsewhere.
        y_test_pred_, y_train_pred_, mae_, ntree_ = \
            cv_predict_xgb(reg, x_train, y_train, x_test, cv, seed2)
        y_test_pred_list.append(y_test_pred_)
        y_train_pred_list.append(y_train_pred_)
        mae_list.append(mae_)
        ntree_list.append(ntree_)
    return y_test_pred_list, y_train_pred_list, mae_list, ntree_list, param_list
# Variant of xgb_randomcv above: applies parameters with set_params (the
# idiomatic scikit-learn way) and tracks a generic objective value instead
# of the MAE specifically.
def xgb_randomcv(reg, params, x_train, y_train, x_test,
                 n_iters=10, cv=3, random_state=0):
    np.random.seed(random_state)
    seed1 = np.random.randint(10000)
    seed2 = np.random.randint(10000)
    param_list = list(model_selection.ParameterSampler(params, n_iters, seed1))
    y_test_pred_list = []
    y_train_pred_list = []
    obj_list = []
    ntree_list = []
    for p in param_list:
        reg.set_params(**p)
        y_test_pred_, y_train_pred_, obj_, ntree_ = \
            cv_predict_xgb(reg, x_train, y_train, x_test, cv, seed2)
        y_test_pred_list.append(y_test_pred_)
        y_train_pred_list.append(y_train_pred_)
        obj_list.append(obj_)
        ntree_list.append(ntree_)
    return y_test_pred_list, y_train_pred_list, obj_list, ntree_list, param_list
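# cv_predict_xgb is not shown in either variant above. A minimal sketch of
# what it plausibly does, assuming numpy-array inputs and an xgboost-style
# regressor (the fold scheme and exact return values are assumptions):
import numpy as np
from sklearn import metrics, model_selection


def cv_predict_xgb(reg, x_train, y_train, x_test, cv, seed):
    kf = model_selection.KFold(n_splits=cv, shuffle=True, random_state=seed)
    y_test_pred = np.zeros(len(x_test))
    y_train_pred = np.zeros(len(x_train))
    maes, ntrees = [], []
    for tr_idx, val_idx in kf.split(x_train):
        reg.fit(x_train[tr_idx], y_train[tr_idx])
        y_val_pred = reg.predict(x_train[val_idx])
        # Out-of-fold predictions for the training set,
        # fold-averaged predictions for the test set.
        y_train_pred[val_idx] = y_val_pred
        y_test_pred += reg.predict(x_test) / cv
        maes.append(metrics.mean_absolute_error(y_train[val_idx], y_val_pred))
        # best_iteration only exists when early stopping was configured.
        ntrees.append(getattr(reg, 'best_iteration', reg.n_estimators))
    return y_test_pred, y_train_pred, np.mean(maes), np.mean(ntrees)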
def _get_param_iterator(self):
    """Return ParameterSampler instance for the given distributions"""
    return model_selection.ParameterSampler(
        self.param_distributions, self.n_iter,
        random_state=self.random_state)
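# For reference, ParameterSampler itself yields n_iter random parameter
# dicts; lists are sampled uniformly and scipy distributions via .rvs().
# A self-contained demonstration (the parameter names are arbitrary):
from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

space = {
    'max_depth': [3, 6, 9],               # discrete: uniform choice
    'learning_rate': uniform(0.01, 0.3),  # continuous: sampled in [0.01, 0.31]
}
for config in ParameterSampler(space, n_iter=5, random_state=0):
    print(config)  # e.g. {'max_depth': 6, 'learning_rate': 0.19...}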
import numpy as np
import pandas as pd
from sklearn import model_selection

# Load the data and concatenate train and test so that categorical
# encodings stay consistent across both sets.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop(['id', 'loss'], axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)
train_test = pd.concat((train, test))

# Encode every categorical column (names containing 'cat') with the
# project's encode() helper; save_data() is also defined elsewhere.
cols = list(train.columns)
cols_cat = [i for i in cols if 'cat' in i]
for c in cols_cat:
    values = list(np.unique(train_test[c].values))
    to_replace = {v: encode(v) for v in values}
    train_test[c].replace(to_replace, inplace=True)
    print(c, list(np.unique(train_test[c].values)))
save_data('train_test_encoded.pkl', train_test)

#%% parameter list
params = {}
params['max_depth'] = [9, 12, 15, 18, 21, 24]
#params['max_depth'] = [1, 2]
params['learning_rate'] = [0.01]
params['subsample'] = [0.2, 0.3, 0.4, 0.5, 0.6, 0.8]
params['colsample_bytree'] = [0.2, 0.4, 0.6, 0.8]
params['min_child_weight'] = [1, 3, 5]
#params['base_score'] = [1, 2, 4, 8]
params['alpha'] = [1, 2, 5]
params['gamma'] = [1, 3, 5, 10]

# Pre-draw 100 random configurations and persist them for later runs.
parameter_list = list(model_selection.ParameterSampler(params, 100, 0))
save_data('parameterList.pkl', parameter_list)
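# encode() is not defined in the snippet above. Given that it maps category
# codes like 'A', 'B', ..., 'AA' to integers, a minimal sketch could be the
# bijective base-26 conversion below; the exact mapping is an assumption:
def encode(charcode):
    # 'A' -> 1, ..., 'Z' -> 26, 'AA' -> 27, 'AB' -> 28, ...
    r = 0
    for ch in str(charcode):
        r = r * 26 + (ord(ch) - ord('A') + 1)
    return r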
import numpy as np
import tqdm
from sklearn import model_selection


# Method on a search class: record caches config -> score; self.dataset and
# the helpers module are defined elsewhere in the project.
def do_search(self, param_distributions, record, budget=100, n_runs=3,
              n_folds=5, verbose=False):
    with tqdm.tqdm_notebook(total=budget * n_runs * n_folds) as progress:
        configs = list(model_selection.ParameterSampler(
            param_distributions, budget, record.randomSeed))
        for c in configs:
            config = self.__get_rounded_config(c)
            score = record.getScore(config)
            # Skip this set of parameters if we already have a record of it.
            if score is not None:
                if verbose:
                    print(config, score)
                progress.update(n_runs * n_folds)
                continue
            scores = []
            for run in range(n_runs):
                folds = self.dataset.build_folds(n_folds)
                gold = []
                pred = []
                for train_index, test_index in folds:
                    X_train, X_test = (self.dataset.items[train_index],
                                       self.dataset.items[test_index])
                    # Classification datasets ('c') carry labels,
                    # regression datasets carry scores.
                    if self.dataset.type == 'c':
                        y_train, y_test = (self.dataset.labels[train_index],
                                           self.dataset.labels[test_index])
                    else:
                        y_train, y_test = (self.dataset.scores[train_index],
                                           self.dataset.scores[test_index])
                    gold = np.concatenate((gold, y_test))
                    alg = self.initialize_algorithm(config)
                    alg.fit(X_train, y_train)
                    pred = np.concatenate((pred, alg.predict(X_test)))
                    progress.update(1)
                scores.append(helpers.get_score(gold, pred, self.scorer,
                                                self.scorer_config))
            score = np.mean(scores)
            record.update(config, score)
            if verbose:
                print(config, score)
    return record.bestConfig, record.bestScore
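# __get_rounded_config is private and not shown. Judging from how its output
# feeds record.getScore, a plausible standalone sketch (ndigits is an assumed
# default) rounds float parameters so configs sampled from continuous
# distributions can hit the score cache:
def get_rounded_config(config, ndigits=4):
    return {k: round(v, ndigits) if isinstance(v, float) else v
            for k, v in config.items()}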