def main():
    """Train a pairwise XGBoost ranker on svmlight data and save the model.

    The train and test files are read together with their query ids,
    densified, and extended with two ratio columns (second-to-last column
    divided by column 2, last column divided by column 4). Group sizes are the
    lengths of consecutive runs of equal query ids. The test set doubles as
    the ``'validation'`` watch list, and the model file name is derived from
    the tree parameters.
    """

    def add_ratio_columns(sparse_features):
        # Densify first: the ratio columns are appended as extra dense columns.
        dense = sparse_features.todense()
        first_ratio = dense[:, -2] / dense[:, 2]
        second_ratio = dense[:, -1] / dense[:, 4]
        return np.concatenate([dense, first_ratio, second_ratio], 1)

    def run_lengths(query_ids):
        # One entry per run of identical consecutive query ids.
        return [len(list(run)) for __, run in groupby(query_ids)]

    cli = get_cli_args(args)
    train_x, train_y, train_qid = load_svmlight_file(
        cli.train.xgboost_train_path,
        query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    test_x, test_y, test_qid = load_svmlight_file(
        cli.train.xgboost_test_path,
        query_id=True)  # pylint: disable=unbalanced-tuple-unpacking

    train_x = add_ratio_columns(train_x)
    test_x = add_ratio_columns(test_x)

    train_dmatrix = DMatrix(train_x, train_y)
    test_dmatrix = DMatrix(test_x, test_y)
    train_dmatrix.set_group(run_lengths(train_qid))
    test_dmatrix.set_group(run_lengths(test_qid))

    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    booster = xgb.train(params,
                        train_dmatrix,
                        num_boost_round=100,
                        evals=[(test_dmatrix, 'validation')])

    # The file name encodes every parameter except objective and metric.
    naming_items = _.omit(params, 'objective', 'eval_metric').items()
    model_prefix = items_to_str(naming_items, sort_by=itemgetter(0))
    booster.save_model(model_prefix + '_model.xgb')
def predict_xgboost_answers(xgb_model):
    """Write the model's test-set predictions to ``prediction_file.txt``.

    Each output line is the ``str`` of a ``((codex, article), score)`` tuple,
    where the first element is looked up in the TF-IDF model's
    ``num_to_num_dict`` by the row position modulo ``CNT_ARTICLES``.

    Parameters
    ----------
    xgb_model : object
        Trained model exposing ``predict(DMatrix)``.
    """
    load_tfidf_1 = TFIDF.load(os.path.join(PATH_TO_TF_IDF, 'tf_idf_1'))
    # Only the features are needed for scoring; the labels are ignored.
    x_test, _labels = sklearn.datasets.load_svmlight_file(
        os.path.join(PATH_TO_LEARNING_TO_RANK, 'x_test.txt'))

    # One group size per line; blank lines are skipped instead of crashing
    # on int('').
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        group_test = [int(line) for line in f if line.strip()]

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)

    prediction_answer = [
        (load_tfidf_1.num_to_num_dict[i % CNT_ARTICLES], p)
        for i, p in enumerate(pred)
    ]

    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    # Mode "w" truncates an existing file, so no explicit remove is needed,
    # and the context manager closes the handle even if the write fails.
    with open(predict_file, 'w', encoding="utf-8") as f:
        f.write('\n'.join(str(answer) for answer in prediction_answer))
def apply(self, X, ntree_limit=0):
    """Return, for every sample, the leaf it lands in for each tree.

    Parameters
    ----------
    X : array_like, shape=[n_samples, n_features]
        Input features matrix.
    ntree_limit : int
        Limit number of trees in the prediction; defaults to 0 (use all
        trees).

    Returns
    -------
    X_leaves : array_like, shape=[n_samples, n_trees]
        For each datapoint x in X and for each tree, the index of the leaf x
        ends up in. Leaves are numbered within
        ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering.
    """
    prepared = _preprare_data_in_groups(X)
    group_sizes, group_indices, features = prepared[0], prepared[1], prepared[2]

    dmatrix = DMatrix(features, missing=self.missing)
    dmatrix.set_group(group_sizes)
    leaves = self.get_booster().predict(dmatrix,
                                        pred_leaf=True,
                                        ntree_limit=ntree_limit)

    # Re-index the rows through the argsort of the group indices
    # (presumably this undoes the grouping order applied by the helper).
    positions = np.arange(len(group_indices))
    restore_order = positions[group_indices.argsort()]
    return leaves[restore_order, :]
def predict_xgboost_answers(xgb_model):
    """Write the model's test-set predictions to ``prediction_file.txt``.

    Features come from ``x_test.csv`` (columns ``doc_id``, ``is_rel`` and
    ``'7'`` are dropped before scoring). Each output line is the ``str`` of a
    ``(doc_id, score)`` tuple; document ids are the corpus keys repeated
    ``len(pred) // CNT_ARTICLES`` times and paired with the scores in order.

    Parameters
    ----------
    xgb_model : object
        Trained model exposing ``predict(DMatrix)``.
    """
    features = pd.read_csv(f"{PATH_TO_LEARNING_TO_RANK}/x_test.csv", sep=',')
    x_test = features.drop(['doc_id', 'is_rel', '7'], axis=1)

    # One group size per line; blank lines are skipped instead of crashing
    # on int('').
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        group_test = [int(line) for line in f if line.strip()]

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)

    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    repeated_doc_ids = list(corpus.corpus.keys()) * (len(pred) // CNT_ARTICLES)
    # zip stops at the shorter sequence, exactly as the original pairing did.
    prediction_answer = [(doc_id, p) for p, doc_id in zip(pred, repeated_doc_ids)]

    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    # Mode "w" truncates an existing file, so no explicit remove is needed,
    # and the context manager closes the handle even if the write fails.
    with open(predict_file, 'w', encoding="utf-8") as f:
        f.write('\n'.join(str(answer) for answer in prediction_answer))
def predict(self, X, output_margin=False, ntree_limit=0):
    """Score the rows of ``X`` with the fitted booster.

    ``X`` is split by ``_preprare_data_in_groups`` into group sizes and a
    feature matrix; the scores are returned in the order produced by that
    helper (no re-ordering is applied here).

    Parameters
    ----------
    X : array_like
        Input accepted by ``_preprare_data_in_groups``.
    output_margin : bool
        Forwarded to the booster's ``predict``.
    ntree_limit : int
        Forwarded to the booster's ``predict``; defaults to 0.
    """
    prepared = _preprare_data_in_groups(X)
    group_sizes, features = prepared[0], prepared[2]

    dmatrix = DMatrix(features, missing=self.missing)
    dmatrix.set_group(group_sizes)
    return self.get_booster().predict(dmatrix,
                                      output_margin=output_margin,
                                      ntree_limit=ntree_limit)
def predict(self, X, group=None, output_margin=False, ntree_limit=0):
    """Score the rows of ``X`` with the fitted booster.

    Parameters
    ----------
    X : array_like
        Feature matrix.
    group : list or array_like, optional
        Group sizes passed to ``DMatrix.set_group``. When omitted, all rows
        of ``X`` are treated as one group.
    output_margin : bool
        Forwarded to the booster's ``predict``.
    ntree_limit : int
        Forwarded to the booster's ``predict``; defaults to 0.

    Returns
    -------
    rank_values : array_like
        One score per row of ``X``.
    """
    # Identity test, not ``== None``: comparing a NumPy array with ``==``
    # is element-wise and makes the ``if`` raise ValueError.
    if group is None:
        group = [X.shape[0]]
    test_dmatrix = DMatrix(X, missing=self.missing)
    test_dmatrix.set_group(group)
    rank_values = self.booster().predict(test_dmatrix,
                                         output_margin=output_margin,
                                         ntree_limit=ntree_limit)
    return rank_values
def get_pairs_rank_score(loaded_model, text_list):
    """Score ``text_list`` with an already loaded ranking model.

    ``test_data_generation`` turns the texts into group sizes, feature rows
    and targets; these are wrapped in a grouped ``DMatrix`` and passed to
    ``loaded_model.predict``, whose result is returned unchanged.
    """
    groups, rows, targets = test_data_generation(text_list)
    ranking_input = DMatrix(np.asmatrix(rows), label=targets)
    ranking_input.set_group(groups)
    return loaded_model.predict(ranking_input)
def _read_group_sizes(path):
    """Read one integer group size per line from *path*, skipping blank lines."""
    with open(path, "r", encoding="utf8") as f:
        return [int(line) for line in f if line.strip()]


def train(model_file):
    """Demonstrate ranking with ``xgboost.train`` on the MQ2008 files.

    Trains a ``rank:pairwise`` model on ``mq2008.train`` with
    ``mq2008.vali`` as the validation watch list, scores ``mq2008.test``,
    writes the text dump to ``model_file + ".txt"`` and the model to
    ``model_file``, and saves a feature-importance plot to
    ``feature_importance.png``.

    Parameters
    ----------
    model_file : str
        Path the trained model is saved to.
    """
    x_train, y_train = load_svmlight_file("mq2008.train")
    x_valid, y_valid = load_svmlight_file("mq2008.vali")
    # Test labels are not used: the test matrix is only scored.
    x_test, _y_test = load_svmlight_file("mq2008.test")

    # Only the train and validation matrices get group information; the
    # test group file was parsed by the old code but never used.
    group_train = _read_group_sizes("mq2008.train.group")
    group_valid = _read_group_sizes("mq2008.vali.group")

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)
    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)

    params = {
        'objective': 'rank:pairwise',
        'eta': 0.01,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 8
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    # Kept from the original demo: the test set is scored, but the scores
    # are not used any further.
    pred = xgb_model.predict(test_dmatrix)

    xgb_model.dump_model(model_file + ".txt")
    xgb_model.save_model(model_file)

    # save figures
    plt.clf()
    xgb.plot_importance(xgb_model)
    plt.savefig('feature_importance.png', dpi=800, format='png')
def predict(self, X, output_margin=False, ntree_limit=0):
    """Score the rows of ``X`` and return the scores re-indexed by group order.

    Parameters
    ----------
    X : array_like
        The input data; split by ``_preprare_data_in_groups`` into group
        sizes, group indices and features.
    output_margin : bool
        Whether to output the raw untransformed margin value.
    ntree_limit : int
        Limit number of trees in the prediction; defaults to 0 (use all
        trees).
    """
    prepared = _preprare_data_in_groups(X)
    group_sizes, group_indices, features = prepared[0], prepared[1], prepared[2]

    dmatrix = DMatrix(features, missing=self.missing)
    dmatrix.set_group(group_sizes)
    scores = self.get_booster().predict(dmatrix,
                                        output_margin=output_margin,
                                        ntree_limit=ntree_limit)

    # Re-index the scores through the argsort of the group indices
    # (presumably this undoes the grouping order applied by the helper).
    positions = np.arange(len(group_indices))
    restore_order = positions[group_indices.argsort()]
    return scores[restore_order]
def predict(self, X, group=None, output_margin=False, ntree_limit=0):
    """Score the rows of ``X`` and return the scores in the input row order.

    Parameters
    ----------
    X : array_like
        Feature matrix, indexable as ``X[rows, :]``.
    group : array_like, optional
        One group id per row of ``X``. Rows are sorted by group id before
        scoring and the scores are put back in the original order. When
        omitted, all rows are treated as a single group.
    output_margin : bool
        Forwarded to the booster's ``predict``.
    ntree_limit : int
        Forwarded to the booster's ``predict``; defaults to 0.

    Returns
    -------
    rank_values : array_like
        One score per row of ``X``, aligned with the rows as passed in.
    """
    # Identity test, not ``== None``: ``group`` is an array here, and an
    # element-wise ``==`` makes the ``if`` raise ValueError.
    if group is None:
        order = None
        group_sizes = [X.shape[0]]
    else:
        # asarray lets plain lists be used as well as NumPy arrays.
        group_ids = np.asarray(group)
        order = np.argsort(group_ids)
        X = X[order, :]
        # np.unique returns the ids in ascending order, which matches the
        # sorted rows, so the counts are already the group sizes in order.
        _, group_sizes = np.unique(group_ids[order], return_counts=True)

    test_dmatrix = DMatrix(X, missing=self.missing)
    test_dmatrix.set_group(group_sizes)
    rank_values = self.get_booster().predict(test_dmatrix,
                                             output_margin=output_margin,
                                             ntree_limit=ntree_limit)
    if order is not None:
        # argsort of the permutation is its inverse: undo the row sorting.
        rank_values = rank_values[np.argsort(order)]
    return rank_values
def train_ranking():
    """Train a ranking model, reload it from disk and dump its predictions.

    The data from ``data_generation({})`` is used for training, evaluation
    and testing alike. The model is saved to ``xgb.model``, loaded back as a
    ``Booster``, and its scores are written one per line to ``results.txt``.
    """

    def grouped_dmatrix(rows, labels, groups):
        matrix = DMatrix(np.asmatrix(rows), label=labels)
        matrix.set_group(groups)
        return matrix

    groups, rows, targets = data_generation({})

    # Train and eval matrices are built from the very same data.
    xgbTrain = grouped_dmatrix(rows, targets, groups)
    xgbEval = grouped_dmatrix(rows, targets, groups)
    watch_list = [(xgbTrain, 'train'), (xgbEval, 'eval')]

    model = train(xgb_rank_params2,
                  xgbTrain,
                  num_boost_round=50,
                  evals=watch_list)
    model.save_model('xgb.model')

    reloaded = xgb.Booster(model_file='xgb.model')
    xgbTest = grouped_dmatrix(rows, targets, groups)
    scores = reloaded.predict(xgbTest)

    with open('results.txt', mode='w', encoding='utf-8') as out:
        for score in scores:
            out.write(str(score) + '\n')
def train():
    """Train a ``rank:pairwise`` model on module-level data and pickle it.

    Reads ``x_train``/``y_train``, ``x_valid``/``y_valid``, ``x_test`` and
    the ``group_train``/``group_valid`` sizes from the enclosing scope, uses
    the validation set as the watch list, and pickles the booster under
    ``curr_dir``. Always returns 1.
    """
    training = DMatrix(x_train, y_train)
    validation = DMatrix(x_valid, y_valid)
    # Built as in the original flow, although it is not used afterwards.
    DMatrix(x_test)
    training.set_group(group_train)
    validation.set_group(group_valid)

    booster_params = {
        'objective': 'rank:pairwise',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    booster = xgb.train(booster_params,
                        training,
                        num_boost_round=4,
                        evals=[(validation, 'validation')])

    target = curr_dir + './data/data_model/pairwise_origin_version.model'
    with open(target, 'wb') as sink:
        pickle.dump(booster, sink, pickle.HIGHEST_PROTOCOL)
    return 1
def predict_norm(request, features=None, iifh=None):
    """Return the five best-scoring articles for ``request`` as HTML markup.

    The request is turned into feature files, the pickled features in
    ``0.pickle`` are scored by the model from ``final_xgb_model.sav``, and
    the corpus keys are paired with the scores in order. The five highest
    scores are rendered as ``<p>`` elements; when ``iifh`` is given, a
    highlighted snippet paragraph is appended to each answer.

    Parameters
    ----------
    request : object
        Raw request passed to ``Request`` and to ``iifh.hightlight_words``.
    features : object, optional
        Forwarded to ``create_feature_files_for_all_requests``.
    iifh : object, optional
        Highlighter exposing ``hightlight_words(request, doc_id)``.

    Returns
    -------
    list
        ``Markup`` objects, best answer first.
    """
    new_request = Request(request, "", "").create_dict()
    xgb_model = joblib.load("final_xgb_model.sav")
    create_feature_files_for_all_requests([new_request], "files/", features)

    feature_frame = pd.read_pickle(f"{PATH_TO_FILES}/0.pickle")
    test_dmatrix = DMatrix(feature_frame.drop(['is_rel', '7'], axis=1))
    # A single request is one group containing every article.
    test_dmatrix.set_group([CNT_ARTICLES])
    pred = xgb_model.predict(test_dmatrix)

    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    art_names = SimpleCorp.load('codexes_corp_art_names',
                                f'{PATH_TO_FILES}/corp')

    scored = [(doc_id, p) for p, doc_id in zip(pred, list(corpus.corpus.keys()))]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    valid_answers = []
    for doc_id, _score in ranked[:5]:
        cod = name_codexes[int(doc_id[0])]
        answer = f"<p>Cтатья {doc_id[1]}. {art_names.get_doc(doc_id)} // {cod[0]}{cod[1:].lower()}.</p>"
        print(answer)
        if iifh is not None:
            snippet = iifh.hightlight_words(request, doc_id)
            answer += f"<p style='padding-left:50px; border-left: 3px gray;'>{snippet}</p>"
            print(snippet)
        valid_answers.append(Markup(answer))
    return valid_answers
def main():
    """Train a pairwise ranker on the ``hn.*`` files and print ranking metrics.

    Loads train/validation/test svmlight files with query ids, trains with
    the validation set as the watch list, scores the test set, regroups the
    predictions per query and prints one sample plus MRR, the value of
    ``mean_average_precision`` and nDCG at 1, 3, 5 and 10.
    """
    # Import training data
    x_train, y_train, qid_train = load_svmlight_file("hn.train", query_id=True)
    x_valid, y_valid, qid_valid = load_svmlight_file("hn.vali", query_id=True)
    x_test, y_test, qid_test = load_svmlight_file("hn.test", query_id=True)

    group_train = group_qid(qid_train)
    group_valid = group_qid(qid_valid)
    group_test = group_qid(qid_test)

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)
    test_dmatrix.set_group(group_test)

    # Train Xgboost with basic parameters
    params = {
        'objective': 'rank:pairwise',
        'eta': 0.1,
        # 'gamma': 1.0,
        # 'min_child_weight': 0.1,
        'max_depth': 3,
        'eval_metric': ['ndcg@1', 'ndcg@3', 'ndcg@5', 'ndcg@10'],
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    pred = xgb_model.predict(test_dmatrix)
    data_predict = regroup_results(group_test, pred, y_test)

    # Testing random sample
    # Simple debug function that print algolia results and predictions
    def print_random_sample(line):
        # Rows of query `line` start after all rows of the earlier queries.
        start = sum(group_test[:line])
        stop = start + group_test[line]
        print('Algolia clicks are: {}'.format(y_test[start:stop]))
        print('Predictions are: {}'.format(pred[start:stop]))
        print('Xgboost clicks are: {}'.format(data_predict[line]))

    print_random_sample(1)

    mrr = mean_reciprocal_rank(data_predict)
    print('> Mean reciprocal rank is : {}'.format(mrr))
    mean_ap = mean_average_precision(data_predict)
    print('> Mean average position is : {}'.format(mean_ap))

    # nDCG
    for cutoff in [1, 3, 5, 10]:
        per_query = [ndcg_at_k(query, cutoff) for query in data_predict]
        print('> nDCG@{} is : {}'.format(cutoff, pd.Series(per_query).mean()))
def train(model_file):
    """Train a ``rank:pairwise`` model for ``TASK`` and persist its artefacts.

    Runs ``trans_data`` on ``data_path``, loads the train/valid/test svmlight
    files and their group files, trains with all three sets on the watch
    list, then writes the text dump, the model, a joblib copy and a
    feature-importance figure.

    Parameters
    ----------
    model_file : str
        Path the trained model is saved to; the text dump goes to
        ``model_file + ".txt"``.
    """
    trans_data(data_path)
    # This script demonstrate how to do ranking with xgboost.train
    base = data_path + TASK
    x_train, y_train = load_svmlight_file(base + ".train")
    x_valid, y_valid = load_svmlight_file(base + ".valid")
    x_test, y_test = load_svmlight_file(base + ".test")
    print("train data shape: [%d, %d]" % (x_train.shape[0], x_train.shape[1]))

    group_train = load_group_data(base + ".train.group")
    group_valid = load_group_data(base + ".valid.group")
    group_test = load_group_data(base + ".test.group")

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test, y_test)
    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)
    test_dmatrix.set_group(group_test)

    params = {
        'booster': 'gbtree',
        'objective': 'rank:pairwise',
        'eta': 0.01,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 2,
        'eval_metric': 'ndcg@1',  # ndcg@1, logloss
    }
    watch_list = [(train_dmatrix, 'train'), (valid_dmatrix, 'valid'),
                  (test_dmatrix, 'test')]
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=watch_list)
    # The test set is scored as in the original flow; the scores are unused.
    xgb_model.predict(test_dmatrix)

    print("save model to %s" % (model_file))
    xgb_model.dump_model(model_file + ".txt")
    xgb_model.save_model(model_file)
    joblib.dump(xgb_model, 'rank_model/xgb_clf.m')

    # save figures
    plt.clf()
    xgb.plot_importance(xgb_model)
    plt.savefig('rank_model/feature_importance.png', dpi=800, format='png')
################################################################## ## DMatrix ## generate training dataset # 一共 2 组 * 每组 3 条, 6 条样本, 特征维数是 2 n_group = 2 n_choice = 3 dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2]); print(dtrain.shape) # (6, 2) # numpy.random.choice(a, size=None, replace=True, p=None) dtarget = np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten(); print(dtarget) # [1 0 2 1 0 2] # n_group 用于表示从前到后每组各自有多少样本, 前提是样本中各组是连续的, [3, 3] 表示一共 6 条样本中前 3 条是第一组, 后 3 条是第二组 dgroup = np.array([n_choice for i in range(n_group)]).flatten(); print(dgroup) # [3 3] # concate Train data, very import here ! xgbTrain = DMatrix(dtrain, label=dtarget) xgbTrain.set_group(dgroup) # generate eval data dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2]); print(dtrain_eval.shape) # (6, 2) xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget) xgbTrain_eval .set_group(dgroup) evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')] # train model # xgb_rank_params1 加上 evals 这个参数会报错, 还没找到原因 # rankModel = train(xgb_rank_params1, xgbTrain, num_boost_round=10) rankModel = train(xgb_rank_params2, xgbTrain, num_boost_round=20, evals=evallist) # test dataset dtest = np.random.uniform(0, 100, [n_group*n_choice, 2]); print(dtest.shape) # (6, 2) dtestgroup = np.array([n_choice for i in range(n_group)]).flatten(); print(dtestgroup) # [3 3]
class xgbLtr:
    """Train, persist and plot an XGBoost ``rank:pairwise`` model for ``TASK``."""

    def __init__(self):
        """Derive the data file paths and the model location from ``TASK``."""
        data_prefix = DATA_PATH + TASK
        self.train_file = data_prefix + ".train"
        self.valid_file = data_prefix + ".valid"
        self.test_file = data_prefix + ".test"
        self.model_path = "rank_model/"
        self.model_name = TASK + "_xgb1.model"

    def load_data(self):
        """Load the three svmlight splits and build grouped DMatrix objects."""
        train_x, train_y = load_svmlight_file(self.train_file)
        valid_x, valid_y = load_svmlight_file(self.valid_file)
        test_x, test_y = load_svmlight_file(self.test_file)
        print("train data shape: [%d, %d]" %
              (train_x.shape[0], train_x.shape[1]))

        # Each split has a companion "<split file>.group" file.
        train_groups = load_group_data(self.train_file + ".group")
        valid_groups = load_group_data(self.valid_file + ".group")
        test_groups = load_group_data(self.test_file + ".group")

        self.train_dmatrix = DMatrix(train_x, train_y)
        self.valid_dmatrix = DMatrix(valid_x, valid_y)
        self.test_dmatrix = DMatrix(test_x, test_y)
        self.train_dmatrix.set_group(train_groups)
        self.valid_dmatrix.set_group(valid_groups)
        self.test_dmatrix.set_group(test_groups)

    def train(self):
        """Train the model, then save the dump, model, joblib copy and figure."""
        booster_params = {
            'booster': 'gbtree',
            'objective': 'rank:pairwise',
            'eta': 0.01,
            'gamma': 1.0,
            'min_child_weight': 0.1,
            'max_depth': 2,
            'eval_metric': 'ndcg@1'
        }  # ndcg@1, logloss
        watch_list = [(self.train_dmatrix, 'train'),
                      (self.valid_dmatrix, 'valid'),
                      (self.test_dmatrix, 'test')]
        booster = xgb.train(booster_params,
                            self.train_dmatrix,
                            num_boost_round=4,
                            evals=watch_list)
        # The test set is scored as in the original flow; the scores are unused.
        booster.predict(self.test_dmatrix)

        print("save model to %s" % (self.model_path))
        model_file = self.model_path + self.model_name
        booster.dump_model(model_file + ".txt")
        booster.save_model(model_file)
        joblib.dump(booster, self.model_path + '/xgb_clf.m')

        # save figures
        plt.clf()
        xgb.plot_importance(booster)
        plt.savefig(self.model_path + '/feature_importance.png',
                    dpi=800,
                    format='png')

    def plotXgboostTree(self):
        """Render the first four trees of the saved model to PNG files."""
        booster = xgb.Booster(model_file=self.model_path + self.model_name)
        # The joblib copy is loaded as before, but only the Booster is plotted.
        joblib.load(self.model_path + '/xgb_clf.m')
        for tree_index in range(4):
            xgb.plot_tree(booster,
                          num_trees=tree_index,
                          fmap='./get_jdcv_data/feature.fmap')
            figure = plt.gcf()
            figure.set_size_inches(150, 100)
            figure.savefig('xgb_tree_' + str(tree_index) + '.png')
# Read the per-query group sizes (one integer per line) for each split.
group_train = []
with open("mq2008.train.group", "r") as f:
    data = f.readlines()
    for line in data:
        # Drop the trailing newline before converting to int.
        group_train.append(int(line.split("\n")[0]))
group_valid = []
with open("mq2008.vali.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_valid.append(int(line.split("\n")[0]))
group_test = []
with open("mq2008.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append(int(line.split("\n")[0]))
# x_train/y_train, x_valid/y_valid and x_test come from code outside this excerpt.
train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)
# Only the train and validation matrices get group sizes; group_test is not applied here.
train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)
params = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}
# Train for 4 rounds, reporting the metric on the validation set.
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4, evals=[(valid_dmatrix, 'validation')])
pred = xgb_model.predict(test_dmatrix)
# Report the shapes of the prepared splits.
print('X_train.shape', X_train.shape)
print('X_val.shape', X_val.shape)
print('X_test.shape', X_test.shape)
print('y_train.shape', y_train.shape)
print('y_val.shape', y_val.shape)
# Print column mismatches between the splits (empty sets mean no mismatch in that direction).
print(set(X_train.columns) - set(X_test.columns))
print(set(X_val.columns) - set(X_test.columns))
print(set(X_train.columns) - set(X_val.columns))
#%% Prepare matrices
print('Creating Dmatrices')
train_dmatrix = DMatrix(X_train.values, y_train)
valid_dmatrix = DMatrix(X_val.values, y_val)
test_dmatrix = DMatrix(X_test.values)
train_dmatrix.set_group(train_group_sizes)
valid_dmatrix.set_group(val_group_sizes)
test_dmatrix.set_group(test_group_sizes)
# The frames are no longer referenced once the DMatrix objects exist.
del X_train
del X_val
del X_test
#%% XGBOOST MODEL: POSITION-BASED
#XGB model using efficient data structure Dmatrix
params = {
    'max_depth': 3,
    'min_child_weight': 10,
    'learning_rate': 0.3,
    'subsample': 0.5,
    'colsample_bytree': 0.6,
print("data load done!!!")
# Parameters passed to train() for the pairwise ranking model below.
xgb_rank_params2 = {
    'bst:max_depth': 5,  # depth of the trees built; larger values overfit more easily
    'bst:eta': 0.1,  # acts like a learning rate
    'silent': 0,  # set to 1 to suppress run-time output; 0 is recommended
    'objective': 'rank:pairwise',
    'nthread': 8,  # number of CPU threads
    'eval_metric': 'ndcg@10-',
    'metric': 'ndcg@10-'
}
# Build the grouped training matrix from train_dict.
train_group_list, train_data_list, train_target_list = data_generation(
    train_dict)
xgbTrain = DMatrix(train_data_list, label=train_target_list)
xgbTrain.set_group(train_group_list)
# Build the grouped evaluation matrix from eval_dict.
eval_group_list, eval_data_list, eval_target_list = data_generation(eval_dict)
xgbEval = DMatrix(eval_data_list, label=eval_target_list)
xgbEval.set_group(eval_group_list)
# get evallist
evallist = [(xgbTrain, 'train'), (xgbEval, 'eval')]
# train
rankModel = train(xgb_rank_params2, xgbTrain, num_boost_round=5, evals=evallist)
# get predict
class xgbLtr:
    """Train, persist, plot and evaluate an XGBoost ``rank:pairwise`` model."""

    def __init__(self):
        """Derive the data file paths and the model location from ``TASK``."""
        self.train_file = DATA_PATH + TASK + ".train"
        self.valid_file = DATA_PATH + TASK + ".valid"
        self.test_file = DATA_PATH + TASK + ".test"
        self.model_path = conf.xgb_rank_model
        self.model_name = TASK + "_xgb.model"

    @staticmethod
    def _parse_features(tokens, size):
        """Build a dense list of ``size`` values from 1-based ``index:value`` tokens."""
        vector = [0] * size
        for token in tokens:
            index, raw = token.split(":")
            # Keep integers as int; anything int() rejects is parsed as float.
            try:
                value = int(raw)
            except ValueError:
                value = float(raw)
            vector[int(index) - 1] = value
        return vector

    def load_data(self):
        """Run ``trans_data`` and load the three splits as grouped DMatrix objects."""
        print("train data file: %s" % (DATA_PATH))
        trans_data(DATA_PATH)
        x_train, y_train = load_svmlight_file(self.train_file)
        x_valid, y_valid = load_svmlight_file(self.valid_file)
        x_test, y_test = load_svmlight_file(self.test_file)
        print("train data shape: [%d, %d]" %
              (x_train.shape[0], x_train.shape[1]))
        group_train = load_group_data(DATA_PATH + TASK + ".train.group")
        group_valid = load_group_data(DATA_PATH + TASK + ".valid.group")
        group_test = load_group_data(DATA_PATH + TASK + ".test.group")
        self.train_dmatrix = DMatrix(x_train, y_train)
        self.valid_dmatrix = DMatrix(x_valid, y_valid)
        self.test_dmatrix = DMatrix(x_test, y_test)
        self.train_dmatrix.set_group(group_train)
        self.valid_dmatrix.set_group(group_valid)
        self.test_dmatrix.set_group(group_test)

    def train(self):
        """Train the model, then save the dump, model, joblib copy and figure."""
        extra_pam = {
            'verbosity': 0,
            'validate_parameters': True,
            'subsample': 0.1,
            'lambda': 0.6,
            'alpha': 0.8,
            'early_stopping_rounds': 1
        }
        params = {
            'booster': 'gbtree',
            'objective': 'rank:pairwise',
            'eta': 1e-3,
            'gamma': 10.0,
            'min_child_weight': 0.1,
            'max_depth': 6,
            'eval_metric': ['logloss']
        }  # ndcg@1, logloss,auc
        params.update(extra_pam)
        xgb_model = xgb.train(params,
                              self.train_dmatrix,
                              num_boost_round=100,
                              evals=[(self.train_dmatrix, 'train'),
                                     (self.valid_dmatrix, 'valid'),
                                     (self.test_dmatrix, 'test')])
        print("save model to %s" % (self.model_path))
        xgb_model.dump_model(self.model_path + self.model_name + ".txt")
        xgb_model.save_model(self.model_path + self.model_name)
        joblib.dump(xgb_model, self.model_path + '/xgb_clf.m')
        # save figures
        plt.clf()
        xgb.plot_importance(xgb_model)
        plt.savefig(self.model_path + '/feature_importance.png',
                    dpi=800,
                    format='png')

    def plotXgboostTree(self):
        """Render the first four trees of the saved model to PNG files."""
        xgb_model = xgb.Booster(model_file=self.model_path + self.model_name)
        for i in range(4):
            xgb.plot_tree(xgb_model,
                          num_trees=i,
                          fmap='./get_jdcv_data/feature.fmap')
            fig = plt.gcf()
            fig.set_size_inches(150, 100)
            fig.savefig('xgb_tree_' + str(i) + '.png')

    def predict(self, vec):
        """Score one svmlight-style line with the saved model.

        Parameters
        ----------
        vec : str
            Whitespace-separated line; the first two fields are skipped and
            the rest are ``index:value`` pairs with 1-based indices (at most
            30 features).

        Returns
        -------
        The first element of the booster's prediction for that row.
        """
        print("xgb model file: %s" % (conf.xgb_rank_model))
        self.xgb_model = xgb.Booster(model_file=conf.xgb_rank_model +
                                     self.model_name)
        feature_vector = self._parse_features(vec.split()[2:], 30)
        feature_csr = sparse.csr_matrix(np.array(feature_vector))
        return self.xgb_model.predict(DMatrix(feature_csr))[0]

    def test(self,
             fea_num=24,
             topk=1,
             path=conf.rank_data_file + "valid.txt"):
        """Print per-query rankings and the mean nDCG@``topk`` for a data file.

        Rows are grouped by their second field; groups with a single row, or
        whose labels are all identical, do not contribute to the mean.

        Parameters
        ----------
        fea_num : int
            Length of the dense feature vector built for every row.
        topk : int
            Cut-off passed to ``cal_ndcg``.
        path : str
            File with one ``label group index:value ...`` row per line.
        """
        xgb_dict = parse_xgb_dict(conf.xgb_rank_model + self.model_name +
                                  ".txt")
        xgb_model = xgb.Booster(model_file=conf.xgb_rank_model +
                                self.model_name)
        print("test file: %s\ttree number: %d" % (path, len(xgb_dict)))

        # Context manager closes the file; blank lines are skipped because
        # they have no group field to index.
        with open(path, encoding="utf8") as f:
            rows = [line.strip().split() for line in f if line.strip()]

        group_data = {}
        for row in rows:
            group_data.setdefault(row[1], []).append(row)
        group_data = {k: v for k, v in group_data.items() if len(v) > 1}

        ndcgs = []
        for datas in group_data.values():
            score_label = []
            for ele in datas:
                label = int(ele[0])
                feature_vector = self._parse_features(ele[2:], fea_num)
                feature_csr = sparse.csr_matrix(np.array(feature_vector))
                # xgboost's built-in predict function
                score = xgb_model.predict(DMatrix(feature_csr))[0]
                # prediction function obtained by parsing the .txt model dump
                #score = predict_proba(xgb_dict, feature)
                score_label.append((score, label))
            sorted_score_label = sorted(score_label,
                                        key=lambda d: d[0],
                                        reverse=True)
            label_list = [label for score, label in sorted_score_label]
            dcg, idcg, ndcg = cal_ndcg(label_list, topk)
            # A query whose labels are all equal says nothing about ranking.
            if len(set(label_list)) <= 1:
                continue
            ndcgs.append(ndcg)
            print([(round(k, 3), v) for k, v in sorted_score_label],
                  round(ndcg, 3))

        # Avoid NumPy's empty-mean warning; the reported value is still nan.
        ndcgs_mean = np.mean(np.array(ndcgs)) if ndcgs else float("nan")
        print("topk: %d\tndcgs mean: %.3f" % (topk, ndcgs_mean))
def fit(self,
        X,
        y,
        sample_weight=None,
        eval_set=None,
        eval_metric=None,
        early_stopping_rounds=None,
        verbose=True,
        xgb_model=None):
    """Fit the gradient boosting ranking model.

    Parameters
    ----------
    X : array_like
        Feature matrix with the first feature containing a group indicator.
        Validated with ``check_X_y`` (dense input, numeric ``y``).
    y : array_like
        Labels.
    sample_weight : array_like
        Instance weights.
    eval_set : list, optional
        Accepted for interface compatibility; this implementation does not
        forward it to ``train``.
    eval_metric : str, callable, optional
        If a str, it is set as the ``eval_metric`` training parameter. If
        callable, it is passed to ``train`` as ``feval`` instead.
    early_stopping_rounds : int
        Forwarded to ``train``. When given, ``best_score``,
        ``best_iteration`` and ``best_ntree_limit`` are copied from the
        booster onto this estimator.
    verbose : bool
        Forwarded to ``train`` as ``verbose_eval``.
    xgb_model : str
        File name of a stored xgb model or a 'Booster' instance to load
        before training (allows training continuation).

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y, accept_sparse=False, y_numeric=True)
    group_sizes, _, X_features, y, sample_weight = _preprare_data_in_groups(
        X, y, sample_weight)

    params = self.get_xgb_params()
    evals_result = {}

    # A callable metric becomes `feval`; a string metric becomes a parameter.
    if callable(eval_metric):
        feval, eval_metric = eval_metric, None
    else:
        feval = None
        if eval_metric is not None:
            params.update({'eval_metric': eval_metric})

    dmatrix_kwargs = {'label': y, 'missing': self.missing}
    if sample_weight is not None:
        dmatrix_kwargs['weight'] = sample_weight
    train_dmatrix = DMatrix(X_features, **dmatrix_kwargs)
    train_dmatrix.set_group(group_sizes)

    self._Booster = train(params,
                          train_dmatrix,
                          self.n_estimators,
                          early_stopping_rounds=early_stopping_rounds,
                          evals_result=evals_result,
                          obj=None,
                          feval=feval,
                          verbose_eval=verbose,
                          xgb_model=xgb_model)

    if evals_result:
        for set_name, metrics in evals_result.items():
            first_metric = list(metrics.keys())[0]
            evals_result[set_name][first_metric] = metrics[first_metric]
        self.evals_result = evals_result

    if early_stopping_rounds is not None:
        booster = self._Booster
        self.best_score = booster.best_score
        self.best_iteration = booster.best_iteration
        self.best_ntree_limit = booster.best_ntree_limit
    return self
def cv_eval_lambdaMART_in_XGBoost(para_dict=None):
    """Run fold-wise training/evaluation of an XGBoost ``rank:ndcg`` model.

    For every fold directory ``dir_data + 'Fold<k>/'`` the train/vali/test
    files are converted with ``load_data_xgboost``, a model is trained for
    500 rounds (with the validation set as watch list when ``do_validation``
    is set), pickled under the model output directory, and evaluated on the
    test split with ``cal_nDCG_at_ks``. The per-fold nDCG values are summed
    and finally divided by the number of folds.

    Parameters
    ----------
    para_dict : dict
        Must provide the keys read below ('debug', 'dataset', 'dir_data',
        'model', 'min_docs', 'min_rele', 'cutoffs', 'do_validation',
        'validation_k', 'do_log', 'eta', 'gamma', 'min_child_weight',
        'max_depth', 'tree_method'). The default of None is not usable: the
        first subscript would fail.

    Returns
    -------
    l2r_cv_avg_scores : numpy array
        One averaged nDCG value per entry of ``cutoffs``.
    """
    # common parameters across different models
    debug, dataset, dir_data, model = para_dict['debug'], para_dict[
        'dataset'], para_dict['dir_data'], para_dict['model']
    min_docs, min_rele, cutoffs = para_dict['min_docs'], para_dict[
        'min_rele'], para_dict['cutoffs']
    do_validation, validation_k, do_log = para_dict[
        'do_validation'], para_dict['validation_k'], para_dict['do_log']
    eta, gamma, min_child_weight, max_depth, tree_method = para_dict[
        'eta'], para_dict['gamma'], para_dict['min_child_weight'], para_dict[
            'max_depth'], para_dict['tree_method']
    # Debug runs use 2 folds, regular runs 5.
    if debug:
        fold_num = 2
    else:
        fold_num = 5
    model_output = update_output_setting(para_dict=para_dict)
    if do_log:  # open log file
        # NOTE(review): stdout stays redirected and the file is never closed here.
        sys.stdout = open(model_output + 'log.txt', "w")
    time_begin = datetime.datetime.now()  # timing
    l2r_cv_avg_scores = np.zeros(len(cutoffs))  # fold average
    for fold_k in range(1, fold_num + 1):
        print(
            '\nFold-',
            fold_k)  # fold-wise data preparation plus certain light filtering
        dir_fold_k = dir_data + 'Fold' + str(fold_k) + '/'
        ori_file_train, ori_file_vali, ori_file_test = dir_fold_k + 'train.txt', dir_fold_k + 'vali.txt', dir_fold_k + 'test.txt'
        file_train_data, file_train_group = load_data_xgboost(
            ori_file_train,
            min_docs=min_docs,
            min_rele=min_rele,
            dataset=dataset)
        file_vali_data, file_vali_group = load_data_xgboost(ori_file_vali,
                                                            min_docs=min_docs,
                                                            min_rele=min_rele,
                                                            dataset=dataset)
        file_test_data, file_test_group = load_data_xgboost(ori_file_test,
                                                            min_docs=min_docs,
                                                            min_rele=min_rele,
                                                            dataset=dataset)
        x_train, y_train = load_svmlight_file(file_train_data)
        group_train = load_group_data(file_train_group)
        train_dmatrix = DMatrix(x_train, y_train)
        train_dmatrix.set_group(group_train)
        if do_validation:
            x_valid, y_valid = load_svmlight_file(file_vali_data)
            group_valid = load_group_data(file_vali_group)
            valid_dmatrix = DMatrix(x_valid, y_valid)
            valid_dmatrix.set_group(group_valid)
        x_test, y_test = load_svmlight_file(file_test_data)
        group_test = load_group_data(file_test_group)
        # The test matrix carries no labels or groups; they are passed to
        # the metric function separately below.
        test_dmatrix = DMatrix(x_test)
        """ possible settings of params """
        # params = {'objective': 'rank:pairwise', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}
        # ndcg
        # params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}
        #params = {'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6, 'eval_metric': 'ndcg@10'}
        params = {
            'objective': 'rank:ndcg',
            'eta': eta,
            'gamma': gamma,
            'min_child_weight': min_child_weight,
            'max_depth': max_depth,
            'eval_metric': 'ndcg@10-',
            'tree_method': tree_method
        }  # if idealDCG=0, then 0
        # map
        # params = {'objective': 'rank:map', 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}
        if do_validation:
            fold_xgb_model = xgb.train(params,
                                       train_dmatrix,
                                       num_boost_round=500,
                                       evals=[(valid_dmatrix, 'validation')])
        else:
            fold_xgb_model = xgb.train(params,
                                       train_dmatrix,
                                       num_boost_round=500)
        fold_checkpoint = '-'.join(['Fold', str(fold_k)])  # buffer model
        save_dir = model_output + fold_checkpoint + '/'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        # Persist the fold's model as a pickle next to the other outputs.
        with open(save_dir + '_'.join(['fold', str(fold_k), 'model']) + '.dat',
                  'wb') as model_file:
            pickle.dump(fold_xgb_model, model_file)
        pred = fold_xgb_model.predict(test_dmatrix)  # fold-wise performance
        fold_avg_ndcg_at_ks = cal_nDCG_at_ks(all_std_labels=y_test,
                                             all_preds=pred,
                                             group=group_test,
                                             ks=cutoffs)
        performance_list = [model + ' Fold-' + str(fold_k)]
        for i, co in enumerate(cutoffs):
            performance_list.append('nDCG@{}:{:.4f}'.format(
                co, fold_avg_ndcg_at_ks[i]))
        performance_str = '\t'.join(performance_list)
        print('\n\t', performance_str)
        l2r_cv_avg_scores = np.add(
            l2r_cv_avg_scores,
            fold_avg_ndcg_at_ks)  # sum for later cv-performance
    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - time_begin)
    print('Elapsed time:\t', elapsed_time_str + "\n")
    print()  # begin to print either cv or average performance
    # Turn the per-fold sums into the mean over all folds.
    l2r_cv_avg_scores = np.divide(l2r_cv_avg_scores, fold_num)
    if do_validation:
        eval_prefix = str(fold_num) + '-fold cross validation scores:'
    else:
        eval_prefix = str(fold_num) + '-fold average scores:'
    print(model, eval_prefix,
          to_output_str(list_scores=l2r_cv_avg_scores, list_cutoffs=cutoffs))
    return l2r_cv_avg_scores
# NOTE(review): these first statements presumably sit inside a
# `with open(...) as f:` block that starts before this excerpt - confirm.
data = f.readlines()
for line in data:
    # Drop the trailing newline before converting to int.
    group_valid.append(int(line.split("\n")[0]))
group_test = []
with open("data_lambdaMART\\qac.test.group", "r") as f:
    data = f.readlines()
    for line in data:
        group_test.append(int(line.split("\n")[0]))
# create the train/validation/test DMatrix objects required by xgboost's lambdaMART implementation
train_dmatrix = DMatrix(x_train, y_train)
valid_dmatrix = DMatrix(x_valid, y_valid)
test_dmatrix = DMatrix(x_test)
# set the groups for the training and validation sets (group_test is not applied here)
train_dmatrix.set_group(group_train)
valid_dmatrix.set_group(group_valid)
# LambdaMART parameters
params = {'objective': 'rank:pairwise', 'n_estimators': 300, 'eta': 0.1, 'gamma': 1.0, 'min_child_weight': 0.1, 'max_depth': 6}
# create lambdaMart with the aforementioned parameters
# (the number of boosting rounds actually requested from xgb.train is 4)
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4, evals=[(valid_dmatrix, 'validation')])
# run the prediction process on the test set
predictions = xgb_model.predict(test_dmatrix)
# plot the importance of the features of the trained model
xgb.plot_importance(xgb_model)
def _dmat_init(group, **params):
    """Create a DMatrix from keyword arguments and attach group sizes to it.

    Parameters
    ----------
    group : passed unchanged to ``DMatrix.set_group``.
    **params : forwarded unchanged to the ``DMatrix`` constructor.

    Returns
    -------
    The newly constructed ``DMatrix`` after ``set_group`` has been called.
    """
    dmatrix = DMatrix(**params)
    dmatrix.set_group(group)
    return dmatrix
def fit(self, X, y, group=None, eval_metric=None, sample_weight=None,
        early_stopping_rounds=None, verbose=True):
    """
    Fit the gradient boosting ranking model.

    Parameters
    ----------
    X : array_like
        Feature matrix
    y : array_like
        Labels
    group : list, optional
        Group number list. All X and y will be taken as a single group
        (of size ``X.shape[0]``) when group is not provided.
        All ranking is valid only within each group.
    eval_metric : str, callable, optional
        If a str, it is stored under ``'eval_metric'`` in the parameter
        dict given to ``train``. If callable, it is passed to ``train``
        as ``feval`` instead. The call signature of a custom metric is
        func(y_predicted, y_true) where y_true will be a DMatrix object
        such that you may need to call the get_label method. It must
        return a str, value pair where the str is a name for the
        evaluation and value is the value of the evaluation function.
        This objective is always minimized.
    sample_weight : array_like
        Instance weights; only passed to the training ``DMatrix`` when
        it is not None.
    early_stopping_rounds : int
        Forwarded to ``train``. When it is not None, ``best_score``,
        ``best_iteration`` and ``best_ntree_limit`` are copied from the
        trained booster onto ``self`` after training.
    verbose : bool
        Forwarded to ``train`` as ``verbose_eval``.

    Returns
    -------
    self

    Notes
    -----
    This method does not accept an ``eval_set`` and passes no ``evals``
    argument to ``train``; ``xgb_model`` is always passed as None.
    After fitting, ``self.objective`` is overwritten with
    ``params["objective"]``.
    """
    # Identity test instead of ``group == None``: for a NumPy array the
    # equality is evaluated element-wise and cannot be used as a condition.
    if group is None:
        group = [X.shape[0]]

    params = self.get_xgb_params()

    if callable(self.objective):
        obj = _objective_decorator(self.objective)
        # Placeholder objective for the parameter dict; the decorated
        # callable itself goes to ``train`` through ``obj``. It has to be
        # written to ``params`` (the dict handed to ``train``); the former
        # target ``xgb_options`` is never defined in this method.
        params["objective"] = "rank:pairwise"
    else:
        obj = None

    evals_result = {}
    # A callable metric is routed through ``feval``; a string metric is
    # routed through the parameter dict.
    feval = eval_metric if callable(eval_metric) else None
    if eval_metric is not None and not callable(eval_metric):
        params.update({'eval_metric': eval_metric})

    if sample_weight is not None:
        train_dmatrix = DMatrix(X, label=y, weight=sample_weight,
                                missing=self.missing)
    else:
        train_dmatrix = DMatrix(X, label=y, missing=self.missing)
    train_dmatrix.set_group(group)

    self.objective = params["objective"]

    self._Booster = train(params, train_dmatrix, self.n_estimators,
                          early_stopping_rounds=early_stopping_rounds,
                          evals_result=evals_result, obj=obj, feval=feval,
                          verbose_eval=verbose, xgb_model=None)

    if evals_result:
        for eval_name, metrics in evals_result.items():
            first_metric = list(metrics.keys())[0]
            evals_result[eval_name][first_metric] = metrics[first_metric]
        self.evals_result = evals_result

    if early_stopping_rounds is not None:
        self.best_score = self._Booster.best_score
        self.best_iteration = self._Booster.best_iteration
        self.best_ntree_limit = self._Booster.best_ntree_limit

    return self
# 'nthread': 4, 'eval_metric': 'ndcg' } # generate training dataset # 一共2组*每组3条,6条样本,特征维数是2 n_group = 2 n_choice = 3 dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2]) # numpy.random.choice(a, size=None, replace=True, p=None) dtarget = np.array( [np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten() # n_group用于表示从前到后每组各自有多少样本,前提是样本中各组是连续的,[3,3]表示一共6条样本中前3条是第一组,后3条是第二组 dgroup = np.array([n_choice for i in range(n_group)]).flatten() # concate Train data, very import here ! xgbTrain = DMatrix(dtrain, label=dtarget) xgbTrain.set_group(dgroup) # generate eval data dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2]) xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget) xgbTrain_eval.set_group(dgroup) evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')] # train model # xgb_rank_params1加上 evals 这个参数会报错,还没找到原因 # rankModel = train(xgb_rank_params1,xgbTrain,num_boost_round=10) rankModel = train(xgb_rank_params2, xgbTrain, num_boost_round=20, evals=evallist) # test dataset dtest = np.random.uniform(0, 100, [n_group * n_choice, 2]) dtestgroup = np.array([n_choice for i in range(n_group)]).flatten()
def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
        early_stopping_rounds=None, verbose=True, xgb_model=None,
        callbacks=None, learning_rates=None):
    """
    Train the gradient boosting ranker on grouped data.

    Parameters
    ----------
    X : array_like
        Feature matrix with the first feature containing a group indicator.
        It is split into group sizes and features by
        ``self._preprare_data_in_groups``.
    y : array_like
        Labels.
    sample_weight : array_like
        Instance weights for the training data; only passed to the
        training ``DMatrix`` when it is not None.
    eval_set : list, optional
        A list of (X, y) pairs used as validation sets. Each pair is
        prepared the same way as the training data and registered with
        ``train`` under the names ``eval_0``, ``eval_1``, ...
    eval_metric : str, callable, optional
        If a str, should be a built-in evaluation metric to use; it is
        stored under ``'eval_metric'`` in the parameter dict.
        If callable, a custom evaluation metric handed to ``train`` as
        ``feval``. The call signature is func(y_predicted, y_true) where
        y_true will be a DMatrix object such that you may need to call
        the get_label method. It must return a str, value pair where the
        str is a name for the evaluation and value is the value of the
        evaluation function. This objective is always minimized.
    early_stopping_rounds : int
        Activates early stopping; forwarded to ``train``. When it is not
        None, ``best_score``, ``best_iteration`` and ``best_ntree_limit``
        are copied from the booster onto ``self``.
    verbose : bool
        Forwarded to ``train`` as ``verbose_eval``.
    xgb_model : file name of stored xgb model or 'Booster' instance
        Forwarded to ``train`` (allows training continuation).
    callbacks : list of callback functions
        Forwarded to ``train``.
    learning_rates : list or function (deprecated - use callback API instead)
        Forwarded to ``train``.

    Returns
    -------
    self
    """
    group_sizes, _, train_features, train_labels, _, _ = \
        self._preprare_data_in_groups(X, y)

    params = self.get_xgb_params()

    obj = None
    if callable(self.objective):
        obj = _objective_decorator(self.objective)
        # Dummy, not used when a custom objective is given
        params["objective"] = "binary:logistic"

    evals_result = {}

    # A callable metric travels through ``feval``; a string metric is
    # placed in the parameter dict instead.
    if callable(eval_metric):
        feval = eval_metric
    else:
        feval = None
        if eval_metric is not None:
            params.update({'eval_metric': eval_metric})

    if sample_weight is None:
        train_dmatrix = DMatrix(train_features, label=train_labels,
                                missing=self.missing)
    else:
        train_dmatrix = DMatrix(train_features, label=train_labels,
                                weight=sample_weight, missing=self.missing)
    train_dmatrix.set_group(group_sizes)

    # First prepare every validation pair, then build the matrices.
    eval_groups = []
    eval_pairs = []
    if eval_set:
        for entry in eval_set:
            sizes_i, _, features_i, labels_i, _, _ = \
                self._preprare_data_in_groups(entry[0], entry[1])
            eval_groups.append(sizes_i)
            eval_pairs.append([features_i, labels_i])

    if eval_pairs:
        evals = []
        for position, (sizes_i, pair) in enumerate(zip(eval_groups,
                                                       eval_pairs)):
            # Validation sets carry no per-instance weights.
            eval_dmatrix = DMatrix(data=pair[0], label=pair[1],
                                   missing=self.missing, weight=None,
                                   nthread=self.n_jobs)
            eval_dmatrix.set_group(sizes_i)
            evals.append((eval_dmatrix, "eval_{}".format(position)))
    else:
        evals = ()

    self._Booster = train(
        params,
        train_dmatrix,
        self.n_estimators,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
        evals_result=evals_result,
        obj=obj,
        feval=feval,
        verbose_eval=verbose,
        xgb_model=xgb_model,
        learning_rates=learning_rates,
        callbacks=callbacks,
    )

    if evals_result:
        for eval_name, metrics in evals_result.items():
            first_metric = list(metrics.keys())[0]
            evals_result[eval_name][first_metric] = metrics[first_metric]
        self.evals_result = evals_result

    if early_stopping_rounds is not None:
        booster = self._Booster
        self.best_score = booster.best_score
        self.best_iteration = booster.best_iteration
        self.best_ntree_limit = booster.best_ntree_limit

    return self
def main_xgb(fold_offset):
    """Train a pairwise XGBoost ranker on each of 10 KFold splits and
    store its normalised predictions.

    The function reads several names from the enclosing scope that are
    not defined here (``all_races_train``, ``use_history``, ``test_src``,
    ``in_data``, ``in_meta``, ``all_races_rank_test``,
    ``all_races_query_test``, ``predict_races_target``,
    ``get_race_gets``, ``norm_racedata``) and writes into
    ``test_validation_regression`` and ``predict_validation_regression``.

    Parameters
    ----------
    fold_offset : added to the fold index to pick the column of the two
        result containers that receives this fold's predictions.

    Returns
    -------
    None
    """

    def _collect(races):
        # Hand three fresh lists to get_race_gets (presumably filled in
        # place - verify against its definition) and return them.
        rank_rows, query_sizes, targets = [], [], []
        get_race_gets(races, rank_rows, query_sizes, targets)
        return rank_rows, query_sizes, targets

    full_rank, full_query, full_target = _collect(all_races_train)
    full_rank = np.array(full_rank)
    full_query = np.array(full_query)
    full_target = np.array(full_target)

    categorical_feature = [0, 1, 2, 3, 4]
    if use_history:
        categorical_feature = categorical_feature + list(range(6, 21))

    # Width of each categorical column's expansion: the largest value seen
    # in that column over the whole training set.
    categorical_dim = [
        int(np.max(full_rank[:, col])) for col in categorical_feature
    ]
    del full_rank, full_query, full_target

    def get_matrix(mat):
        # Expand every categorical column of `mat` into indicator columns
        # and copy the remaining columns through unchanged.
        out_shape = list(mat.shape)
        out_shape[1] += int(np.sum(categorical_dim)) - len(categorical_feature)
        expanded = np.zeros(tuple(out_shape))
        n_rows = out_shape[0]
        n_cols = out_shape[1]
        dst_col = 0
        src_col = 0
        while dst_col < n_cols:
            if src_col in categorical_feature:
                width = categorical_dim[categorical_feature.index(src_col)]
                # NOTE(review): a value equal to `width` lands on the first
                # slot of the following feature - confirm the encoding.
                for row in range(n_rows):
                    expanded[row, dst_col + int(mat[row, src_col])] = 1
                dst_col += width
            else:
                expanded[:, dst_col] = mat[:, src_col]
                dst_col += 1
            src_col += 1
        return expanded

    if len(test_src) > 0:
        all_races_rank_test_x = get_matrix(all_races_rank_test)
    if len(in_data) != 0 and len(in_meta) != 0:
        predict_races_target_x = get_matrix(predict_races_target)

    splitter = KFold(n_splits=10)
    for fold_id, (train_index, test_index) in enumerate(
            splitter.split(all_races_train)):
        train_races = all_races_train[train_index]
        valid_races = all_races_train[test_index]

        train_rank, train_query, train_target = _collect(train_races)
        valid_rank, valid_query, valid_target = _collect(valid_races)

        train_rank = get_matrix(np.array(train_rank))
        train_query = np.array(train_query)
        train_target = np.array(train_target)
        valid_rank = get_matrix(np.array(valid_rank))
        valid_query = np.array(valid_query)
        valid_target = np.array(valid_target)

        xgb_params = {
            'objective': 'rank:pairwise',
            'eta': 0.1,
            'gamma': 0.0001,
            'min_child_weight': 0.1,
            'max_depth': 6
        }

        xgtrain = DMatrix(train_rank, train_target)
        xgtrain.set_group(train_query)
        xgvalid = DMatrix(valid_rank, valid_target)
        xgvalid.set_group(valid_query)
        # Drop the per-fold arrays before training starts.
        del train_races, valid_races, train_rank, train_target, \
            train_query, valid_rank, valid_target, valid_query

        xgb_clf = xgb.train(xgb_params,
                            xgtrain,
                            num_boost_round=10,
                            evals=[(xgvalid, 'validation')])
        del xgtrain, xgvalid

        # Column of the result containers that belongs to this fold.
        fold_column = fold_offset + fold_id

        if len(test_src) > 0:
            dst = norm_racedata(
                xgb_clf.predict(DMatrix(all_races_rank_test_x)),
                all_races_query_test)
            for dst_ind in range(len(dst)):
                test_validation_regression[dst_ind][fold_column] = \
                    dst[dst_ind]
            cur_pos = 0  # never read within this function

        if len(in_data) != 0 and len(in_meta) != 0:
            dst = norm_racedata(
                xgb_clf.predict(DMatrix(predict_races_target_x)),
                [len(predict_races_target_x)])
            for dst_ind in range(len(dst)):
                predict_validation_regression[dst_ind][fold_column] = \
                    dst[dst_ind]
def _dmat_init(data, labels, **params):
    """Prepare grouped data and wrap it in a DMatrix with its group sizes.

    Parameters
    ----------
    data, labels : passed to ``_preprare_data_in_groups``, whose result is
        unpacked as five values; the first (group sizes), third (features)
        and fourth (labels) are used here.
    **params : extra keyword arguments forwarded to the ``DMatrix``
        constructor.

    Returns
    -------
    The ``DMatrix`` built from the prepared features and labels, after
    ``set_group`` has been called with the prepared group sizes.
    """
    prepared = _preprare_data_in_groups(data, labels)
    group_sizes, _, features, targets, _ = prepared
    dmatrix = DMatrix(features, targets, **params)
    dmatrix.set_group(group_sizes)
    return dmatrix