-
Notifications
You must be signed in to change notification settings - Fork 0
/
xgboost_train.py
79 lines (62 loc) · 2.25 KB
/
xgboost_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from __future__ import print_function
from pprint import pprint
import os
import csv
import cPickle as pickle
import numpy as np
import scipy as sp
from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path
from util_eval import multiclass_log_loss, multiclass_accuracy
import xgboost as xgb
# Load the pickled (features, labels) pair. The context manager closes the
# file handle deterministically, even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(full_path("train_xs_ys_np.p"), "rb") as train_file:
    (all_xs, all_ys) = pickle.load(train_file)
# Shift every label down by one.
# NOTE(review): presumably the labels are stored 1-based and must be 0-based
# for the multi-class objective used below -- confirm against the data builder.
all_ys = all_ys - 1.0
# Hold out a fixed fraction of the samples for evaluation.
test_percentage = 0.1
all_num = len(all_ys)
train_num = int(round((1. - test_percentage) * all_num))
test_num = all_num - train_num
# Draw a reproducible permutation of the sample indices (fixed seed).
random_idx = np.arange(all_num)
np.random.seed(0)
np.random.shuffle(random_idx)
# Slice the permutation first, then index the data once per split. This
# selects exactly the same rows as shuffling the full arrays and slicing
# afterwards, but avoids building a full shuffled copy for each of the four
# outputs.
train_idx = random_idx[:train_num]
test_idx = random_idx[train_num:]
train_xs = all_xs[train_idx]
train_ys = all_ys[train_idx]
test_xs = all_xs[test_idx]
test_ys = all_ys[test_idx]
# Booster hyper-parameters: a 5-class soft-probability objective scored with
# multi-class log loss.
params = {
    'bst:max_depth': 5,
    'bst:eta': 0.1,
    'silent': 0,
    'objective': 'multi:softprob',
    'num_class': 5,
    'nthread': 16,  # change this!
    'eval_metric': 'mlogloss',
    'subsample': 0.01,
}
# Options for the training loop itself: the cap on boosting rounds and the
# early-stopping patience.
ops = dict(num_boost_round=2000, early_stopping_rounds=10)
# Wrap each split (features plus labels) in an xgboost DMatrix, train first.
dtrain, dtest = (xgb.DMatrix(features, label=labels)
                 for features, labels in ((train_xs, train_ys),
                                          (test_xs, test_ys)))
print("start training")
# xgboost uses the LAST entry of `evals` to decide early stopping. The
# held-out set therefore has to come last; with the training set last, the
# stopping rule (and the recorded best iteration) would follow the training
# loss instead of the held-out loss.
bst = xgb.train(params.items(),
                dtrain,
                num_boost_round=ops['num_boost_round'],
                evals=[(dtrain, 'train'), (dtest, 'eval')],
                early_stopping_rounds=ops['early_stopping_rounds'])
# Write the trained booster out twice: once via dump_model (.txt) and once via
# save_model (.model). Both file names embed the configured round cap and the
# early-stopping patience.
name_fields = (ops['num_boost_round'], ops['early_stopping_rounds'])
bst.dump_model('xgboost_round_%s_%s.txt' % name_fields)
bst.save_model('xgboost_round_%s_%s.model' % name_fields)
# Rebuild the held-out matrix without labels for prediction.
dtest = xgb.DMatrix(test_xs)
# Predict with the tree limit recorded on the booster as its best.
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
test_ys_preds = np.array(ypred)
# Report each evaluation metric on the held-out split, log loss first.
for metric in (multiclass_log_loss, multiclass_accuracy):
    print(metric(test_ys, test_ys_preds))