forked from canivel/bnp_paribas
/
ensemble_1.py
144 lines (111 loc) · 7.12 KB
/
ensemble_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
'''
This script generates meta-probabilities and bumping probabilities, trains three separate models, ensembles their predictions,
and trains offset values to construct the vector of predicted classes for the final submission.
The offset learning idea was borrowed (with substantial changes) from a public script by Michael Hartmann:
https://www.kaggle.com/zeroblue/prudential-life-insurance-assessment/xgboost-with-optimized-offsets/run/133836
However, as opposed to that script, offsets were trained on STACKED training predictions, which reduces the chance
of overfitting. Moreover, the initial values of offsets were chosen based on the discrepancies between test predictions
and the distribution of labels in training data estimated at quantile values.
'''
import numpy as np
from scipy.optimize import fmin_powell
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
# load user modules
import utils
from wrappers_xgboost import XGBoost_binary, XGBoost_regressor, XGBoost_multilabel
n_folds = 4  # number of folds for generating meta-features
def apply_offset(data, bin_offset, sv, scorer=utils.eval_wrapper):
    """Shift the predictions of one integer bin by an offset and score the result.

    ``data`` is indexed along its first dimension as row 0 = raw predictions,
    row 1 = offset-adjusted predictions, row 2 = labels.  Every sample whose
    raw prediction truncates (``astype(int)``) to ``sv`` has its row-1 entry
    overwritten with ``raw + bin_offset``.  ``data`` is modified in place, so
    the row-1 values written for other bins by earlier calls are kept.

    Args:
        data: array with the three-row layout described above.
        bin_offset: value added to the raw predictions of the selected bin.
        sv: integer bin; selects samples where the truncated raw prediction
            equals ``sv``.
        scorer: callable invoked as ``scorer(adjusted, labels)``.

    Returns:
        Whatever ``scorer`` returns for the adjusted predictions and labels.
    """
    # Build the bin mask once and reuse it for both the read and the write;
    # this function is evaluated repeatedly by the offset optimiser.
    in_bin = data[0].astype(int) == sv
    data[1, in_bin] = data[0, in_bin] + bin_offset
    return scorer(data[1], data[2])
# --- Stage 0: load the prepared data through the project helper. ---
print('Load data')
train, test, target, labels = utils.Load_data()

# --- Stage 1: seven binary XGBoost models feeding the "bumping" probabilities. ---
# Keyword arguments that are identical for every one of the seven models.
binary_common = dict(nthread=6, max_delta_step=None, silent=0, seed=1301, l2_reg=1)

# clf1-clf5 additionally share tree shape, sampling and L1 settings; they only
# differ in learning rate, positive-class weight and number of estimators.
binary_base = dict(gamma=1.2, max_depth=6, min_child_weight=2,
                   subsample=0.6, colsample_bytree=0.35, l1_reg=0.2)
binary_base.update(binary_common)

clf1 = XGBoost_binary(eta=0.003, scale_pos_weight=1.5, n_estimators=4269, **binary_base)
clf2 = XGBoost_binary(eta=0.004, scale_pos_weight=1, n_estimators=4200, **binary_base)
clf3 = XGBoost_binary(eta=0.004, scale_pos_weight=1, n_estimators=4190, **binary_base)
clf4 = XGBoost_binary(eta=0.004, scale_pos_weight=1, n_estimators=4188, **binary_base)
clf5 = XGBoost_binary(eta=0.004, scale_pos_weight=1, n_estimators=4191, **binary_base)

# clf6 and clf7 use their own regularisation / sampling settings.
clf6 = XGBoost_binary(eta=0.004, gamma=0.95, max_depth=6, min_child_weight=4,
                      subsample=0.55, colsample_bytree=0.35, scale_pos_weight=1,
                      l1_reg=0.3, n_estimators=4190, **binary_common)
clf7 = XGBoost_binary(eta=0.004, gamma=0.85, max_depth=7, min_child_weight=4,
                      subsample=0.6, colsample_bytree=0.3, scale_pos_weight=1,
                      l1_reg=0.05, n_estimators=4290, **binary_common)

clfs1 = [clf1, clf2, clf3, clf4, clf5, clf6, clf7]

print('Compute bumping probabilities')
# NOTE(review): presumably produces out-of-fold probabilities for train and
# fitted-model probabilities for test -- confirm in utils.Stack_Bump_Probs.
bumps_train, bumps_test = utils.Stack_Bump_Probs(train, test, clfs1, labels, n_folds)
# --- Stage 2: multi-class XGBoost model trained on features + bumping probabilities. ---
multilabel_params = dict(
    nthread=6, eta=0.012,
    gamma=1, max_depth=6, min_child_weight=10, max_delta_step=0,
    subsample=0.65, colsample_bytree=0.5, silent=1, seed=1301,
    l2_reg=1.5, l1_reg=0, num_round=975,
)
clf8 = XGBoost_multilabel(**multilabel_params)
clfs2 = [clf8]

print('Compute stacking probabilities')
# The multi-class model is given the target shifted down by one.
y = target - 1
# Append the bumping probabilities as extra columns to the original features.
train_with_bumps = np.column_stack((train, bumps_train))
test_with_bumps = np.column_stack((test, bumps_test))
train_probs_xgb, test_probs_xgb = utils.Stack_Multi(
    train_with_bumps, test_with_bumps, y, clfs2, n_folds)
# --- Stage 3: regressors trained on features + bumping + stacked class probabilities. ---
print('Construct stacking data')
train_stuck = np.column_stack((train, bumps_train, train_probs_xgb))
test_stuck = np.column_stack((test, bumps_test, test_probs_xgb))

xgb_regressor_params = dict(
    nthread=3, eta=0.0057, gamma=0, max_depth=6,
    min_child_weight=2, max_delta_step=None,
    subsample=0.66, colsample_bytree=0.7,
    silent=1, seed=1301, l2_reg=0, l1_reg=0, n_estimators=1000,
)
clf9 = XGBoost_regressor(**xgb_regressor_params)

# Settings shared by the two scikit-learn tree ensembles.
# NOTE(review): scikit-learn renamed criterion 'mse' to 'squared_error' and
# later removed the old spelling -- confirm the pinned scikit-learn version.
forest_params = dict(
    n_estimators=1000, criterion='mse', max_depth=6, min_samples_split=2,
    min_samples_leaf=4, min_weight_fraction_leaf=0.0,
    max_leaf_nodes=None, oob_score=False, n_jobs=3, random_state=1301,
)
clf10 = RandomForestRegressor(max_features=0.5, bootstrap=True, **forest_params)
clf11 = ExtraTreesRegressor(max_features=0.62, bootstrap=False, **forest_params)

clfs3 = [clf9, clf10, clf11]

print('Train ensemble models')
train_preds_stuck, test_preds_stuck = utils.Stack_Regr(
    train_stuck, test_stuck, target, clfs3, n_folds)
print('Compute ensemble predictions')
# Stacked (rather than in-sample) train predictions are blended here because
# they are later used for training offset values, which reduces the chance of
# overfitting.
# The blend is a weighted geometric mean: one exponent per column of the
# stacked prediction matrix (presumably in clfs3 order -- confirm against
# utils.Stack_Regr).
blend_weights = (0.41, 0.01, 0.58)


def _blend(stacked_preds):
    """Multiply the columns of ``stacked_preds``, each raised to its blend weight."""
    blended = stacked_preds[:, 0] ** blend_weights[0]
    for col in range(1, len(blend_weights)):
        blended = blended * (stacked_preds[:, col] ** blend_weights[col])
    return blended


train_preds = _blend(train_preds_stuck)
test_preds = _blend(test_preds_stuck)
print('Train offset values for label construction')
num_classes = np.unique(target).shape[0]

# 1st..99th percentiles of the blended test predictions, computed in one call;
# quant[k] holds the (k + 1)-th percentile.
quant = np.percentile(test_preds, np.arange(1, 100))

# Compute initial offset values based on the discrepancies between the label
# distribution of train data and the distribution of test predictions.
# There is one entry per integer prediction bin 0..7, so the loops below
# require num_classes <= 8.
offsets = -1 * np.array(
    [quant[9] - 1.5, quant[20] - 2.5, quant[22] - 3.5, quant[24] - 4.5, 0, quant[34] - 5.5, quant[53] - 6.5,
     quant[67] - 7.5])

# Rows of `data`: 0 = blended stacked train predictions, 1 = the same
# predictions with offsets applied, 2 = true labels.
data = np.vstack((train_preds, train_preds, target))

# Row 0 is never modified, so the integer bin of every sample is fixed.
train_bins = data[0].astype(int)
for j in range(num_classes):
    in_bin = train_bins == j
    data[1, in_bin] = data[0, in_bin] + offsets[j]

# Tune one offset at a time, keeping the adjustments of all other bins fixed.
for j in range(num_classes):
    def train_offset(x, sv=j):
        # fmin_powell minimises, so negate the score to maximise it.
        return -apply_offset(data, x, sv)

    best = fmin_powell(train_offset, offsets[j])
    # The optimiser returns an ndarray (0-d or one element depending on the
    # SciPy version); reduce it to a scalar before storing it.
    offsets[j] = np.ravel(best)[0]
    # The optimiser's last probe is not necessarily the optimum, and
    # apply_offset leaves that probe in row 1.  Re-apply the tuned offset so
    # the remaining bins are optimised against the value actually kept.
    apply_offset(data, offsets[j], j)
print('Apply offsets to test')
# Rows of `data`: 0 = blended test predictions, 1 = predictions with offsets applied.
data = np.vstack((test_preds, test_preds))
test_bins = data[0].astype(int)  # integer bin of each raw prediction
for j in range(num_classes):
    in_bin = test_bins == j
    data[1, in_bin] = data[0, in_bin] + offsets[j]

# Clip to the 1..8 range first, then round to the nearest integer class.
clipped = np.clip(data[1], 1, 8)
preds_subm = np.round(clipped).astype(int)

# Save submission
print('Save submission file')
utils.save_submission(preds_subm)