def fit(self, X, y):
    """Train the XGB regressor on (X, y), then calibrate the digitized
    offset optimizer on the booster's own in-sample predictions.

    Returns self (sklearn fit convention).
    """
    from xgboost import XGBRegressor
    if not KAGGLE:
        from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

    self.xgb = XGBRegressor(objective=self.objective,
                            learning_rate=self.learning_rate,
                            min_child_weight=self.min_child_weight,
                            subsample=self.subsample,
                            colsample_bytree=self.colsample_bytree,
                            max_depth=self.max_depth,
                            n_estimators=self.n_estimators,
                            nthread=self.nthread,
                            missing=0.0,
                            seed=self.seed)
    self.off = DigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
                                                 initial_params=self.initial_params,
                                                 minimizer=self.minimizer,
                                                 scoring=self.scoring)

    self.xgb.fit(X, y)
    # Score on the training data itself; the offsets are therefore
    # calibrated in-sample.  NOTE(review): ntree_limit assumes
    # best_iteration is available on the booster -- confirm for the
    # xgboost build in use, since no early stopping happens here.
    in_sample = self.xgb.predict(X,
                                 ntree_limit=self.xgb.booster().best_iteration)
    print('Train score is:', -self.scoring(in_sample, y))
    self.off.fit(in_sample, y)
    print("Offsets:", self.off.params)
    return self
def fit(self, X, y):
    """Fit the gradient-boosted regressor, then fit the offset
    post-processor on the resulting training-set predictions.

    Returns self so calls can be chained.
    """
    from xgboost import XGBRegressor
    if not KAGGLE:
        from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

    # Collect the booster hyper-parameters once, then expand them.
    booster_args = dict(objective=self.objective,
                        learning_rate=self.learning_rate,
                        min_child_weight=self.min_child_weight,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        max_depth=self.max_depth,
                        n_estimators=self.n_estimators,
                        nthread=self.nthread,
                        missing=0.0,
                        seed=self.seed)
    self.xgb = XGBRegressor(**booster_args)
    self.off = DigitizedOptimizedOffsetRegressor(
        n_buckets=self.n_buckets,
        initial_params=self.initial_params,
        minimizer=self.minimizer,
        scoring=self.scoring)

    self.xgb.fit(X, y)
    # NOTE(review): relies on booster().best_iteration existing even
    # without early stopping -- verify against the local xgboost version.
    train_pred = self.xgb.predict(
        X, ntree_limit=self.xgb.booster().best_iteration)
    print('Train score is:', -self.scoring(train_pred, y))
    self.off.fit(train_pred, y)
    print("Offsets:", self.off.params)
    return self
def fit(self, X, y):
    """Out-of-fold offset calibration with 2-fold stratified CV.

    For each fold: train an XGB model on the training split, fit the
    digitized offsets on the held-out predictions.  The per-fold offsets
    are then averaged and the booster is refit on the full data set.

    Historical grid scores (all runs used params {'n_estimators': 700,
    'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6,
    'min_child_weight': 240}):
      2 / 5   mean: 0.65531, std: 0.00333   best score: 0.65531
      3 / 5   mean: 0.65474, std: 0.00308   best score: 0.65474
      4 / 5   mean: 0.65490, std: 0.00302   best score: 0.65490
      2 / 10  mean: 0.65688, std: 0.00725   best score: 0.65688
      3 / 10  mean: 0.65705, std: 0.00714   best score: 0.65705
      4 / 10  mean: 0.65643, std: 0.00715   best score: 0.65643
      5 / 10  mean: 0.65630, std: 0.00699   best score: 0.65630
    """
    from xgboost import XGBRegressor
    if not KAGGLE:
        from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
    from sklearn.cross_validation import StratifiedKFold
    from numpy import array

    folds = StratifiedKFold(y, n_folds=2)
    print(folds)

    fold_offsets = []
    for train_idx, test_idx in folds:
        ytrain = y[train_idx]
        Xtrain = X.iloc[list(train_idx)]
        ytest = y[test_idx]
        Xtest = X.iloc[list(test_idx)]

        self.xgb = XGBRegressor(objective=self.objective,
                                learning_rate=self.learning_rate,
                                min_child_weight=self.min_child_weight,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                max_depth=self.max_depth,
                                n_estimators=self.n_estimators,
                                nthread=self.nthread,
                                missing=0.0,
                                seed=self.seed)
        self.xgb.fit(Xtrain, ytrain)
        held_out = self.xgb.predict(
            Xtest, ntree_limit=self.xgb.booster().best_iteration)
        print('XGB Test score is:', -self.scoring(held_out, ytest))

        self.off = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)
        self.off.fit(held_out, ytest)
        print("Offsets:", self.off.params)
        fold_offsets.append(list(self.off.params))

    # Use the mean of the per-fold offset vectors as the final offsets.
    self.off.params = array(fold_offsets).mean(axis=0)
    print("Mean Offsets:", self.off.params)
    # Final booster: retrain on the complete training set.
    self.xgb.fit(X, y)
    return self
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    """XGB regressor whose ordinal offsets are calibrated out-of-fold.

    Offsets are fitted on held-out predictions of a 2-fold stratified CV
    and averaged; the final booster is then refit on the full data set.
    Predictions are offset-adjusted and clamped to the ordinal range [1, 8].
    """

    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):
        # sklearn convention: constructor only stores its arguments.
        # NOTE(review): the mutable default for initial_params is shared
        # across instances -- safe only while nothing mutates it in place.
        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def fit(self, X, y):
        """Two-fold CV offset calibration, then refit XGB on all data.

        Historical grid scores (all runs with params {'n_estimators': 700,
        'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6,
        'min_child_weight': 240}):
          2 / 5   mean: 0.65531, std: 0.00333   best score: 0.65531
          3 / 5   mean: 0.65474, std: 0.00308   best score: 0.65474
          4 / 5   mean: 0.65490, std: 0.00302   best score: 0.65490
          2 / 10  mean: 0.65688, std: 0.00725   best score: 0.65688
          3 / 10  mean: 0.65705, std: 0.00714   best score: 0.65705
          4 / 10  mean: 0.65643, std: 0.00715   best score: 0.65643
          5 / 10  mean: 0.65630, std: 0.00699   best score: 0.65630
        """
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
        from sklearn.cross_validation import StratifiedKFold
        from numpy import array

        cv = StratifiedKFold(y, n_folds=2)
        print(cv)

        collected = []
        for idx_tr, idx_te in cv:
            ytrain = y[idx_tr]
            Xtrain = X.iloc[list(idx_tr)]
            ytest = y[idx_te]
            Xtest = X.iloc[list(idx_te)]

            self.xgb = XGBRegressor(objective=self.objective,
                                    learning_rate=self.learning_rate,
                                    min_child_weight=self.min_child_weight,
                                    subsample=self.subsample,
                                    colsample_bytree=self.colsample_bytree,
                                    max_depth=self.max_depth,
                                    n_estimators=self.n_estimators,
                                    nthread=self.nthread,
                                    missing=0.0,
                                    seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            oof_pred = self.xgb.predict(
                Xtest, ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(oof_pred, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(
                n_buckets=self.n_buckets,
                initial_params=self.initial_params,
                minimizer=self.minimizer,
                scoring=self.scoring)
            self.off.fit(oof_pred, ytest)
            print("Offsets:", self.off.params)
            collected.append(list(self.off.params))

        # Average the per-fold offset vectors, then refit on everything.
        self.off.params = array(collected).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)
        return self

    def predict(self, X):
        """Predict raw XGB scores, apply offsets, clamp to [1, 8]."""
        from numpy import clip
        raw = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(raw), 1, 8)
class PrudentialRegressor(BaseEstimator, RegressorMixin):
    """XGB regressor followed by a digitized offset optimizer.

    The offsets are calibrated on the booster's own in-sample predictions;
    predictions are offset-adjusted and clamped to the ordinal range [1, 8].
    """

    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):
        # sklearn convention: constructor only stores its arguments.
        # NOTE(review): mutable default for initial_params is shared across
        # instances -- safe only while it is never mutated in place.
        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def fit(self, X, y):
        """Fit the booster on (X, y), then fit offsets on its train output."""
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(objective=self.objective,
                                learning_rate=self.learning_rate,
                                min_child_weight=self.min_child_weight,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                max_depth=self.max_depth,
                                n_estimators=self.n_estimators,
                                nthread=self.nthread,
                                missing=0.0,
                                seed=self.seed)
        self.off = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)

        self.xgb.fit(X, y)
        train_pred = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(train_pred, y))
        self.off.fit(train_pred, y)
        print("Offsets:", self.off.params)
        return self

    def predict(self, X):
        """Predict raw XGB scores, apply offsets, clamp to [1, 8]."""
        from numpy import clip
        raw = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(raw), 1, 8)
def fit(self, X, y):
    # Per-fold ensemble variant: trains one XGB model (with early stopping
    # and a custom "Scirpus" objective) plus one offset regressor per CV
    # fold, and KEEPS every fold's model/offsets in self.xgb / self.off
    # (lists) for ensembled prediction later.
    # NOTE(review): uses self.int_fold for the number of folds -- assumed
    # to be set by the enclosing class's __init__ (not visible here).
    if not KAGGLE:
        from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
    #from OptimizedOffsetRegressor import FullDigitizedOptimizedOffsetRegressor
    #self.off = FullDigitizedOptimizedOffsetRegressor(n_buckets=self.n_buckets,
    #               basinhopping=True,

    # Experiment log kept verbatim (no-op string; not a docstring since it
    # follows executable statements).
    """
    5-fold Stratified CV
    grid scores:
    mean: 0.64475, std: 0.00483, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 2, 'max_depth': 6}
    mean: 0.64926, std: 0.00401, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 3, 'max_depth': 6}
    mean: 0.65281, std: 0.00384, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 4, 'max_depth': 6}
    mean: 0.65471, std: 0.00422, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 5, 'max_depth': 6}
    mean: 0.65563, std: 0.00440, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 6, 'max_depth': 6}
    mean: 0.65635, std: 0.00433, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}
    mean: 0.65600, std: 0.00471, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 8, 'max_depth': 6}
    best score: 0.65635
    best params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

    reversed params [8 bins]:
    mean: 0.65588, std: 0.00417, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 6, 'max_depth': 6}
    mean: 0.65640, std: 0.00438, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

    with Scirpus obj
    grid scores:
    mean: 0.65775, std: 0.00429, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}
    best score: 0.65775

    +1 na trzech Product_info_2*
    mean: 0.65555, std: 0.00462, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 6, 'max_depth': 6}
    mean: 0.65613, std: 0.00438, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

    DISCRETE: NaN=most_common, +Medical_History_10,24, (24 jest znaczacy)
    mean: 0.65589, std: 0.00490, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

    PROPER DATA + Scirpus + reversed params + no-drops
    mean: 0.65783, std: 0.00444, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

    PROPER DATA + Scirpus + reversed params + no-drops, [email protected]
    mean: 0.65790, std: 0.00421, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 6}

    jak wyzej, max_depth=7
    mean: 0.65802, std: 0.00420, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 7}

    jak wyzej, max_depth=10
    mean: 0.65833, std: 0.00387, params: {'colsample_bytree': 0.67, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

    jak wyzej, max_depth=10, eta=0.03
    mean: 0.65888, std: 0.00391, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

    jak wyzej, max_depth=30, eta=0.02
    mean: 0.65798, std: 0.00340, params: {'colsample_bytree': 0.67, 'learning_rate': 0.02, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 30}

    jak wyzej, max_depth=10, eta=0.03, eval_metric=Scirpus
    mean: 0.65891, std: 0.00395, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

    jak wyzej, max_depth=10, eta=0.03, eval_metric=QWKappa
    mean: 0.65827, std: 0.00368, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

    jak wyzej, max_depth=10, eta=0.03, eval_metric=Scirpus, GMM6,GMM17
    mean: 0.65862, std: 0.00423, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

    jak wyzej, max_depth=10, eta=0.03, eval_metric=Scirpus, Gvector
    mean: 0.65864, std: 0.00384, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}

    jak wyzej, max_depth=10, eta=0.03, eval_metric=Scirpus, learning_rates=[0.03] * 200 + [0.02] * 500,
    mean: 0.65910, std: 0.00384, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 240, 'n_estimators': 700, 'subsample': 0.9, 'int_fold': 7, 'max_depth': 10}
    """
    from sklearn.cross_validation import StratifiedKFold
    kf = StratifiedKFold(y, n_folds=self.int_fold)
    print(kf)

    # One booster and one offset regressor per fold.
    self.xgb = []
    self.off = []

    for i, (itrain, itest) in enumerate(kf):
        ytrain = y[itrain]
        Xtrain = X.iloc[list(itrain)]
        ytest = y[itest]
        Xtest = X.iloc[list(itest)]

        self.xgb += [None]
        # Project-local wrapper (exposes obj= pass-through), not plain xgboost.
        from xgb_sklearn import XGBRegressor
        #from xgboost import XGBRegressor
        self.xgb[i] = XGBRegressor(objective=self.objective,
                                   learning_rate=self.learning_rate,
                                   min_child_weight=self.min_child_weight,
                                   subsample=self.subsample,
                                   colsample_bytree=self.colsample_bytree,
                                   max_depth=self.max_depth,
                                   n_estimators=self.n_estimators,
                                   nthread=self.nthread,
                                   missing=0.0,
                                   seed=self.seed)
        # Early stopping on this fold's held-out split, custom Scirpus
        # objective and eval metric (alternatives kept commented out).
        self.xgb[i].fit(Xtrain, ytrain,
                        eval_set=[(Xtest, ytest)],
                        #eval_metric=self.scoring,
                        #eval_metric='rmse',
                        eval_metric=scirpus_error,
                        #eval_metric=qwkappa_error,
                        verbose=False,
                        early_stopping_rounds=30,
                        #learning_rates=[self.learning_rate] * 200 + [0.02] * 500,
                        obj=scirpus_regobj
                        #obj=qwkappa_regobj
                        )
        print("best iteration:", self.xgb[i].booster().best_iteration)
        # Held-out predictions at the early-stopped tree count.
        te_y_hat = self.xgb[i].predict(
            Xtest, ntree_limit=self.xgb[i].booster().best_iteration)
        print('XGB Test score is:', -self.scoring(te_y_hat, ytest))

        self.off += [None]
        # Offsets calibrated on this fold's out-of-fold predictions.
        self.off[i] = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)
        self.off[i].fit(te_y_hat, ytest)
        print("Offsets:", self.off[i].params)
        pass

    return self
def fit(self, X, y):
    """Calibrate offsets out-of-fold with a 2-fold stratified CV, average
    the per-fold offset vectors, then refit the booster on all data.

    Grid-score history (every run used params {'n_estimators': 700,
    'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6,
    'min_child_weight': 240}):
      2 / 5   mean: 0.65531, std: 0.00333   best score: 0.65531
      3 / 5   mean: 0.65474, std: 0.00308   best score: 0.65474
      4 / 5   mean: 0.65490, std: 0.00302   best score: 0.65490
      2 / 10  mean: 0.65688, std: 0.00725   best score: 0.65688
      3 / 10  mean: 0.65705, std: 0.00714   best score: 0.65705
      4 / 10  mean: 0.65643, std: 0.00715   best score: 0.65643
      5 / 10  mean: 0.65630, std: 0.00699   best score: 0.65630
    """
    from xgboost import XGBRegressor
    if not KAGGLE:
        from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
    from sklearn.cross_validation import StratifiedKFold
    from numpy import array

    splitter = StratifiedKFold(y, n_folds=2)
    print(splitter)

    offset_vectors = []
    for fold_train, fold_test in splitter:
        ytrain = y[fold_train]
        Xtrain = X.iloc[list(fold_train)]
        ytest = y[fold_test]
        Xtest = X.iloc[list(fold_test)]

        self.xgb = XGBRegressor(objective=self.objective,
                                learning_rate=self.learning_rate,
                                min_child_weight=self.min_child_weight,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                max_depth=self.max_depth,
                                n_estimators=self.n_estimators,
                                nthread=self.nthread,
                                missing=0.0,
                                seed=self.seed)
        self.xgb.fit(Xtrain, ytrain)
        fold_pred = self.xgb.predict(
            Xtest, ntree_limit=self.xgb.booster().best_iteration)
        print('XGB Test score is:', -self.scoring(fold_pred, ytest))

        self.off = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)
        self.off.fit(fold_pred, ytest)
        print("Offsets:", self.off.params)
        offset_vectors.append(list(self.off.params))

    self.off.params = array(offset_vectors).mean(axis=0)
    print("Mean Offsets:", self.off.params)
    # Final model is trained on the full data set.
    self.xgb.fit(X, y)
    return self
class PrudentialRegressorCVO(BaseEstimator, RegressorMixin):
    """XGB regressor with out-of-fold offset calibration.

    fit() runs a 2-fold stratified CV, fits digitized offsets on each
    fold's held-out predictions, averages them, and refits the booster on
    the full data.  predict() applies the averaged offsets and clamps the
    result to the ordinal range [1, 8].
    """

    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):
        # Store hyper-parameters verbatim (sklearn get_params contract).
        # NOTE(review): mutable default for initial_params is shared across
        # instances -- safe only while never mutated in place.
        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def fit(self, X, y):
        """Out-of-fold offset calibration, then full-data booster refit.

        Grid-score history (all with params {'n_estimators': 700,
        'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6,
        'min_child_weight': 240}):
          2 / 5   mean: 0.65531, std: 0.00333   best score: 0.65531
          3 / 5   mean: 0.65474, std: 0.00308   best score: 0.65474
          4 / 5   mean: 0.65490, std: 0.00302   best score: 0.65490
          2 / 10  mean: 0.65688, std: 0.00725   best score: 0.65688
          3 / 10  mean: 0.65705, std: 0.00714   best score: 0.65705
          4 / 10  mean: 0.65643, std: 0.00715   best score: 0.65643
          5 / 10  mean: 0.65630, std: 0.00699   best score: 0.65630
        """
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor
        from sklearn.cross_validation import StratifiedKFold
        from numpy import array

        cv = StratifiedKFold(y, n_folds=2)
        print(cv)

        per_fold = []
        for tr, te in cv:
            ytrain = y[tr]
            Xtrain = X.iloc[list(tr)]
            ytest = y[te]
            Xtest = X.iloc[list(te)]

            self.xgb = XGBRegressor(objective=self.objective,
                                    learning_rate=self.learning_rate,
                                    min_child_weight=self.min_child_weight,
                                    subsample=self.subsample,
                                    colsample_bytree=self.colsample_bytree,
                                    max_depth=self.max_depth,
                                    n_estimators=self.n_estimators,
                                    nthread=self.nthread,
                                    missing=0.0,
                                    seed=self.seed)
            self.xgb.fit(Xtrain, ytrain)
            oof = self.xgb.predict(
                Xtest, ntree_limit=self.xgb.booster().best_iteration)
            print('XGB Test score is:', -self.scoring(oof, ytest))

            self.off = DigitizedOptimizedOffsetRegressor(
                n_buckets=self.n_buckets,
                initial_params=self.initial_params,
                minimizer=self.minimizer,
                scoring=self.scoring)
            self.off.fit(oof, ytest)
            print("Offsets:", self.off.params)
            per_fold.append(list(self.off.params))

        self.off.params = array(per_fold).mean(axis=0)
        print("Mean Offsets:", self.off.params)
        self.xgb.fit(X, y)
        return self

    def predict(self, X):
        """Raw XGB prediction, offset adjustment, then clamp to [1, 8]."""
        from numpy import clip
        scores = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(scores), 1, 8)
class PrudentialRegressor(BaseEstimator, RegressorMixin):
    """XGB regressor plus an in-sample-calibrated digitized offset stage.

    fit() trains the booster and then fits the offsets on the booster's
    training-set predictions; predict() applies the offsets and clamps
    the result to the ordinal range [1, 8].
    """

    def __init__(self,
                 objective='reg:linear',
                 learning_rate=0.045,
                 min_child_weight=50,
                 subsample=0.8,
                 colsample_bytree=0.7,
                 max_depth=7,
                 n_estimators=700,
                 nthread=-1,
                 seed=0,
                 n_buckets=8,
                 initial_params=[-1.5, -2.6, -3.6, -1.2, -0.8, 0.04, 0.7, 3.6,
                                 #1., 2., 3., 4., 5., 6., 7.
                                 ],
                 minimizer='BFGS',
                 scoring=NegQWKappaScorer):
        # Store hyper-parameters verbatim (sklearn get_params contract).
        # NOTE(review): mutable default for initial_params is shared across
        # instances -- safe only while never mutated in place.
        self.objective = objective
        self.learning_rate = learning_rate
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.nthread = nthread
        self.seed = seed
        self.n_buckets = n_buckets
        self.initial_params = initial_params
        self.minimizer = minimizer
        self.scoring = scoring

    def fit(self, X, y):
        """Train the booster, then calibrate offsets on its own output."""
        from xgboost import XGBRegressor
        if not KAGGLE:
            from OptimizedOffsetRegressor import DigitizedOptimizedOffsetRegressor

        self.xgb = XGBRegressor(objective=self.objective,
                                learning_rate=self.learning_rate,
                                min_child_weight=self.min_child_weight,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                max_depth=self.max_depth,
                                n_estimators=self.n_estimators,
                                nthread=self.nthread,
                                missing=0.0,
                                seed=self.seed)
        self.off = DigitizedOptimizedOffsetRegressor(
            n_buckets=self.n_buckets,
            initial_params=self.initial_params,
            minimizer=self.minimizer,
            scoring=self.scoring)

        self.xgb.fit(X, y)
        fitted = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        print('Train score is:', -self.scoring(fitted, y))
        self.off.fit(fitted, y)
        print("Offsets:", self.off.params)
        return self

    def predict(self, X):
        """Raw XGB prediction, offset adjustment, then clamp to [1, 8]."""
        from numpy import clip
        scores = self.xgb.predict(
            X, ntree_limit=self.xgb.booster().best_iteration)
        return clip(self.off.predict(scores), 1, 8)