def test_gradient_boosting(n_samples=1000):
    """ Testing workability of GradientBoosting with different loss function """
    # Generating some samples correlated with first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get uniform distribution along this variable
    uniform_features = ['column0']
    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(),
                 losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01,
                                         subsample=0.5, train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """ Testing with two main classification losses. Also testing copying """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # checking that predict proba works
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # checking clonability
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(clf.predict_proba(trainX) == clf_copy.predict_proba(trainX)), 'copied classifier is different'
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """ Testing how classifiers work with highly misbalanced (in terms of weights) datasets. """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
class AbstractGradientBoostingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, loss=None, n_estimators=100, learning_rate=0.1, subsample=1.0,
                 train_variables=None, random_state=None, n_threads=1, dtype=DTYPE):
        """This version of gradient boosting supports only two-class classification and only
        special losses derived from AbstractLossFunction.
        There are some methods that should be overridden in descendants.

        :type loss: AbstractLossFunction, by default AdaLossFunction is used
        """
        self.loss = loss
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.train_variables = train_variables
        self.random_state = random_state
        self.initial_prediction = 0.
        self.dtype = dtype
        self.n_threads = n_threads

    def _check_params(self):
        if self.loss is None:
            self.loss = AdaLossFunction()
        # Losses from sklearn are not allowed
        assert isinstance(self.loss, AbstractLossFunction), \
            'LossFunction should be derived from AbstractLossFunction'
        assert self.n_estimators > 0, 'n_estimators should be positive'
        self.random_state = check_random_state(self.random_state)
        assert 0 < self.subsample <= 1.0, 'subsample should be in the interval (0, 1]'

    def _create_estimator(self, stage):
        raise NotImplementedError('Should be overridden in descendants')

    def _fit_estimator(self, estimator, X, y, sample_weight, residual, mask):
        """ mask - which events to use in training """
        # TODO do we need check_input=false for trees?
        estimator.fit(X[mask, :], residual[mask], sample_weight=sample_weight[mask])

    def _update_estimator(self, estimator, X, y, sample_weight, residual, y_pred, mask):
        pass

    def _prepare_data_for_fitting(self, X, y, sample_weight):
        """By default the same format is used as for trees """
        X = self.get_train_vars(X)
        X, y = check_arrays(X, y, dtype=self.dtype, sparse_format="dense", check_ccontiguous=True)
        return X, y, sample_weight

    @staticmethod
    def _initial_data_check(X, y, sample_weight):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths of X and y'
        X = pandas.DataFrame(X)
        y = numpy.array(column_or_1d(y), dtype=int)
        assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
        return X, y, sample_weight

    def _prepare_initial_predictions(self, X, y, sample_weight):
        self.initial_prediction = logit(numpy.average(y, weights=sample_weight))

    def _compute_initial_predictions(self, X):
        return numpy.zeros(len(X), dtype='float') + self.initial_prediction

    def _generate_mask(self, length, subsample):
        if subsample == 1.0:
            return slice(None, None, None)
        else:
            n_sampled_events = int(subsample * length)
            return self.random_state.choice(length, n_sampled_events, replace=True)

    def fit(self, X, y, sample_weight=None):
        X, y, sample_weight = self._initial_data_check(X, y, sample_weight)
        self._check_params()

        loss_weight = numpy.ones(len(sample_weight))
        tree_weight = sample_weight
        if False:
            loss_weight, tree_weight = tree_weight, loss_weight

        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=loss_weight)
        X, y, sample_weight = self._prepare_data_for_fitting(X, y, sample_weight)

        self._prepare_initial_predictions(X, y, sample_weight)
        y_pred = self._compute_initial_predictions(X)
        self.estimators = []
        self.scores = []

        # pool = ThreadPool(processes=self.n_threads)
        lock = Lock()
        train_params = [self, X, y, tree_weight, y_pred, lock]
        # TODO use threading
        # pool.map(_train_one_classifier, [train_params] * self.n_estimators, chunksize=1)
        # map() is lazy in Python 3, so iterate explicitly to actually train the estimators
        for params in [train_params] * self.n_estimators:
            _train_one_classifier(params)
        return self

    def get_train_vars(self, X):
        if self.train_variables is None:
            return numpy.array(X)
        else:
            return numpy.array(X.loc[:, self.train_variables])

    @staticmethod
    def score_to_proba(score):
        result = numpy.zeros([len(score), 2], dtype=float)
        result[:, 1] = sigmoid_function(score, width=1.)
        result[:, 0] = 1. - result[:, 1]
        return result

    def staged_predict_score(self, X):
        X = self.get_train_vars(X)
        y_pred = self._compute_initial_predictions(X)
        for estimator in self.estimators:
            y_pred += self.learning_rate * estimator.predict(X)
            yield y_pred

    def predict_score(self, X):
        result = None
        for score in self.staged_predict_score(X):
            result = score
        return result

    def staged_predict_proba(self, X):
        for score in self.staged_predict_score(X):
            yield self.score_to_proba(score)

    def predict_proba(self, X):
        return self.score_to_proba(self.predict_score(X))

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
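# The sketch below (not part of the original module) illustrates the intended extension point
# described in the class docstring: a concrete descendant only has to override `_create_estimator`,
# which returns the weak learner trained on the loss gradients at each boosting stage.
# The class name `SimpleGradientBoostingClassifier` and the tree parameters are illustrative
# assumptions; the library's real descendants may expose different parameters and may also
# override `_update_estimator` to adjust leaf values.
from sklearn.tree import DecisionTreeRegressor


class SimpleGradientBoostingClassifier(AbstractGradientBoostingClassifier):
    def __init__(self, loss=None, n_estimators=100, learning_rate=0.1, subsample=1.0,
                 max_depth=3, min_samples_split=2, train_variables=None, random_state=None):
        AbstractGradientBoostingClassifier.__init__(
            self, loss=loss, n_estimators=n_estimators, learning_rate=learning_rate,
            subsample=subsample, train_variables=train_variables, random_state=random_state)
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _create_estimator(self, stage):
        # one regression tree per boosting stage, fitted to the residuals of the loss
        return DecisionTreeRegressor(max_depth=self.max_depth,
                                     min_samples_split=self.min_samples_split,
                                     random_state=self.random_state)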