def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss functions
    """
    # Generating some samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features, uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features, fl_coefficient=2., uniform_label=[0, 1])

    for loss in [loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn, loss7knn]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [losses.MSELossFunction(),
                 losses.MAELossFunction(),
                 losses.RankBoostLossFunction(request_column='fake_request')]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss, max_depth=3, n_estimators=50, learning_rate=0.01,
                                         subsample=0.5, train_features=list(trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(roc_auc, loss)
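# Illustrative usage sketch, not part of the original test suite: training a uniform
# classifier with one of the flatness losses exercised above on hand-made data, so that
# the uniform feature ('mass') is excluded from train_features but still passed to fit().
# The module paths (hep_ml.losses, hep_ml.gradientboosting) and the feature names are
# assumptions for this example only.
import numpy
import pandas
from sklearn.metrics import roc_auc_score
from hep_ml import losses
from hep_ml.gradientboosting import UGradientBoostingClassifier


def demo_flatness_loss(n_samples=2000):
    # synthetic data: the label correlates with 'mass', and we ask predictions
    # to stay flat along 'mass' for the background class (uniform_label=0)
    rng = numpy.random.RandomState(42)
    y = rng.randint(0, 2, size=n_samples)
    X = pandas.DataFrame({
        'mass': rng.normal(size=n_samples) + 0.6 * y,
        'feature1': rng.normal(size=n_samples) + 0.4 * y,
        'feature2': rng.normal(size=n_samples),
    })
    loss = losses.BinFlatnessLossFunction(['mass'], fl_coefficient=2., uniform_label=0)
    clf = UGradientBoostingClassifier(loss=loss, n_estimators=50, max_depth=4,
                                      learning_rate=0.1, subsample=0.7,
                                      train_features=['feature1', 'feature2'])
    # 'mass' is not used by the trees, but the loss needs it, so the full frame is passed
    clf.fit(X, y)
    return roc_auc_score(y, clf.predict_proba(X)[:, 1])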
def test_step_optimality(n_samples=100):
    """
    Testing that for a single leaf the loss function returns the optimal value of the step.
    """
    X, y = generate_sample(n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples)
    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=n_samples)
    tested_losses = [
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=0, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
        losses.MSELossFunction(),
    ]

    pred = numpy.random.normal(size=n_samples)
    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)

        # Test simple way to get the optimal step
        leaf_value = numpy.random.normal()
        step = 0.
        for _ in range(4):
            ministep, = loss.prepare_new_leaves_values(
                terminal_regions=numpy.zeros(n_samples, dtype=int),
                leaf_values=[leaf_value],
                y_pred=pred + step)
            step += ministep

        if isinstance(loss, losses.MAELossFunction):
            # checking that MAE is minimized with a longer iterative process
            for iteration in range(1, 30):
                ministep, = loss.prepare_new_leaves_values(
                    terminal_regions=numpy.zeros(n_samples, dtype=int),
                    leaf_values=[leaf_value],
                    y_pred=pred + step)
                step += ministep * 1. / iteration

        coeffs = [0.9, 1.0, 1.1]
        loss_values = []
        for coeff in coeffs:
            loss_values.append(loss(pred + coeff * step))
        print(loss, step, 'losses: ', loss_values)
        assert loss_values[1] <= loss_values[0] + 1e-7
        assert loss_values[1] <= loss_values[2] + 1e-7

        # Test standard function
        opt_value = loss.compute_optimal_step(y_pred=pred)
        loss_values2 = []
        for coeff in coeffs:
            loss_values2.append(loss(pred + coeff * opt_value))
        print(loss, opt_value, 'losses: ', loss_values2)
        assert loss_values2[1] <= loss_values2[0] + 1e-7
        assert loss_values2[1] <= loss_values2[2] + 1e-7
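# A small companion sketch, illustrative and not part of the original tests: for a single
# leaf, taking a step means shifting every prediction by the same constant, so the value
# returned by compute_optimal_step should (approximately) minimize loss(pred + step) over
# that constant. The brute-force scan below makes this concrete; the grid range and
# resolution are arbitrary choices, and the module-level imports used by the tests above
# (numpy, losses, generate_sample) are assumed to be available.
def brute_force_best_step(loss, pred, grid=numpy.linspace(-3, 3, 601)):
    """Return the constant shift from `grid` that gives the smallest loss value."""
    values = numpy.array([loss(pred + step) for step in grid])
    return grid[numpy.argmin(values)]


def demo_optimal_step(n_samples=200):
    X, y = generate_sample(n_samples, n_features=10)
    loss = losses.LogLossFunction()
    loss.fit(X, y, sample_weight=numpy.ones(n_samples))
    pred = numpy.random.normal(size=n_samples)
    analytic = loss.compute_optimal_step(y_pred=pred)
    scanned = brute_force_best_step(loss, pred)
    # the two should agree up to the grid resolution
    print('analytic step:', analytic, 'scanned step:', scanned)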
def test_loss_functions(size=50, epsilon=1e-3):
    """
    Testing that Hessians and gradients of loss functions coincide with numerical approximations
    """
    X, y = generate_sample(size, n_features=10)
    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=size)
    sample_weight = numpy.random.exponential(size=size)
    tested_losses = [
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=1, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
    ]

    pred = numpy.random.normal(size=size)
    # y = pred is a special point for e.g. MAELossFunction, so predictions are moved away from it
    pred[numpy.abs(y - pred) < epsilon] = -0.1
    print(sum(numpy.abs(y - pred) < epsilon))
    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)
        val = loss(pred)
        gradient = loss.negative_gradient(pred)

        # numerical gradient and Hessian diagonal via central differences
        numer_gradient = numpy.zeros(len(pred))
        numer_hessian = numpy.zeros(len(pred))
        for i in range(size):
            pred_plus = pred.copy()
            pred_plus[i] += epsilon
            val_plus = loss(pred_plus)

            pred_minus = pred.copy()
            pred_minus[i] -= epsilon
            val_minus = loss(pred_minus)

            numer_gradient[i] = -(val_plus - val_minus) / 2. / epsilon
            numer_hessian[i] = (val_plus + val_minus - 2 * val) / epsilon ** 2

        assert numpy.allclose(gradient, numer_gradient), 'wrong computation of gradient for {}'.format(loss)
        # testing sign of gradient (not meaningful for regression losses)
        if not isinstance(loss, losses.MSELossFunction) and not isinstance(loss, losses.MAELossFunction):
            assert (gradient * (2 * y - 1) >= 0).all(), 'wrong signs of gradients'
        if isinstance(loss, losses.HessianLossFunction):
            hessian = loss.hessian(pred)
            assert numpy.allclose(hessian, numer_hessian, atol=1e-5), \
                'wrong computation of hessian for {}'.format(loss)
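# Illustrative helper, not from the original suite: the central-difference check used
# inline above, factored into a reusable function. It works for any fitted loss that is
# callable on a prediction vector; epsilon is an ad hoc step size, and the module-level
# numpy import is assumed.
def numerical_negative_gradient(loss, pred, epsilon=1e-3):
    """Approximate -d(loss)/d(pred) coordinate-wise with central differences."""
    grad = numpy.zeros(len(pred))
    for i in range(len(pred)):
        pred_plus = pred.copy()
        pred_plus[i] += epsilon
        pred_minus = pred.copy()
        pred_minus[i] -= epsilon
        grad[i] = -(loss(pred_plus) - loss(pred_minus)) / (2. * epsilon)
    return grad
# usage sketch: numpy.allclose(loss.negative_gradient(pred), numerical_negative_gradient(loss, pred))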
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    # class 1 gets weight 10001, class 0 gets weight 1
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1

    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
def test_loss_functions(size=50, epsilon=1e-3):
    """
    Testing that hessians and gradients of loss functions coincide with numerical approximations
    """
    X, y = generate_sample(size, n_features=10)
    sample_weight = numpy.random.exponential(size=size)
    tested_losses = [
        losses.BinomialDevianceLossFunction(),
        losses.AdaLossFunction(),
        losses.SimpleKnnLossFunction(X.columns[:1], knn=5),
        losses.CompositeLossFunction(),
    ]

    pred = numpy.random.normal(size=size)
    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)
        val = loss(pred)
        gradient = loss.negative_gradient(pred)
        hessian = loss.hessian(pred)
        # testing sign of gradient
        assert (gradient * (2 * y - 1) >= 0).all()

        # numerical gradient and Hessian diagonal via central differences
        numer_gradient = numpy.zeros(len(pred))
        numer_hessian = numpy.zeros(len(pred))
        for i in range(size):
            pred_plus = pred.copy()
            pred_plus[i] += epsilon
            val_plus = loss(pred_plus)

            pred_minus = pred.copy()
            pred_minus[i] -= epsilon
            val_minus = loss(pred_minus)

            numer_gradient[i] = -(val_plus - val_minus) / 2. / epsilon
            numer_hessian[i] = (val_plus + val_minus - 2 * val) / epsilon ** 2

        assert numpy.allclose(gradient, numer_gradient), 'wrong computation of gradient'
        assert numpy.allclose(hessian, numer_hessian), 'wrong computation of hessian'