Example #1
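
These snippets exercise the loss functions and uniform gradient boosting classes from the hep_ml package; the excerpts omit their import preamble. A minimal, assumed set of imports (module paths are a best guess and should be checked against your hep_ml version) would be:

import numpy
from sklearn.metrics import roc_auc_score

from hep_ml import losses
from hep_ml.losses import LogLossFunction, AdaLossFunction
from hep_ml.gradientboosting import UGradientBoostingClassifier, UGradientBoostingRegressor
from hep_ml.commonutils import generate_sample
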
def test_gradient_boosting(n_samples=1000):
    """
    Test that GradientBoosting works with different loss functions.
    """
    # Generate samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to make the predictions uniform along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])

    for loss in [
            loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn,
            loss7knn
    ]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20,
                                          max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25,
                                          train_features=None)
        clf.fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)

    # RankBoostLossFunction requires a query ("request") column that groups samples;
    # add a fake one so the ranking loss can be exercised on this data
    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [
            losses.MSELossFunction(),
            losses.MAELossFunction(),
            losses.RankBoostLossFunction(request_column='fake_request')
    ]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss,
                                         max_depth=3,
                                         n_estimators=50,
                                         learning_rate=0.01,
                                         subsample=0.5,
                                         train_features=list(
                                             trainX.columns[1:]))
        clf.fit(trainX, trainY)
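        # labels are binary here, so ROC AUC of the regression output serves as
        # a ranking-quality check for all three regression losses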
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(
            roc_auc, loss)
Example #2
def test_step_optimality(n_samples=100):
    """
    Test that, for a single leaf, the loss function returns the optimal step value.
    """
    X, y = generate_sample(n_samples, n_features=10)
    sample_weight = numpy.random.exponential(size=n_samples)

    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=n_samples)

    tested_losses = [
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=0, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
        losses.MSELossFunction(),
        losses.MAELossFunction(),  # exercised by the dedicated MAE branch below
    ]

    pred = numpy.random.normal(size=n_samples)

    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)

        # Test simple way to get optimal step
        leaf_value = numpy.random.normal()
        step = 0.
        for _ in range(4):
            ministep, = loss.prepare_new_leaves_values(
                terminal_regions=numpy.zeros(n_samples, dtype=int),
                leaf_values=[leaf_value],
                y_pred=pred + step)
            step += ministep

        if isinstance(loss, losses.MAELossFunction):
            # MAE needs a longer, damped iteration (step ~ 1/iteration) to approach its minimum
            for iteration in range(1, 30):
                ministep, = loss.prepare_new_leaves_values(
                    terminal_regions=numpy.zeros(n_samples, dtype=int),
                    leaf_values=[leaf_value],
                    y_pred=pred + step)
                step += ministep * 1. / iteration

        # the accumulated step should be (near-)optimal along its own direction:
        # scaling it by 0.9 or 1.1 must not decrease the loss (up to tolerance)
        loss_values = []
        coeffs = [0.9, 1.0, 1.1]
        for coeff in coeffs:
            loss_values.append(loss(pred + coeff * step))
        print(loss, step, 'losses: ', loss_values)
        assert loss_values[1] <= loss_values[0] + 1e-7
        assert loss_values[1] <= loss_values[2] + 1e-7

        # Test the dedicated method that computes the optimal step in one call
        opt_value = loss.compute_optimal_step(y_pred=pred)
        loss_values2 = []
        for coeff in coeffs:
            loss_values2.append(loss(pred + coeff * opt_value))
        print(loss, opt_value, 'losses: ', loss_values2)
        assert loss_values2[1] <= loss_values2[0] + 1e-7
        assert loss_values2[1] <= loss_values2[2] + 1e-7
Example #3
def test_loss_functions(size=50, epsilon=1e-3):
    """
    Testing that Hessians and gradients of loss functions coincide with numerical approximations
    """
    X, y = generate_sample(size, n_features=10)
    rank_column = X.columns[2]
    X[rank_column] = numpy.random.randint(0, 3, size=size)
    sample_weight = numpy.random.exponential(size=size)
    tested_losses = [
        losses.MSELossFunction(),
        losses.MAELossFunction(),
        losses.LogLossFunction(),
        losses.AdaLossFunction(),
        losses.KnnAdaLossFunction(X.columns[:1], uniform_label=1, knn=5),
        losses.CompositeLossFunction(),
        losses.RankBoostLossFunction(rank_column),
    ]
    pred = numpy.random.normal(size=size)
    # y == pred is a special (non-smooth) point for e.g. MAELossFunction, so move such predictions away
    pred[numpy.abs(y - pred) < epsilon] = -0.1
    print(sum(numpy.abs(y - pred) < epsilon))

    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)
        # analytic loss value and negative gradient at the current predictions
        val = loss(pred)
        gradient = loss.negative_gradient(pred)

        numer_gradient = numpy.zeros(len(pred))
        numer_hessian = numpy.zeros(len(pred))
        for i in range(size):
            pred_plus = pred.copy()
            pred_plus[i] += epsilon
            val_plus = loss(pred_plus)

            pred_minus = pred.copy()
            pred_minus[i] -= epsilon
            val_minus = loss(pred_minus)

            # central differences: the numerical negative gradient is
            # -(L(p+eps) - L(p-eps)) / (2*eps) and the diagonal of the numerical
            # Hessian is (L(p+eps) + L(p-eps) - 2*L(p)) / eps**2
            numer_gradient[i] = -(val_plus - val_minus) / 2. / epsilon
            numer_hessian[i] = (val_plus + val_minus - 2 * val) / epsilon**2

        assert numpy.allclose(gradient, numer_gradient), \
            'wrong computation of gradient for {}'.format(loss)
        if not isinstance(loss, (losses.MSELossFunction, losses.MAELossFunction)):
            assert (gradient * (2 * y - 1) >= 0).all(), 'wrong signs of gradients'
        if isinstance(loss, losses.HessianLossFunction):
            hessian = loss.hessian(pred)
            assert numpy.allclose(hessian, numer_hessian, atol=1e-5), \
                'wrong computation of hessian for {}'.format(loss)
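
The loss-function API exercised above can also be used on its own, outside the boosting classes. A minimal sketch, assuming the imports listed at the top:

X, y = generate_sample(100, n_features=10)
loss = losses.LogLossFunction()
loss.fit(X, y, sample_weight=numpy.ones(len(y)))  # bind the loss to the dataset
pred = numpy.zeros(len(y))                        # raw scores, one per sample
print(loss(pred))                                 # total loss at these scores
print(loss.negative_gradient(pred)[:5])           # per-sample negative gradient
if isinstance(loss, losses.HessianLossFunction):  # hessian() exists only for these
    print(loss.hessian(pred)[:5])                 # per-sample diagonal Hessian
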
Example #4
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Test how classifiers behave on highly imbalanced (in terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
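    # give class-1 events weight 10001 and class-0 events weight 1 to create the imbalance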
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
Example #5
def test_loss_functions(size=50, epsilon=1e-3):
    """
    Test that Hessians and gradients of loss functions coincide with their numerical approximations.
    """
    X, y = generate_sample(size, n_features=10)
    sample_weight = numpy.random.exponential(size=size)
    tested_losses = [
        losses.BinomialDevianceLossFunction(),
        losses.AdaLossFunction(),
        losses.SimpleKnnLossFunction(X.columns[:1], knn=5),
        losses.CompositeLossFunction()
    ]
    pred = numpy.random.normal(size=size)

    for loss in tested_losses:
        loss.fit(X, y, sample_weight=sample_weight)
        # testing sign of gradient
        val = loss(pred)
        gradient = loss.negative_gradient(pred)
        hessian = loss.hessian(pred)
        assert (gradient * (2 * y - 1) >= 0).all()

        numer_gradient = numpy.zeros(len(pred))
        numer_hessian = numpy.zeros(len(pred))
        for i in range(size):
            pred_plus = pred.copy()
            pred_plus[i] += epsilon
            val_plus = loss(pred_plus)

            pred_minus = pred.copy()
            pred_minus[i] -= epsilon
            val_minus = loss(pred_minus)

            numer_gradient[i] = - (val_plus - val_minus) / 2. / epsilon
            numer_hessian[i] = (val_plus + val_minus - 2 * val) / epsilon ** 2

        assert numpy.allclose(gradient, numer_gradient), 'wrong computation of gradient'
        assert numpy.allclose(hessian, numer_hessian), 'wrong computation of hessian'
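
These functions follow pytest naming conventions; assuming they are collected, together with the imports shown at the top, into a single module (for instance a hypothetical test_losses.py), they can be run with:

pytest test_losses.py -v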