示例#1
0
def test_mnb_sample_weight():
    clf = MultinomialNB()
    clf.fit([[1, 2], [1, 2], [1, 0]], [0, 0, 1], sample_weight=[1, 1, 4])
    assert_array_equal(clf.predict([[1, 0]]), [1])
    positive_prior = np.exp(clf.intercept_[0])
    assert_array_almost_equal([1 - positive_prior, positive_prior],
                              [1 / 3., 2 / 3.])
示例#2
0
def test_check_accuracy_on_digits():
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    X, y = load_digits(return_X_y=True)
    binary_3v8 = np.logical_or(y == 3, y == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert scores.mean() > 0.86

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert scores.mean() > 0.94

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert scores.mean() > 0.83

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert scores.mean() > 0.92

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert scores.mean() > 0.77

    scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10)
    assert scores.mean() > 0.89

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert scores.mean() > 0.86
示例#3
0
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini batches doesn't have all classes
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert np.mean(pred == y) == np.mean(pred1 == y)

    # test partial_fit only exists if estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
示例#4
0
def test_ovr_exceptions():
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, ovr.predict, [])

    # Fail on multioutput data
    assert_raises(ValueError,
                  OneVsRestClassifier(MultinomialNB()).fit,
                  np.array([[1, 0], [0, 1]]), np.array([[1, 2], [3, 1]]))
    assert_raises(ValueError,
                  OneVsRestClassifier(MultinomialNB()).fit,
                  np.array([[1, 0], [0, 1]]), np.array([[1.5, 2.4], [3.1,
                                                                     0.8]]))
示例#5
0
def test_alpha():
    # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    nb = BernoulliNB(alpha=0.)
    assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.)
    assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[2. / 3, 1. / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test sparse X
    X = scipy.sparse.csr_matrix(X)
    nb = BernoulliNB(alpha=0.)
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.)
    assert_warns(UserWarning, nb.fit, X, y)
    prob = np.array([[2. / 3, 1. / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test for alpha < 0
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    assert_raise_message(ValueError, expected_msg, b_nb.fit, X, y)
    assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)

    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    assert_raise_message(ValueError,
                         expected_msg,
                         b_nb.partial_fit,
                         X,
                         y,
                         classes=[0, 1])
    assert_raise_message(ValueError,
                         expected_msg,
                         m_nb.partial_fit,
                         X,
                         y,
                         classes=[0, 1])
示例#6
0
def test_ovo_fit_predict():
    # A classifier which implements decision_function.
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    ovo.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2

    # A classifier which implements predict_proba.
    ovo = OneVsOneClassifier(MultinomialNB())
    ovo.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovo.estimators_) == n_classes * (n_classes - 1) / 2
示例#7
0
def test_ecoc_fit_predict():
    # A classifier which implements decision_function.
    ecoc = OutputCodeClassifier(LinearSVC(random_state=0),
                                code_size=2,
                                random_state=0)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert len(ecoc.estimators_) == n_classes * 2

    # A classifier which implements predict_proba.
    ecoc = OutputCodeClassifier(MultinomialNB(), code_size=2, random_state=0)
    ecoc.fit(iris.data, iris.target).predict(iris.data)
    assert len(ecoc.estimators_) == n_classes * 2
示例#8
0
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]])

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(), ElasticNet(),
                     Lasso(alpha=0.5)):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert clf.multilabel_
示例#9
0
def test_ovr_partial_fit_exceptions():
    ovr = OneVsRestClassifier(MultinomialNB())
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    # A new class value which was not in the first call of partial_fit
    # It should raise ValueError
    y1 = [5] + y[7:-1]
    assert_raises_regexp(ValueError, r"Mini-batch contains \[.+\] while "
                         r"classes must be subset of \[.+\]",
                         ovr.partial_fit,
                         X=X[7:],
                         y=y1)
示例#10
0
def test_calibration_prefit():
    """Test calibration for prefitted classifiers"""
    n_samples = 50
    X, y = make_classification(n_samples=3 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_calib, y_calib, sw_calib = \
        X[n_samples:2 * n_samples], y[n_samples:2 * n_samples], \
        sample_weight[n_samples:2 * n_samples]
    X_test, y_test = X[2 * n_samples:], y[2 * n_samples:]

    # Naive-Bayes
    clf = MultinomialNB()
    clf.fit(X_train, y_train, sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # Naive Bayes with calibration
    for this_X_calib, this_X_test in [(X_calib, X_test),
                                      (sparse.csr_matrix(X_calib),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv="prefit")

            for sw in [sw_calib, None]:
                pc_clf.fit(this_X_calib, y_calib, sample_weight=sw)
                y_prob = pc_clf.predict_proba(this_X_test)
                y_pred = pc_clf.predict(this_X_test)
                prob_pos_pc_clf = y_prob[:, 1]
                assert_array_equal(y_pred,
                                   np.array([0, 1])[np.argmax(y_prob, axis=1)])

                assert (brier_score_loss(y_test, prob_pos_clf) >
                               brier_score_loss(y_test, prob_pos_pc_clf))
示例#11
0
def test_ovr_fit_predict():
    # A classifier which implements decision_function.
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert len(ovr.estimators_) == n_classes

    clf = LinearSVC(random_state=0)
    pred2 = clf.fit(iris.data, iris.target).predict(iris.data)
    assert np.mean(iris.target == pred) == np.mean(iris.target == pred2)

    # A classifier which implements predict_proba.
    ovr = OneVsRestClassifier(MultinomialNB())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert np.mean(iris.target == pred) > 0.65
示例#12
0
def test_ovo_partial_fit_predict():
    temp = datasets.load_iris()
    X, y = temp.data, temp.target
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert len(ovo1.estimators_) == n_classes * (n_classes - 1) / 2
    assert np.mean(y == pred1) > 0.65
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches have binary target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:60], y[:60], np.unique(y))
    ovo1.partial_fit(X[60:], y[60:])
    pred1 = ovo1.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)

    assert_almost_equal(pred1, pred2)
    assert len(ovo1.estimators_) == len(np.unique(y))
    assert np.mean(y == pred1) > 0.65

    ovo = OneVsOneClassifier(MultinomialNB())
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
    ovo.partial_fit(X[7:], y[7:])
    pred = ovo.predict(X)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)

    # raises error when mini-batch does not have classes from all_classes
    ovo = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    message_re = escape("Mini-batch contains {0} while "
                        "it must be subset of {1}".format(
                            np.unique(error_y), np.unique(y)))
    assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7],
                         error_y, np.unique(y))

    # test partial_fit only exists if estimator has it:
    ovr = OneVsOneClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
示例#13
0
def test_ovr_multiclass():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    Y = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1], [1, 0, 0]])

    classes = set("ham eggs spam".split())

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(), ElasticNet()):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert set(clf.classes_) == classes
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(y_pred, ["eggs"])

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])
示例#14
0
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # Decision function only estimator.
    decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
    assert not hasattr(decision_only, 'predict_proba')

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = np.array([l.argmax() for l in Y_proba])
    assert not (pred - Y_pred).any()
示例#15
0
def test_ovr_fit_predict_sparse():
    for sparse in [
            sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
            sp.lil_matrix
    ]:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=True,
                                                       random_state=0)

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert clf.multilabel_
        assert sp.issparse(Y_pred_sprs)
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba
        Y_proba = clf_sprs.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred_sprs.toarray())

        # Test decision_function
        clf = svm.SVC()
        clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train))
        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
示例#16
0
def test_ovr_multilabel_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    for au in (False, True):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

        # Decision function only estimator.
        decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
        assert not hasattr(decision_only, 'predict_proba')

        # Estimator with predict_proba disabled, depending on parameters.
        decision_only = OneVsRestClassifier(svm.SVC(probability=False))
        assert not hasattr(decision_only, 'predict_proba')
        decision_only.fit(X_train, Y_train)
        assert not hasattr(decision_only, 'predict_proba')
        assert hasattr(decision_only, 'decision_function')

        # Estimator which can get predict_proba enabled after fitting
        gs = GridSearchCV(svm.SVC(probability=False),
                          param_grid={'probability': [True]})
        proba_after_fit = OneVsRestClassifier(gs)
        assert not hasattr(proba_after_fit, 'predict_proba')
        proba_after_fit.fit(X_train, Y_train)
        assert hasattr(proba_after_fit, 'predict_proba')

        Y_pred = clf.predict(X_test)
        Y_proba = clf.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred)
示例#17
0
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert clf.multilabel_
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec,
                            decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall,
                            decimal=2)
示例#18
0
def test_ovr_binary():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "spam", "eggs", "spam"]
    Y = np.array([[0, 1, 1, 0, 1]]).T

    classes = set("eggs spam".split())

    def conduct_test(base_clf, test_predict_proba=False):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert set(clf.classes_) == classes
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(y_pred, ["eggs"])
        if hasattr(base_clf, 'decision_function'):
            dec = clf.decision_function(X)
            assert dec.shape == (5, )

        if test_predict_proba:
            X_test = np.array([[0, 0, 4]])
            probabilities = clf.predict_proba(X_test)
            assert 2 == len(probabilities[0])
            assert (clf.classes_[np.argmax(probabilities,
                                           axis=1)] == clf.predict(X_test))

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[3, 0, 0]])[0]
        assert y_pred == 1

    for base_clf in (LinearSVC(random_state=0), LinearRegression(), Ridge(),
                     ElasticNet()):
        conduct_test(base_clf)

    for base_clf in (MultinomialNB(), SVC(probability=True),
                     LogisticRegression()):
        conduct_test(base_clf, test_predict_proba=True)
示例#19
0
def test_mnb_prior_unobserved_targets():
    # test smoothing of prior for yet unobserved targets

    # Create toy training data
    X = np.array([[0, 1], [1, 0]])
    y = np.array([0, 1])

    clf = MultinomialNB()

    assert_no_warnings(clf.partial_fit, X, y, classes=[0, 1, 2])

    assert clf.predict([[0, 1]]) == 0
    assert clf.predict([[1, 0]]) == 1
    assert clf.predict([[1, 1]]) == 0

    # add a training example with previously unobserved class
    assert_no_warnings(clf.partial_fit, [[1, 1]], [2])

    assert clf.predict([[0, 1]]) == 0
    assert clf.predict([[1, 0]]) == 1
    assert clf.predict([[1, 1]]) == 2
示例#20
0
from mrex.ensemble import ExtraTreesClassifier
from mrex.ensemble import AdaBoostClassifier
from mrex.linear_model import LogisticRegression
from mrex.naive_bayes import MultinomialNB

ESTIMATORS = {
    "dummy":
    DummyClassifier(),
    "random_forest":
    RandomForestClassifier(max_features="sqrt", min_samples_split=10),
    "extra_trees":
    ExtraTreesClassifier(max_features="sqrt", min_samples_split=10),
    "logistic_regression":
    LogisticRegression(),
    "naive_bayes":
    MultinomialNB(),
    "adaboost":
    AdaBoostClassifier(n_estimators=10),
}

###############################################################################
# Data

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--estimators',
                        nargs="+",
                        required=True,
                        choices=ESTIMATORS)
示例#21
0
def test_alpha_vector():
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])

    # Setting alpha=np.array with same length
    # as number of features should be fine
    alpha = np.array([1, 2])
    nb = MultinomialNB(alpha=alpha)
    nb.partial_fit(X, y, classes=[0, 1])

    # Test feature probabilities uses pseudo-counts (alpha)
    feature_prob = np.array([[1 / 2, 1 / 2], [2 / 5, 3 / 5]])
    assert_array_almost_equal(nb.feature_log_prob_, np.log(feature_prob))

    # Test predictions
    prob = np.array([[5 / 9, 4 / 9], [25 / 49, 24 / 49]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test alpha non-negative
    alpha = np.array([1., -0.1])
    expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
                    'alpha should be > 0.')
    m_nb = MultinomialNB(alpha=alpha)
    assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)

    # Test that too small pseudo-counts are replaced
    ALPHA_MIN = 1e-10
    alpha = np.array([ALPHA_MIN / 2, 0.5])
    m_nb = MultinomialNB(alpha=alpha)
    m_nb.partial_fit(X, y, classes=[0, 1])
    assert_array_almost_equal(m_nb._check_alpha(), [ALPHA_MIN, 0.5],
                              decimal=12)

    # Test correct dimensions
    alpha = np.array([1., 2., 3.])
    m_nb = MultinomialNB(alpha=alpha)
    expected_msg = ('alpha should be a scalar or a numpy array '
                    'with shape [n_features]')
    assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)
示例#22
0
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert (brier_score_loss(y_test, prob_pos_clf) >
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert (brier_score_loss(y_test, prob_pos_clf) >
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
示例#23
0
# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the others.
# "acq" was chosen as it is more or less evenly distributed in the Reuters
# files. For other datasets, one should take care of creating a test set with
# a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'

# Here are some classifiers that support the `partial_fit` method
partial_fit_classifiers = {
    'SGD': SGDClassifier(max_iter=5),
    'Perceptron': Perceptron(),
    'NB Multinomial': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(),
}


def get_minibatch(doc_iter, size, pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.

    """
    data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
            for doc in itertools.islice(doc_iter, size)
            if doc['topics']]
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
示例#24
0
def test_mnnb(kind):
    # Test Multinomial Naive Bayes classification.
    # This checks that MultinomialNB implements fit and predict and returns
    # correct values for a simple toy dataset.

    if kind == 'dense':
        X = X2
    elif kind == 'sparse':
        X = scipy.sparse.csr_matrix(X2)

    # Check the ability to predict the learning set.
    clf = MultinomialNB()
    assert_raises(ValueError, clf.fit, -X, y2)
    y_pred = clf.fit(X, y2).predict(X)

    assert_array_equal(y_pred, y2)

    # Verify that np.log(clf.predict_proba(X)) gives the same results as
    # clf.predict_log_proba(X)
    y_pred_proba = clf.predict_proba(X)
    y_pred_log_proba = clf.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)

    # Check that incremental fitting yields the same results
    clf2 = MultinomialNB()
    clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
    clf2.partial_fit(X[2:5], y2[2:5])
    clf2.partial_fit(X[5:], y2[5:])

    y_pred2 = clf2.predict(X)
    assert_array_equal(y_pred2, y2)

    y_pred_proba2 = clf2.predict_proba(X)
    y_pred_log_proba2 = clf2.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
    assert_array_almost_equal(y_pred_proba2, y_pred_proba)
    assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)

    # Partial fit on the whole data at once should be the same as fit too
    clf3 = MultinomialNB()
    clf3.partial_fit(X, y2, classes=np.unique(y2))

    y_pred3 = clf3.predict(X)
    assert_array_equal(y_pred3, y2)
    y_pred_proba3 = clf3.predict_proba(X)
    y_pred_log_proba3 = clf3.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
    assert_array_almost_equal(y_pred_proba3, y_pred_proba)
    assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)