def test_predict_proba_multiclass(): # Test that predict_proba works as expected for multi class. X = X_digits_multi[:10] y = y_digits_multi[:10] clf = MLPClassifier(hidden_layer_sizes=5) with ignore_warnings(category=ConvergenceWarning): clf.fit(X, y) y_proba = clf.predict_proba(X) y_log_proba = clf.predict_log_proba(X) (n_samples, n_classes) = y.shape[0], np.unique(y).size proba_max = y_proba.argmax(axis=1) proba_log_max = y_log_proba.argmax(axis=1) assert y_proba.shape == (n_samples, n_classes) assert_array_equal(proba_max, proba_log_max) assert_array_equal(y_log_proba, np.log(y_proba))
def test_predict_proba_multilabel(): # Test that predict_proba works as expected for multilabel. # Multilabel should not use softmax which makes probabilities sum to 1 X, Y = make_multilabel_classification(n_samples=50, random_state=0, return_indicator=True) n_samples, n_classes = Y.shape clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30, random_state=0) clf.fit(X, Y) y_proba = clf.predict_proba(X) assert y_proba.shape == (n_samples, n_classes) assert_array_equal(y_proba > 0.5, Y) y_log_proba = clf.predict_log_proba(X) proba_max = y_proba.argmax(axis=1) proba_log_max = y_log_proba.argmax(axis=1) assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10 assert_array_equal(proba_max, proba_log_max) assert_array_equal(y_log_proba, np.log(y_proba))
def test_predict_proba_binary(): # Test that predict_proba works as expected for binary class. X = X_digits_binary[:50] y = y_digits_binary[:50] clf = MLPClassifier(hidden_layer_sizes=5, activation='logistic', random_state=1) with ignore_warnings(category=ConvergenceWarning): clf.fit(X, y) y_proba = clf.predict_proba(X) y_log_proba = clf.predict_log_proba(X) (n_samples, n_classes) = y.shape[0], 2 proba_max = y_proba.argmax(axis=1) proba_log_max = y_log_proba.argmax(axis=1) assert y_proba.shape == (n_samples, n_classes) assert_array_equal(proba_max, proba_log_max) assert_array_equal(y_log_proba, np.log(y_proba)) assert roc_auc_score(y, y_proba[:, 1]) == 1.0
def test_fit(): # Test that the algorithm solution is equal to a worked out example. X = np.array([[0.6, 0.8, 0.7]]) y = np.array([0]) mlp = MLPClassifier(solver='sgd', learning_rate_init=0.1, alpha=0.1, activation='logistic', random_state=1, max_iter=1, hidden_layer_sizes=2, momentum=0) # set weights mlp.coefs_ = [0] * 2 mlp.intercepts_ = [0] * 2 mlp.n_outputs_ = 1 mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]]) mlp.coefs_[1] = np.array([[0.1], [0.2]]) mlp.intercepts_[0] = np.array([0.1, 0.1]) mlp.intercepts_[1] = np.array([1.0]) mlp._coef_grads = [] * 2 mlp._intercept_grads = [] * 2 # Initialize parameters mlp.n_iter_ = 0 mlp.learning_rate_ = 0.1 # Compute the number of layers mlp.n_layers_ = 3 # Pre-allocate gradient matrices mlp._coef_grads = [0] * (mlp.n_layers_ - 1) mlp._intercept_grads = [0] * (mlp.n_layers_ - 1) mlp.out_activation_ = 'logistic' mlp.t_ = 0 mlp.best_loss_ = np.inf mlp.loss_curve_ = [] mlp._no_improvement_count = 0 mlp._intercept_velocity = [ np.zeros_like(intercepts) for intercepts in mlp.intercepts_ ] mlp._coef_velocity = [np.zeros_like(coefs) for coefs in mlp.coefs_] mlp.partial_fit(X, y, classes=[0, 1]) # Manually worked out example # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1) # = 0.679178699175393 # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1) # = 0.574442516811659 # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1) # = 0.7654329236196236 # d21 = -(0 - 0.765) = 0.765 # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667 # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374 # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200 # W1grad11 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244 # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336 # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992 # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002 # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244 # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294 # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911 # b1grad1 = d11 = 0.01667 # b1grad2 = d12 = 0.0374 # b2grad = d21 = 0.765 # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1], # [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992], # [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664, # 0.096008], [0.4939998, -0.002244]] # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 * # [[0.5294], [0.45911]] = [[0.04706], [0.154089]] # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374] # = [0.098333, 0.09626] # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235 assert_almost_equal(mlp.coefs_[0], np.array([[0.098, 0.195756], [0.2956664, 0.096008], [0.4939998, -0.002244]]), decimal=3) assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]), decimal=3) assert_almost_equal(mlp.intercepts_[0], np.array([0.098333, 0.09626]), decimal=3) assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3) # Testing output # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 + # 0.7 * 0.4939998 + 0.098333) = 0.677 # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 + # 0.7 * -0.002244 + 0.09626) = 0.572 # o1 = h * W2 + b21 = 0.677 * 0.04706 + # 0.572 * 0.154089 + 0.9235 = 1.043 # prob = sigmoid(o1) = 0.739 assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3)