def test_ova_iris(): """Test that our OvA implementation is correct using the iris dataset.""" train, target = iris.data, iris.target n_samples, n_features = train.shape # Use pre-defined fold as folds generated for different y cv = StratifiedKFold(target, 3) clf = LogisticRegressionCV(cv=cv) clf.fit(train, target) clf1 = LogisticRegressionCV(cv=cv) target[target == 0] = 1 clf1.fit(train, target) assert_array_almost_equal(clf.scores_[2], clf1.scores_[2]) assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_) assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_) # Test the shape of various attributes. assert_equal(clf.coef_.shape, (3, n_features)) assert_array_equal(clf.classes_, [0, 1, 2]) assert_equal(len(clf.classes_), 3) coefs_paths = np.asarray(list(clf.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf.Cs_.shape, (10, )) scores = np.asarray(list(clf.scores_.values())) assert_equal(scores.shape, (3, 3, 10))
def test_logistic_regression_multinomial(): """Tests for the multinomial option in logistic regression""" # Some basic attributes of Logistic Regression n_samples, n_features, n_classes = 50, 20, 3 X, y = make_classification(n_samples=50, n_features=20, n_informative=10, n_classes=3, random_state=0) clf_int = LogisticRegression(solver='lbfgs', multi_class='multinomial') clf_int.fit(X, y) assert_array_equal(clf_int.coef_.shape, (n_classes, n_features)) clf_wint = LogisticRegression(solver='lbfgs', multi_class='multinomial', fit_intercept=False) clf_wint.fit(X, y) assert_array_equal(clf_wint.coef_.shape, (n_classes, n_features)) # Test that the path give almost the same results. However since in this # case we take the average of the coefs after fitting across all the # folds, it need not be exactly the same. clf_path = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial', Cs=[1.]) clf_path.fit(X, y) assert_array_almost_equal(clf_path.coef_, clf_int.coef_, decimal=3) assert_almost_equal(clf_path.intercept_, clf_int.intercept_, decimal=3)
def test_logistic_cv_score_does_not_warn_by_default(): lr = LogisticRegressionCV(cv=2) lr.fit(X, Y1) with pytest.warns(None) as record: lr.score(X, lr.predict(X)) assert len(record) == 0
def test_logistic_cv_mock_scorer(): class MockScorer(object): def __init__(self): self.calls = 0 self.scores = [0.1, 0.4, 0.8, 0.5] def __call__(self, model, X, y, sample_weight=None): score = self.scores[self.calls % len(self.scores)] self.calls += 1 return score mock_scorer = MockScorer() Cs = [1, 2, 3, 4] cv = 2 lr = LogisticRegressionCV(Cs=Cs, scoring=mock_scorer, cv=cv) lr.fit(X, Y1) # Cs[2] has the highest score (0.8) from MockScorer assert lr.C_[0] == Cs[2] # scorer called 8 times (cv*len(Cs)) assert mock_scorer.calls == cv * len(Cs) # reset mock_scorer mock_scorer.calls = 0 with pytest.warns(ChangedBehaviorWarning): custom_score = lr.score(X, lr.predict(X)) assert custom_score == mock_scorer.scores[0] assert mock_scorer.calls == 1
def test_logistic_cv(): # test for LogisticRegressionCV object n_samples, n_features = 50, 5 rng = np.random.RandomState(0) X_ref = rng.randn(n_samples, n_features) y = np.sign(X_ref.dot(5 * rng.randn(n_features))) X_ref -= X_ref.mean() X_ref /= X_ref.std() lr_cv = LogisticRegressionCV(Cs=[1.], fit_intercept=False, solver='liblinear') lr_cv.fit(X_ref, y) lr = LogisticRegression(C=1., fit_intercept=False) lr.fit(X_ref, y) assert_array_almost_equal(lr.coef_, lr_cv.coef_) assert_array_equal(lr_cv.coef_.shape, (1, n_features)) assert_array_equal(lr_cv.classes_, [-1, 1]) assert_equal(len(lr_cv.classes_), 2) coefs_paths = np.asarray(list(lr_cv.coefs_paths_.values())) assert_array_equal(coefs_paths.shape, (1, 3, 1, n_features)) assert_array_equal(lr_cv.Cs_.shape, (1, )) scores = np.asarray(list(lr_cv.scores_.values())) assert_array_equal(scores.shape, (1, 3, 1))
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass clf_lib = LogisticRegressionCV(class_weight={ 0: 0.1, 1: 0.2 }, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) # Test for class_weight=auto X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='auto') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='auto') clf_lib.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def test_logreg_cv_penalty(): # Test that the correct penalty is passed to the final fit. X, y = make_classification(n_samples=50, n_features=20, random_state=0) lr_cv = LogisticRegressionCV(penalty="l1", Cs=[1.0], solver='liblinear') lr_cv.fit(X, y) lr = LogisticRegression(penalty="l1", C=1.0, solver='liblinear') lr.fit(X, y) assert_equal(np.count_nonzero(lr_cv.coef_), np.count_nonzero(lr.coef_))
def test_logistic_regression_multinomial(): # Tests for the multinomial option in logistic regression # Some basic attributes of Logistic Regression n_samples, n_features, n_classes = 50, 20, 3 X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=10, n_classes=n_classes, random_state=0) # 'lbfgs' is used as a referenced solver = 'lbfgs' ref_i = LogisticRegression(solver=solver, multi_class='multinomial') ref_w = LogisticRegression(solver=solver, multi_class='multinomial', fit_intercept=False) ref_i.fit(X, y) ref_w.fit(X, y) assert_array_equal(ref_i.coef_.shape, (n_classes, n_features)) assert_array_equal(ref_w.coef_.shape, (n_classes, n_features)) for solver in ['sag', 'newton-cg']: clf_i = LogisticRegression(solver=solver, multi_class='multinomial', random_state=42, max_iter=1000, tol=1e-6) clf_w = LogisticRegression(solver=solver, multi_class='multinomial', random_state=42, max_iter=1000, tol=1e-6, fit_intercept=False) clf_i.fit(X, y) clf_w.fit(X, y) assert_array_equal(clf_i.coef_.shape, (n_classes, n_features)) assert_array_equal(clf_w.coef_.shape, (n_classes, n_features)) # Compare solutions between lbfgs and the other solvers assert_almost_equal(ref_i.coef_, clf_i.coef_, decimal=3) assert_almost_equal(ref_w.coef_, clf_w.coef_, decimal=3) assert_almost_equal(ref_i.intercept_, clf_i.intercept_, decimal=3) # Test that the path give almost the same results. However since in this # case we take the average of the coefs after fitting across all the # folds, it need not be exactly the same. for solver in ['lbfgs', 'newton-cg', 'sag']: clf_path = LogisticRegressionCV(solver=solver, max_iter=2000, tol=1e-6, multi_class='multinomial', Cs=[1.]) clf_path.fit(X, y) assert_array_almost_equal(clf_path.coef_, ref_i.coef_, decimal=3) assert_almost_equal(clf_path.intercept_, ref_i.intercept_, decimal=3)
def test_logistic_cv_sparse(): X, y = make_classification(n_samples=50, n_features=5, random_state=0) X[X < 1.0] = 0.0 csr = sp.csr_matrix(X) clf = LogisticRegressionCV(fit_intercept=True) clf.fit(X, y) clfs = LogisticRegressionCV(fit_intercept=True) clfs.fit(csr, y) assert_array_almost_equal(clfs.coef_, clf.coef_) assert_array_almost_equal(clfs.intercept_, clf.intercept_) assert_equal(clfs.C_, clf.C_)
class LogisticRegressionCVImpl(): def __init__(self, Cs=10, fit_intercept=True, cv=3, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight='balanced', n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='ovr', random_state=None): self._hyperparams = { 'Cs': Cs, 'fit_intercept': fit_intercept, 'cv': cv, 'dual': dual, 'penalty': penalty, 'scoring': scoring, 'solver': solver, 'tol': tol, 'max_iter': max_iter, 'class_weight': class_weight, 'n_jobs': n_jobs, 'verbose': verbose, 'refit': refit, 'intercept_scaling': intercept_scaling, 'multi_class': multi_class, 'random_state': random_state } self._wrapped_model = SKLModel(**self._hyperparams) def fit(self, X, y=None): if (y is not None): self._wrapped_model.fit(X, y) else: self._wrapped_model.fit(X) return self def predict(self, X): return self._wrapped_model.predict(X) def predict_proba(self, X): return self._wrapped_model.predict_proba(X) def decision_function(self, X): return self._wrapped_model.decision_function(X)
def test_logistic_regressioncv_class_weights(): for weight in [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}]: n_classes = len(weight) for class_weight in (weight, 'balanced'): X, y = make_classification(n_samples=30, n_features=3, n_repeated=0, n_informative=3, n_redundant=0, n_classes=n_classes, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', Cs=1, fit_intercept=False, class_weight=class_weight) clf_ncg = LogisticRegressionCV(solver='newton-cg', Cs=1, fit_intercept=False, class_weight=class_weight) clf_lib = LogisticRegressionCV(solver='liblinear', Cs=1, fit_intercept=False, class_weight=class_weight) clf_sag = LogisticRegressionCV(solver='sag', Cs=1, fit_intercept=False, class_weight=class_weight, tol=1e-5, max_iter=10000, random_state=0) clf_lbf.fit(X, y) clf_ncg.fit(X, y) clf_lib.fit(X, y) clf_sag.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4) assert_array_almost_equal(clf_ncg.coef_, clf_lbf.coef_, decimal=4) assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4)
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) msg = ("In LogisticRegressionCV the liblinear solver cannot handle " "multiclass with class_weight of type dict. Use the lbfgs, " "newton-cg or sag solvers or set class_weight='balanced'") clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2}, solver='liblinear') assert_raise_message(ValueError, msg, clf_lib.fit, X, y) y_ = y.copy() y_[y == 2] = 1 clf_lib.fit(X, y_) assert_array_equal(clf_lib.classes_, [0, 1]) # Test for class_weight=balanced X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='balanced') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='balanced') clf_lib.fit(X, y) clf_sag = LogisticRegressionCV(solver='sag', fit_intercept=False, class_weight='balanced', max_iter=2000) clf_sag.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4) assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4) assert_array_almost_equal(clf_lib.coef_, clf_sag.coef_, decimal=4)
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass. However it can handle # binary problems. clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2}, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) y_ = y.copy() y_[y == 2] = 1 clf_lib.fit(X, y_) assert_array_equal(clf_lib.classes_, [0, 1]) # Test for class_weight=balanced X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='balanced') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='balanced') clf_lib.fit(X, y) clf_sag = LogisticRegressionCV(solver='sag', fit_intercept=False, class_weight='balanced', max_iter=2000) clf_sag.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4) assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4) assert_array_almost_equal(clf_lib.coef_, clf_sag.coef_, decimal=4)
def test_logistic_regression_multinomial(): # Tests for the multinomial option in logistic regression # Some basic attributes of Logistic Regression n_samples, n_features, n_classes = 50, 20, 3 X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=10, n_classes=n_classes, random_state=0) # 'lbfgs' is used as a referenced solver = 'lbfgs' ref_i = LogisticRegression(solver=solver, multi_class='multinomial') ref_w = LogisticRegression(solver=solver, multi_class='multinomial', fit_intercept=False) ref_i.fit(X, y) ref_w.fit(X, y) assert_array_equal(ref_i.coef_.shape, (n_classes, n_features)) assert_array_equal(ref_w.coef_.shape, (n_classes, n_features)) for solver in ['sag', 'saga', 'newton-cg']: clf_i = LogisticRegression(solver=solver, multi_class='multinomial', random_state=42, max_iter=2000, tol=1e-7, ) clf_w = LogisticRegression(solver=solver, multi_class='multinomial', random_state=42, max_iter=2000, tol=1e-7, fit_intercept=False) clf_i.fit(X, y) clf_w.fit(X, y) assert_array_equal(clf_i.coef_.shape, (n_classes, n_features)) assert_array_equal(clf_w.coef_.shape, (n_classes, n_features)) # Compare solutions between lbfgs and the other solvers assert_almost_equal(ref_i.coef_, clf_i.coef_, decimal=3) assert_almost_equal(ref_w.coef_, clf_w.coef_, decimal=3) assert_almost_equal(ref_i.intercept_, clf_i.intercept_, decimal=3) # Test that the path give almost the same results. However since in this # case we take the average of the coefs after fitting across all the # folds, it need not be exactly the same. for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']: clf_path = LogisticRegressionCV(solver=solver, max_iter=2000, tol=1e-6, multi_class='multinomial', Cs=[1.]) clf_path.fit(X, y) assert_array_almost_equal(clf_path.coef_, ref_i.coef_, decimal=3) assert_almost_equal(clf_path.intercept_, ref_i.intercept_, decimal=3)
def test_multinomial_logistic_regression_string_inputs(): # Test with string labels for LogisticRegression(CV) n_samples, n_features, n_classes = 50, 5, 3 X_ref, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative=3, random_state=0) y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y) # For numerical labels, let y values be taken from set (-1, 0, 1) y = np.array(y) - 1 # Test for string labels lr = LogisticRegression(solver='lbfgs', multi_class='multinomial') lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial') lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial') lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial') lr.fit(X_ref, y) lr_cv.fit(X_ref, y) lr_str.fit(X_ref, y_str) lr_cv_str.fit(X_ref, y_str) assert_array_almost_equal(lr.coef_, lr_str.coef_) assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo']) assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_) assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo']) assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo']) # The predictions should be in original labels assert_equal(sorted(np.unique(lr_str.predict(X_ref))), ['bar', 'baz', 'foo']) assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz', 'foo']) # Make sure class weights can be given with string labels lr_cv_str = LogisticRegression(solver='lbfgs', class_weight={ 'bar': 1, 'baz': 2, 'foo': 0 }, multi_class='multinomial').fit( X_ref, y_str) assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
def test_logistic_regression_multinomial(): # Tests for the multinomial option in logistic regression # Some basic attributes of Logistic Regression n_samples, n_features, n_classes = 50, 20, 3 X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=10, n_classes=n_classes, random_state=0) clf_int = LogisticRegression(solver='lbfgs', multi_class='multinomial') clf_int.fit(X, y) assert_array_equal(clf_int.coef_.shape, (n_classes, n_features)) clf_wint = LogisticRegression(solver='lbfgs', multi_class='multinomial', fit_intercept=False) clf_wint.fit(X, y) assert_array_equal(clf_wint.coef_.shape, (n_classes, n_features)) # Similar tests for newton-cg solver option clf_ncg_int = LogisticRegression(solver='newton-cg', multi_class='multinomial') clf_ncg_int.fit(X, y) assert_array_equal(clf_ncg_int.coef_.shape, (n_classes, n_features)) clf_ncg_wint = LogisticRegression(solver='newton-cg', fit_intercept=False, multi_class='multinomial') clf_ncg_wint.fit(X, y) assert_array_equal(clf_ncg_wint.coef_.shape, (n_classes, n_features)) # Compare solutions between lbfgs and newton-cg assert_almost_equal(clf_int.coef_, clf_ncg_int.coef_, decimal=3) assert_almost_equal(clf_wint.coef_, clf_ncg_wint.coef_, decimal=3) assert_almost_equal(clf_int.intercept_, clf_ncg_int.intercept_, decimal=3) # Test that the path give almost the same results. However since in this # case we take the average of the coefs after fitting across all the # folds, it need not be exactly the same. for solver in ['lbfgs', 'newton-cg']: clf_path = LogisticRegressionCV(solver=solver, multi_class='multinomial', Cs=[1.]) clf_path.fit(X, y) assert_array_almost_equal(clf_path.coef_, clf_int.coef_, decimal=3) assert_almost_equal(clf_path.intercept_, clf_int.intercept_, decimal=3)
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2}, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) # Test for class_weight=auto X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='auto') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='auto') clf_lib.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass. However it can handle # binary problems. clf_lib = LogisticRegressionCV(class_weight={ 0: 0.1, 1: 0.2 }, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) y_ = y.copy() y_[y == 2] = 1 clf_lib.fit(X, y_) assert_array_equal(clf_lib.classes_, [0, 1]) # Test for class_weight=balanced X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='balanced') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='balanced') clf_lib.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def test_ovr_multinomial_iris(): # Test that OvR and multinomial are correct using the iris dataset. train, target = iris.data, iris.target n_samples, n_features = train.shape # The cv indices from stratified kfold (where stratification is done based # on the fine-grained iris classes, i.e, before the classes 0 and 1 are # conflated) is used for both clf and clf1 n_cv = 2 cv = StratifiedKFold(n_cv) precomputed_folds = list(cv.split(train, target)) # Train clf on the original dataset where classes 0 and 1 are separated clf = LogisticRegressionCV(cv=precomputed_folds) clf.fit(train, target) # Conflate classes 0 and 1 and train clf1 on this modified dataset clf1 = LogisticRegressionCV(cv=precomputed_folds) target_copy = target.copy() target_copy[target_copy == 0] = 1 clf1.fit(train, target_copy) # Ensure that what OvR learns for class2 is same regardless of whether # classes 0 and 1 are separated or not assert_array_almost_equal(clf.scores_[2], clf1.scores_[2]) assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_) assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_) # Test the shape of various attributes. assert_equal(clf.coef_.shape, (3, n_features)) assert_array_equal(clf.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1)) assert_equal(clf.Cs_.shape, (10, )) scores = np.asarray(list(clf.scores_.values())) assert_equal(scores.shape, (3, n_cv, 10)) # Test that for the iris data multinomial gives a better accuracy than OvR for solver in ['lbfgs', 'newton-cg', 'sag']: max_iter = 100 if solver == 'sag' else 15 clf_multi = LogisticRegressionCV(solver=solver, multi_class='multinomial', max_iter=max_iter, random_state=42, tol=1e-2, cv=2) clf_multi.fit(train, target) multi_score = clf_multi.score(train, target) ovr_score = clf.score(train, target) assert_greater(multi_score, ovr_score) # Test attributes of LogisticRegressionCV assert_equal(clf.coef_.shape, clf_multi.coef_.shape) assert_array_equal(clf_multi.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1)) assert_equal(clf_multi.Cs_.shape, (10, )) scores = np.asarray(list(clf_multi.scores_.values())) assert_equal(scores.shape, (3, n_cv, 10))
def estimate_weights_logistic_regresssion(X_s, X_t): """ estimate a logistic regressor to predict the probability of a sample to be generated by one class or the other. If one class is over or under represented weights will be adapted. Parameters: X_s: samples from the source domain X_t: samples from the target domain Returns: weigths for X_s """ X_all, all_labels = prepare_data_for_weights_estimation(X_s, X_t) kf = KFold(X_all.shape[0], 10, shuffle=True) best_lr = LogisticRegressionCV(class_weight="auto", Cs=np.logspace(4, 8, 10), fit_intercept=False) best_lr.fit(X_all, all_labels) weights = X_s.shape[0] / X_t.shape[0] * np.exp( np.dot(X_s, best_lr.coef_.T) + best_lr.intercept_) return weights
def estimate_weights_logistic_regresssion(X_s, X_t): """ estimate a logistic regressor to predict the probability of a sample to be generated by one class or the other. If one class is over or under represented weights will be adapted. Parameters: X_s: samples from the source domain X_t: samples from the target domain Returns: weigths for X_s """ X_all, all_labels = prepare_data_for_weights_estimation(X_s, X_t) kf = KFold(X_all.shape[0], 10, shuffle=True) best_lr = LogisticRegressionCV(class_weight="auto", Cs=np.logspace(4, 8, 10), fit_intercept=False) best_lr.fit(X_all, all_labels) weights = X_s.shape[0] / X_t.shape[0] * np.exp(np.dot(X_s, best_lr.coef_.T) + best_lr.intercept_) return weights
def test_ovr_multinomial_iris(): # Test that OvR and multinomial are correct using the iris dataset. train, target = iris.data, iris.target n_samples, n_features = train.shape # The cv indices from stratified kfold (where stratification is done based # on the fine-grained iris classes, i.e, before the classes 0 and 1 are # conflated) is used for both clf and clf1 n_cv = 2 cv = StratifiedKFold(n_cv) precomputed_folds = list(cv.split(train, target)) # Train clf on the original dataset where classes 0 and 1 are separated clf = LogisticRegressionCV(cv=precomputed_folds) clf.fit(train, target) # Conflate classes 0 and 1 and train clf1 on this modified dataset clf1 = LogisticRegressionCV(cv=precomputed_folds) target_copy = target.copy() target_copy[target_copy == 0] = 1 clf1.fit(train, target_copy) # Ensure that what OvR learns for class2 is same regardless of whether # classes 0 and 1 are separated or not assert_array_almost_equal(clf.scores_[2], clf1.scores_[2]) assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_) assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_) # Test the shape of various attributes. assert_equal(clf.coef_.shape, (3, n_features)) assert_array_equal(clf.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1)) assert_equal(clf.Cs_.shape, (10,)) scores = np.asarray(list(clf.scores_.values())) assert_equal(scores.shape, (3, n_cv, 10)) # Test that for the iris data multinomial gives a better accuracy than OvR for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']: max_iter = 2000 if solver in ['sag', 'saga'] else 15 clf_multi = LogisticRegressionCV( solver=solver, multi_class='multinomial', max_iter=max_iter, random_state=42, tol=1e-5 if solver in ['sag', 'saga'] else 1e-2, cv=2) clf_multi.fit(train, target) multi_score = clf_multi.score(train, target) ovr_score = clf.score(train, target) assert_greater(multi_score, ovr_score) # Test attributes of LogisticRegressionCV assert_equal(clf.coef_.shape, clf_multi.coef_.shape) assert_array_equal(clf_multi.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, n_cv, 10, n_features + 1)) assert_equal(clf_multi.Cs_.shape, (10,)) scores = np.asarray(list(clf_multi.scores_.values())) assert_equal(scores.shape, (3, n_cv, 10))
def test_logistic_cv(): # test for LogisticRegressionCV object n_samples, n_features = 50, 5 rng = np.random.RandomState(0) X_ref = rng.randn(n_samples, n_features) y = np.sign(X_ref.dot(5 * rng.randn(n_features))) X_ref -= X_ref.mean() X_ref /= X_ref.std() lr_cv = LogisticRegressionCV(Cs=[1.0], fit_intercept=False, solver="liblinear") lr_cv.fit(X_ref, y) lr = LogisticRegression(C=1.0, fit_intercept=False) lr.fit(X_ref, y) assert_array_almost_equal(lr.coef_, lr_cv.coef_) assert_array_equal(lr_cv.coef_.shape, (1, n_features)) assert_array_equal(lr_cv.classes_, [-1, 1]) assert_equal(len(lr_cv.classes_), 2) coefs_paths = np.asarray(list(lr_cv.coefs_paths_.values())) assert_array_equal(coefs_paths.shape, (1, 3, 1, n_features)) assert_array_equal(lr_cv.Cs_.shape, (1,)) scores = np.asarray(list(lr_cv.scores_.values())) assert_array_equal(scores.shape, (1, 3, 1))
def test_multinomial_logistic_regression_string_inputs(): # Test with string labels for LogisticRegression(CV) n_samples, n_features, n_classes = 50, 5, 3 X_ref, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative=3, random_state=0) y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y) # For numerical labels, let y values be taken from set (-1, 0, 1) y = np.array(y) - 1 # Test for string labels lr = LogisticRegression(solver='lbfgs', multi_class='multinomial') lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial') lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial') lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial') lr.fit(X_ref, y) lr_cv.fit(X_ref, y) lr_str.fit(X_ref, y_str) lr_cv_str.fit(X_ref, y_str) assert_array_almost_equal(lr.coef_, lr_str.coef_) assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo']) assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_) assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo']) assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo']) # The predictions should be in original labels assert_equal(sorted(np.unique(lr_str.predict(X_ref))), ['bar', 'baz', 'foo']) assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz', 'foo']) # Make sure class weights can be given with string labels lr_cv_str = LogisticRegression( solver='lbfgs', class_weight={'bar': 1, 'baz': 2, 'foo': 0}, multi_class='multinomial').fit(X_ref, y_str) assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
def test_ovr_multinomial_iris(): # Test that OvR and multinomial are correct using the iris dataset. train, target = iris.data, iris.target n_samples, n_features = train.shape # Use pre-defined fold as folds generated for different y cv = StratifiedKFold(target, 3) clf = LogisticRegressionCV(cv=cv) clf.fit(train, target) clf1 = LogisticRegressionCV(cv=cv) target_copy = target.copy() target_copy[target_copy == 0] = 1 clf1.fit(train, target_copy) assert_array_almost_equal(clf.scores_[2], clf1.scores_[2]) assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_) assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_) # Test the shape of various attributes. assert_equal(clf.coef_.shape, (3, n_features)) assert_array_equal(clf.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf.Cs_.shape, (10, )) scores = np.asarray(list(clf.scores_.values())) assert_equal(scores.shape, (3, 3, 10)) # Test that for the iris data multinomial gives a better accuracy than OvR for solver in ['lbfgs', 'newton-cg']: clf_multi = LogisticRegressionCV( solver=solver, multi_class='multinomial', max_iter=15 ) clf_multi.fit(train, target) multi_score = clf_multi.score(train, target) ovr_score = clf.score(train, target) assert_greater(multi_score, ovr_score) # Test attributes of LogisticRegressionCV assert_equal(clf.coef_.shape, clf_multi.coef_.shape) assert_array_equal(clf_multi.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf_multi.Cs_.shape, (10, )) scores = np.asarray(list(clf_multi.scores_.values())) assert_equal(scores.shape, (3, 3, 10))
def test_ovr_multinomial_iris(): # Test that OvR and multinomial are correct using the iris dataset. train, target = iris.data, iris.target n_samples, n_features = train.shape # Use pre-defined fold as folds generated for different y cv = StratifiedKFold(target, 3) clf = LogisticRegressionCV(cv=cv) clf.fit(train, target) clf1 = LogisticRegressionCV(cv=cv) target_copy = target.copy() target_copy[target_copy == 0] = 1 clf1.fit(train, target_copy) assert_array_almost_equal(clf.scores_[2], clf1.scores_[2]) assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_) assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_) # Test the shape of various attributes. assert_equal(clf.coef_.shape, (3, n_features)) assert_array_equal(clf.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf.Cs_.shape, (10, )) scores = np.asarray(list(clf.scores_.values())) assert_equal(scores.shape, (3, 3, 10)) # Test that for the iris data multinomial gives a better accuracy than OvR for solver in ['lbfgs', 'newton-cg']: clf_multi = LogisticRegressionCV(solver=solver, multi_class='multinomial', max_iter=15) clf_multi.fit(train, target) multi_score = clf_multi.score(train, target) ovr_score = clf.score(train, target) assert_greater(multi_score, ovr_score) # Test attributes of LogisticRegressionCV assert_equal(clf.coef_.shape, clf_multi.coef_.shape) assert_array_equal(clf_multi.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf_multi.Cs_.shape, (10, )) scores = np.asarray(list(clf_multi.scores_.values())) assert_equal(scores.shape, (3, 3, 10))
def test_liblinear_logregcv_sparse(): """Test LogRegCV with solver='liblinear' works for sparse matrices""" X, y = make_classification(n_samples=10, n_features=5) clf = LogisticRegressionCV(solver='liblinear') clf.fit(sparse.csr_matrix(X), y)
def test_saga_sparse(): # Test LogRegCV with solver='liblinear' works for sparse matrices X, y = make_classification(n_samples=10, n_features=5, random_state=0) clf = LogisticRegressionCV(solver='saga') clf.fit(sparse.csr_matrix(X), y)
def test_liblinear_logregcv_sparse(): # Test LogRegCV with solver='liblinear' works for sparse matrices X, y = make_classification(n_samples=10, n_features=5) clf = LogisticRegressionCV(solver='liblinear') clf.fit(sparse.csr_matrix(X), y)
print("The F score-Micro (MultinomialNB) is:", f_score) # Random Forest rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0) rf_clf.fit(X_train, y_train) rf_predictions = rf_clf.predict(X_test) score = accuracy_score(y_test, rf_predictions) f_score = f1_score(y_test, rf_predictions, average='micro') print("The accuracy score (Random Forest) is:", score) print("The F score-Micro (Random Forest) is:", f_score) from sklearn import metrics # Logistic Regression lr_classifier = LogisticRegressionCV() lr_classifier.fit(X_train, y_train) lt_predictions = lr_classifier.predict(X_test) score = accuracy_score(y_test, lt_predictions) f_score = f1_score(y_test, lt_predictions, average='micro') print("The accuracy score (Logistic Regression) is:", score) print("The F score-Micro (Logistic Regression) is:", f_score) cnf_matrix = metrics.confusion_matrix(y_test, rf_predictions) # import required modules import seaborn as sns class_names = [0, 1] # name of classes fig, ax = plt.subplots() tick_marks = np.arange(len(class_names)) plt.xticks(tick_marks, class_names)