def test_class_weight_auto_classifiers(): """Test that class_weight="auto" improves f1-score""" # This test is broken; its success depends on: # * a rare fortuitous RNG seed for make_classification; and # * the use of binary F1 over a seemingly arbitrary positive class for two # datasets, and weighted average F1 for the third. # Its expectations need to be clarified and reimplemented. raise SkipTest("This test requires redefinition") classifiers = all_estimators(type_filter="classifier") clean_warning_registry() with warnings.catch_warnings(record=True): classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()] for n_classes, weights in zip([2, 3], [[0.8, 0.2], [0.8, 0.1, 0.1]]): # create unbalanced dataset X, y = make_classification( n_classes=n_classes, n_samples=200, n_features=10, weights=weights, random_state=0, n_informative=n_classes ) X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) for name, Classifier in classifiers: if ( name != "NuSVC" # the sparse version has a parameter that doesn't do anything and not name.startswith("RidgeClassifier") # RidgeClassifier behaves unexpected # FIXME! and not name.endswith("NB") ): # NaiveBayes classifiers have a somewhat different interface. # FIXME SOON! yield (check_class_weight_auto_classifiers, name, Classifier, X_train, y_train, X_test, y_test, weights)
def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. rng = check_random_state(404) y_pred = rng.rand(10) # y_true contains only one class value y_true = np.zeros(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) y_true = np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) # y_true contains three different class values y_true = rng.randint(0, 3, size=10) assert_raise_message(ValueError, "multiclass format is not supported", roc_auc_score, y_true, y_pred) clean_warning_registry() with warnings.catch_warnings(record=True): rng = check_random_state(404) y_pred = rng.rand(10) # y_true contains only one class value y_true = np.zeros(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) y_true = np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) # y_true contains three different class values y_true = rng.randint(0, 3, size=10) assert_raise_message(ValueError, "multiclass format is not supported", roc_auc_score, y_true, y_pred)
def test_tfidf_no_smoothing(): X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) # check normalization assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.]) # the lack of smoothing make IDF fragile in the presence of feature with # only zeros X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') clean_warning_registry() with warnings.catch_warnings(record=True) as w: 1. / np.array([0.]) numpy_provides_div0_warning = len(w) == 1 in_warning_message = 'divide by zero' tfidf = assert_warns_message(RuntimeWarning, in_warning_message, tr.fit_transform, X).toarray() if not numpy_provides_div0_warning: raise SkipTest("Numpy does not provide div 0 warnings.")
def test_configure(): # Smoke test the 'configure' step of setup, this tests all the # 'configure' functions in the setup.pys in the scikit cwd = os.getcwd() setup_path = os.path.abspath(os.path.join(sklearn.__path__[0], "..")) setup_filename = os.path.join(setup_path, "setup.py") if not os.path.exists(setup_filename): return try: os.chdir(setup_path) old_argv = sys.argv sys.argv = ["setup.py", "config"] clean_warning_registry() with warnings.catch_warnings(): # The configuration spits out warnings when not finding # Blas/Atlas development headers warnings.simplefilter("ignore", UserWarning) if PY3: with open("setup.py") as f: exec(f.read(), dict(__name__="__main__")) else: execfile("setup.py", dict(__name__="__main__")) finally: sys.argv = old_argv os.chdir(cwd)
def _tested_linear_classifiers(): classifiers = all_estimators(type_filter='classifier') clean_warning_registry() with warnings.catch_warnings(record=True): for name, clazz in classifiers: if ('class_weight' in clazz().get_params().keys() and issubclass(clazz, LinearClassifierMixin)): yield name, clazz
def test_warning_scaling_integers(): # Check warning when scaling integer data X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) w = "Data with input dtype uint8 was converted to float64" clean_warning_registry() assert_warns_message(DataConversionWarning, w, scale, X) assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X) assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
def test_warning_scaling_integers(): # Check warning when scaling integer data X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) w = "assumes floating point values as input, got uint8" clean_warning_registry() assert_warns_message(UserWarning, w, scale, X) assert_warns_message(UserWarning, w, StandardScaler().fit, X) assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
def test_chi2_unused_feature(): # Unused feature should evaluate to NaN # and should issue no runtime warning clean_warning_registry() with warnings.catch_warnings(record=True) as warned: warnings.simplefilter('always') chi, p = chi2([[1, 0], [0, 0]], [1, 0]) for w in warned: if 'divide by zero' in repr(w): raise AssertionError('Found unexpected warning %s' % w) assert_array_equal(chi, [1, np.nan]) assert_array_equal(p[1], np.nan)
def test_class_weight_balanced_linear_classifiers(): classifiers = all_estimators(type_filter='classifier') clean_warning_registry() with warnings.catch_warnings(record=True): linear_classifiers = [ (name, clazz) for name, clazz in classifiers if 'class_weight' in clazz().get_params().keys() and issubclass(clazz, LinearClassifierMixin)] for name, Classifier in linear_classifiers: yield check_class_weight_balanced_linear_classifier, name, Classifier
def test_warning_scaling_integers(): """Check warning when scaling integer data""" X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8) clean_warning_registry() with warnings.catch_warnings(record=True): warnings.simplefilter("always") assert_warns(UserWarning, StandardScaler().fit, X) clean_warning_registry() with warnings.catch_warnings(record=True): warnings.simplefilter("always") assert_warns(UserWarning, MinMaxScaler().fit, X)
def test_recall_warnings(): assert_no_warnings(recall_score, np.array([[1, 1], [1, 1]]), np.array([[0, 0], [0, 0]]), average='micro') clean_warning_registry() with warnings.catch_warnings(record=True) as record: warnings.simplefilter('always') recall_score(np.array([[0, 0], [0, 0]]), np.array([[1, 1], [1, 1]]), average='micro') assert_equal(str(record.pop().message), 'Recall is ill-defined and ' 'being set to 0.0 due to no true samples.')
def test_class_weight_classifiers(): # test that class_weight works and that the semantics are consistent classifiers = all_estimators(type_filter="classifier") clean_warning_registry() with warnings.catch_warnings(record=True): classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()] for name, Classifier in classifiers: if name == "NuSVC": # the sparse version has a parameter that doesn't do anything continue if name.endswith("NB"): # NaiveBayes classifiers have a somewhat different interface. # FIXME SOON! continue yield check_class_weight_classifiers, name, Classifier
def test_warn_deviance(): """Test if mdeviance and bdeviance give deprecated warning. """ for loss in ('bdeviance', 'mdeviance'): clean_warning_registry() with warnings.catch_warnings(record=True) as w: # This will raise a DataConversionWarning that we want to # "always" raise, elsewhere the warnings gets ignored in the # later tests, and the tests that check for this warning fail warnings.simplefilter("always", DataConversionWarning) clf = GradientBoostingClassifier(loss=loss) try: clf.fit(X, y) except: # mdeviance will raise ValueError because only 2 classes pass # deprecated warning for bdeviance and mdeviance assert len(w) == 1
def test_fscore_warnings(): clean_warning_registry() with warnings.catch_warnings(record=True) as record: warnings.simplefilter('always') for score in [f1_score, partial(fbeta_score, beta=2)]: score(np.array([[1, 1], [1, 1]]), np.array([[0, 0], [0, 0]]), average='micro') assert_equal(str(record.pop().message), 'F-score is ill-defined and ' 'being set to 0.0 due to no predicted samples.') score(np.array([[0, 0], [0, 0]]), np.array([[1, 1], [1, 1]]), average='micro') assert_equal(str(record.pop().message), 'F-score is ill-defined and ' 'being set to 0.0 due to no true samples.')
def test_class_weight_auto_linear_classifiers(): classifiers = all_estimators(type_filter="classifier") clean_warning_registry() with warnings.catch_warnings(record=True): linear_classifiers = [ (name, clazz) for name, clazz in classifiers if "class_weight" in clazz().get_params().keys() and issubclass(clazz, LinearClassifierMixin) ] for name, Classifier in linear_classifiers: if name == "LogisticRegressionCV": # Contrary to RidgeClassifierCV, LogisticRegressionCV use actual # CV folds and fit a model for each CV iteration before averaging # the coef. Therefore it is expected to not behave exactly as the # other linear model. continue yield check_class_weight_auto_linear_classifier, name, Classifier
def test_configure(): # Smoke test the 'configure' step of setup, this tests all the # 'configure' functions in the setup.pys in scikit-learn cwd = os.getcwd() setup_path = os.path.abspath(os.path.join(sklearn.__path__[0], '..')) setup_filename = os.path.join(setup_path, 'setup.py') if not os.path.exists(setup_filename): return try: os.chdir(setup_path) old_argv = sys.argv sys.argv = ['setup.py', 'config'] clean_warning_registry() with warnings.catch_warnings(): # The configuration spits out warnings when not finding # Blas/Atlas development headers warnings.simplefilter('ignore', UserWarning) with open('setup.py') as f: exec(f.read(), dict(__name__='__main__')) finally: sys.argv = old_argv os.chdir(cwd)
def test_tfidf_no_smoothing(): X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) # check normalization assert_array_almost_equal((tfidf**2).sum(axis=1), [1., 1., 1.]) # the lack of smoothing make IDF fragile in the presence of feature with # only zeros X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') clean_warning_registry() with warnings.catch_warnings(record=True) as w: 1. / np.array([0.]) numpy_provides_div0_warning = len(w) == 1 in_warning_message = 'divide by zero' tfidf = assert_warns_message(RuntimeWarning, in_warning_message, tr.fit_transform, X).toarray() if not numpy_provides_div0_warning: raise SkipTest("Numpy does not provide div 0 warnings.")
def test_l2_deprecation(): clean_warning_registry() with warnings.catch_warnings(record=True) as w: assert_equal(l1_min_c(dense_X, Y1, "l2"), l1_min_c(dense_X, Y1, "squared_hinge")) assert_equal(w[0].category, DeprecationWarning)
def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) clean_warning_registry() with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) clean_warning_registry() with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) clean_warning_registry() with warnings.catch_warnings(record=True): scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert_false(np.any(np.isnan(X_csr_scaled.data))) clean_warning_registry() with warnings.catch_warnings(record=True): scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csr.transform(X_csc, copy=True) assert_false(np.any(np.isnan(X_csc_scaled.data))) assert_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.std_, scaler_csr.std_) assert_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.std_, scaler_csc.std_) assert_array_almost_equal( X_scaled.mean(axis=0), [0., 1.109, 1.856, 21., 1.559], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis( X_csr_scaled.astype(np.float), 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert_true(X_scaled is not X) assert_true(X_csr_scaled is not X_csr) X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert_true(X_csr_scaled_back is not X_csr) assert_true(X_csr_scaled_back is not X_csr_scaled) assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert_true(X_csc_scaled_back is not X_csc) assert_true(X_csc_scaled_back is not X_csc_scaled) assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) clean_warning_registry() with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) clean_warning_registry() with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) clean_warning_registry() with warnings.catch_warnings(record=True): scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert_false(np.any(np.isnan(X_csr_scaled.data))) clean_warning_registry() with warnings.catch_warnings(record=True): scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csr.transform(X_csc, copy=True) assert_false(np.any(np.isnan(X_csc_scaled.data))) assert_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.std_, scaler_csr.std_) assert_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.std_, scaler_csc.std_) assert_array_almost_equal(X_scaled.mean(axis=0), [0., 1.109, 1.856, 21., 1.559], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis( X_csr_scaled.astype(np.float), 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert_true(X_scaled is not X) assert_true(X_csr_scaled is not X_csr) X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert_true(X_csr_scaled_back is not X_csr) assert_true(X_csr_scaled_back is not X_csr_scaled) assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert_true(X_csc_scaled_back is not X_csc) assert_true(X_csc_scaled_back is not X_csc_scaled) assert_array_almost_equal(X_csc_scaled_back.toarray(), X)