def test_sparsify_estimators():
    """Test if predict with sparsified estimators works.

    Tests regression, binary classification, and multi-class classification.
    """
    estimators = all_estimators()
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y = [1, 1, 1, 2, 2, 2]

    # test regression and binary classification
    for name, Estimator in estimators:
        try:
            Estimator.sparsify
        except AttributeError:
            continue
        yield check_sparsify_binary_classifier, name, Estimator, X, y

    # test multiclass classification
    classifiers = all_estimators(type_filter='classifier')
    y[-1] = 3  # make multi-class
    for name, Classifier in classifiers:
        try:
            Classifier.sparsify
        except AttributeError:
            continue
        yield check_sparsify_multiclass_classifier, name, Classifier, X, y
def test_sparsify_estimators():
    """Test if predict with sparsified estimators works.

    Tests regression, binary classification, and multi-class classification.
    """
    estimators = all_estimators()
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y = [1, 1, 1, 2, 2, 2]

    # test regression and binary classification
    for name, Estimator in estimators:
        try:
            Estimator.sparsify
        except AttributeError:
            continue
        est = Estimator()
        est.fit(X, y)
        pred_orig = est.predict(X)

        # test sparsify with dense inputs
        est.sparsify()
        assert_true(sparse.issparse(est.coef_))
        pred = est.predict(X)
        assert_array_equal(pred, pred_orig)

        # pickle and unpickle with sparse coef_
        est = pickle.loads(pickle.dumps(est))
        assert_true(sparse.issparse(est.coef_))
        pred = est.predict(X)
        assert_array_equal(pred, pred_orig)

    # test multiclass classification
    classifiers = all_estimators(type_filter='classifier')
    y[-1] = 3  # make multi-class
    for name, Classifier in classifiers:
        try:
            Classifier.sparsify
        except AttributeError:
            continue
        est = Classifier()
        est.fit(X, y)
        pred_orig = est.predict(X)

        # test sparsify with dense inputs
        est.sparsify()
        assert_true(sparse.issparse(est.coef_))
        pred = est.predict(X)
        assert_array_equal(pred, pred_orig)

        # pickle and unpickle with sparse coef_
        est = pickle.loads(pickle.dumps(est))
        assert_true(sparse.issparse(est.coef_))
        pred = est.predict(X)
        assert_array_equal(pred, pred_orig)
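# Hedged standalone sketch (not from the original source): the same
# sparsify() round-trip on one known estimator. Assumes scikit-learn's
# public SGDClassifier API; any linear model exposing coef_ would do.
import pickle

import numpy as np
from scipy import sparse
from sklearn.linear_model import SGDClassifier

X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
y = [1, 1, 1, 2, 2, 2]

est = SGDClassifier(random_state=0).fit(X, y)
pred_orig = est.predict(X)

est.sparsify()  # converts coef_ to a CSR matrix in place
assert sparse.issparse(est.coef_)
assert (est.predict(X) == pred_orig).all()

# predictions survive a pickle round-trip with sparse coefficients
est = pickle.loads(pickle.dumps(est))
assert (est.predict(X) == pred_orig).all()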
def discover_supervised():
    classifiers = all_estimators(type_filter="classifier")
    regressors = all_estimators(type_filter="regressor")
    classes = []
    for name, Est in classifiers + regressors:
        if issubclass(Est, ClassifierMixin):
            namespace = "classifiers"
        else:
            namespace = "regressors"
        classes.append(make_module(name, Est, namespace, supervised=True))
    return classes
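# make_module is project-specific and not shown in this collection. A
# minimal hypothetical stand-in, only so discover_supervised() above can be
# exercised; the real helper presumably builds a richer wrapper object.
def make_module(name, Est, namespace, supervised=False):
    return {"name": name, "class": Est, "namespace": namespace,
            "supervised": supervised}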
def test_non_meta_estimators():
    # input validation etc. for non-meta estimators
    estimators = all_estimators()
    for name, Estimator in estimators:
        if issubclass(Estimator, BiclusterMixin):
            continue
        if name.endswith("HMM") or name.startswith("_"):
            continue
        if name not in CROSS_DECOMPOSITION:
            yield check_estimators_dtypes, name, Estimator
            yield check_fit_score_takes_y, name, Estimator
            yield check_dtype_object, name, Estimator

            # Check that all estimators yield informative messages when
            # trained on empty datasets
            yield check_estimators_empty_data_messages, name, Estimator

        if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']:
            # SpectralEmbedding is non-deterministic,
            # see issue #4236
            yield check_pipeline_consistency, name, Estimator

        if name not in CROSS_DECOMPOSITION + ['Imputer']:
            # Test that all estimators check their input for NaN's and infs
            yield check_estimators_nan_inf, name, Estimator

        if name not in CROSS_DECOMPOSITION + ['GaussianProcess']:
            # FIXME!
            # in particular GaussianProcess!
            yield check_estimators_overwrite_params, name, Estimator

        if hasattr(Estimator, 'sparsify'):
            yield check_sparsify_coefficients, name, Estimator

        yield check_estimator_sparse_data, name, Estimator
def _tested_non_meta_estimators():
    for name, Estimator in all_estimators():
        if issubclass(Estimator, BiclusterMixin):
            continue
        if name.startswith("_"):
            continue
        yield name, Estimator
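# Hedged usage note (not from the original source): generators like the one
# above are typically consumed via pytest parametrization, as the later
# snippets in this collection do, e.g.:
#
#   @pytest.mark.parametrize("name, Estimator",
#                            _tested_non_meta_estimators())
#   def test_some_property(name, Estimator):
#       ...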
def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter="classifier")

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if "class_weight" in c[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}
            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter="regressor")
    X, _ = _boston_subset()
    X = X[:50]
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    for name, Regressor in regressors:
        if name in dont_test or name in ("CCA",):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            regressor_1 = Regressor()
            regressor_2 = Regressor()
        set_random_state(regressor_1)
        set_random_state(regressor_2)

        if name in ("_PLS", "PLSCanonical", "PLSRegression"):
            y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        regressor_1.fit(X, y_)
        pred1 = regressor_1.predict(X)
        regressor_2.fit(X, y_.astype(np.float64))
        pred2 = regressor_2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
def test_estimators_sparse_data():
    # All estimators should either deal with sparse data, or raise an
    # intelligible error message
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10)
    X[X < .8] = 0
    X = sparse.csr_matrix(X)
    y = (4 * rng.rand(40)).astype(int)
    estimators = all_estimators()
    estimators = [(name, E) for name, E in estimators
                  if issubclass(E, (ClassifierMixin, RegressorMixin))]
    for name, Clf in estimators:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        try:
            clf.fit(X, y)
        except TypeError as e:
            if 'sparse' not in repr(e):
                print("Estimator %s doesn't seem to fail gracefully on "
                      "sparse data" % name)
                traceback.print_exc(file=sys.stdout)
                raise e
        except Exception as exc:
            print("Estimator %s doesn't seem to fail gracefully on "
                  "sparse data" % name)
            traceback.print_exc(file=sys.stdout)
            raise exc
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non-negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time.
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # and these don't handle multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter='regressor')
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = StandardScaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in (CCA,):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        set_random_state(reg1)
        set_random_state(reg2)

        if Reg in (_PLS, PLSCanonical, PLSRegression):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        reg1.fit(X, y_)
        pred1 = reg1.predict(X)
        reg2.fit(X, y_.astype(np.float64))
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
        assert_array_equal(
            clf.classes_, classes,
            "Unexpected classes_ attribute for %r" % clf)
def inspect(afilter='classifier', parameter='sample_weight'):
    """Help you inspect which estimators of the given type accept
    `parameter` in their ``fit`` signature."""
    import inspect
    from sklearn.utils.testing import all_estimators

    for name, clf in all_estimators(type_filter=afilter):
        if parameter in inspect.getfullargspec(clf().fit)[0]:
            print(name)
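# Hedged usage example (not part of the original source): print the names
# of all classifiers whose fit() accepts sample_weight. The exact output
# depends on the installed scikit-learn version.
inspect(afilter='classifier', parameter='sample_weight')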
def test_estimators_nan_inf():
    # Test that all estimators check their input for NaN's and infs
    estimators = all_estimators(type_filter=['classifier', 'regressor',
                                             'transformer', 'cluster'])
    for name, Estimator in estimators:
        if name not in CROSS_DECOMPOSITION + ['Imputer']:
            yield check_estimators_nan_inf, name, Estimator
def test_estimators_nan_inf():
    # Test that all estimators check their input for NaN's and infs
    rnd = np.random.RandomState(0)
    X_train_finite = rnd.uniform(size=(10, 3))
    X_train_nan = rnd.uniform(size=(10, 3))
    X_train_nan[0, 0] = np.nan
    X_train_inf = rnd.uniform(size=(10, 3))
    X_train_inf[0, 0] = np.inf
    y = np.ones(10)
    y[:5] = 0
    estimators = all_estimators()
    estimators = [(name, E) for name, E in estimators
                  if (issubclass(E, ClassifierMixin) or
                      issubclass(E, RegressorMixin) or
                      issubclass(E, TransformerMixin) or
                      issubclass(E, ClusterMixin))]
    for X_train in [X_train_nan, X_train_inf]:
        for name, Estimator in estimators:
            if name in dont_test:
                continue
            if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD',
                        'Imputer'):  # Imputer accepts nan
                continue
            yield (check_estimators_nan_inf, name, Estimator, X_train,
                   X_train_finite,
                   multioutput_estimator_convert_y_2d(name, y))
def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if 'class_weight' in c[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.5, random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            check_class_weight_classifiers.description = \
                "check_class_weight_classifiers(%s, %d)" % (name, n_centers)
            yield (check_class_weight_classifiers, name, Classifier,
                   X_train, y_train, X_test, y_test)
def test_class_weight_auto_classifiers():
    # test that class_weight="auto" improves f1-score
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if 'class_weight' in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]):
        # create unbalanced dataset
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.5, random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.startswith("RidgeClassifier"):
                # RidgeClassifier behaves unexpectedly
                # FIXME!
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            check_class_weight_auto_classifiers.description = \
                ("check_class_weight_auto_classifiers(%s, %d)"
                 % (name, n_classes))
            yield (check_class_weight_auto_classifiers, name, Classifier,
                   X_train, y_train, X_test, y_test, weights)
def test_transformers_sparse_data():
    # All transformers should either deal with sparse data, or raise an
    # intelligible error message
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10)
    X[X < 0.8] = 0
    X = sparse.csr_matrix(X)
    y = (4 * rng.rand(40)).astype(int)
    estimators = all_estimators()
    estimators = [(name, E) for name, E in estimators
                  if issubclass(E, TransformerMixin)]
    for name, Trans in estimators:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            if Trans in [Scaler, StandardScaler]:
                trans = Trans(with_mean=False)
            else:
                trans = Trans()
        # fit
        try:
            trans.fit(X, y)
        except TypeError as e:
            if "sparse" not in repr(e):
                print("Estimator %s doesn't seem to fail gracefully on "
                      "sparse data" % name)
                traceback.print_exc(file=sys.stdout)
                raise e
        except Exception as exc:
            print("Estimator %s doesn't seem to fail gracefully on "
                  "sparse data" % name)
            traceback.print_exc(file=sys.stdout)
            raise exc
def test_regressors_train():
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        reg.fit(X, y)
        reg.predict(X)
        assert_greater(reg.score(X, y), 0.5)
def test_non_transformer_estimators_n_iter():
    # Test that all non-transformer estimators that have a max_iter
    # attribute report n_iter_ of at least 1 after fitting.
    for est_type in ['regressor', 'classifier', 'cluster']:
        regressors = all_estimators(type_filter=est_type)
        for name, Estimator in regressors:
            # LassoLars stops early for the default alpha=1.0 on
            # the iris dataset.
            if name == 'LassoLars':
                estimator = Estimator(alpha=0.)
            else:
                estimator = Estimator()
            if hasattr(estimator, "max_iter"):
                # These models depend on external solvers like
                # libsvm, where accessing the iter parameter is non-trivial.
                if name in (['Ridge', 'SVR', 'NuSVR', 'NuSVC',
                             'RidgeClassifier', 'SVC', 'RandomizedLasso',
                             'LogisticRegressionCV']):
                    continue

                # Tested in test_transformer_n_iter below
                elif (name in CROSS_DECOMPOSITION or
                      name in ['LinearSVC', 'LogisticRegression']):
                    continue

                else:
                    # Multitask models related to ENet cannot handle
                    # mono-output y.
                    yield (check_non_transformer_estimators_n_iter,
                           name, estimator, 'Multi' in name)
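# Hedged standalone sketch (not from the original source) of the property
# this test enforces: after fit, an iterative estimator should expose
# n_iter_ >= 1. Uses scikit-learn's public LogisticRegression API.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=200).fit(X, y)
assert np.all(np.asarray(clf.n_iter_) >= 1)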
def test_class_weight_auto_classifiers():
    """Test that class_weight="auto" improves f1-score"""
    # This test is broken; its success depends on:
    # * a rare fortuitous RNG seed for make_classification; and
    # * the use of binary F1 over a seemingly arbitrary positive class for
    #   two datasets, and weighted average F1 for the third.
    # Its expectations need to be clarified and reimplemented.
    raise SkipTest("This test requires redefinition")

    classifiers = all_estimators(type_filter="classifier")
    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if "class_weight" in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[0.8, 0.2], [0.8, 0.1, 0.1]]):
        # create unbalanced dataset
        X, y = make_classification(
            n_classes=n_classes, n_samples=200, n_features=10,
            weights=weights, random_state=0, n_informative=n_classes
        )
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=0)

        for name, Classifier in classifiers:
            if (
                    name != "NuSVC"
                    # the sparse version has a parameter that doesn't do
                    # anything
                    and not name.startswith("RidgeClassifier")
                    # RidgeClassifier behaves unexpectedly
                    # FIXME!
                    and not name.endswith("NB")
            ):
                # NaiveBayes classifiers have a somewhat different
                # interface.
                # FIXME SOON!
                yield (check_class_weight_auto_classifiers, name,
                       Classifier, X_train, y_train, X_test, y_test,
                       weights)
def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        if not hasattr(transformer, 'transform'):
            continue
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10,
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be
            # greater than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1

        # fit
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y,
                            2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        transformer.fit(X, y_)
        X_pred = transformer.fit(X, y_).transform(X)
        pickled_transformer = pickle.dumps(transformer)
        unpickled_transformer = pickle.loads(pickled_transformer)
        pickled_X_pred = unpickled_transformer.transform(X)

        try:
            assert_array_almost_equal(pickled_X_pred, X_pred)
        except Exception as exc:
            succeeded = False
            print("Transformer %s doesn't predict the same value "
                  "after pickling" % name)
            raise exc

    assert_true(succeeded)
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # raises error on malformed input for fit
        assert_raises(ValueError, clf.fit, X, y[:-1])

        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape, (n_samples,))
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                # raises error on malformed input
                assert_raises(ValueError, clf.decision_function, X.T)
                if not isinstance(clf, BaseLibSVM):
                    # one-vs-one of LibSVM works differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(clf, "predict_proba"):
            try:
                # predict_proba agrees with predict:
                y_prob = clf.predict_proba(X)
                assert_equal(y_prob.shape, (n_samples, n_labels))
                # raises error on malformed input
                assert_raises(ValueError, clf.predict_proba, X.T)
                assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                # raises error on malformed input for predict_proba
                assert_raises(ValueError, clf.predict_proba, X.T)
            except NotImplementedError:
                pass
def test_all_estimators():
    # Test that estimators are default-constructible, clonable
    # and have working repr.
    estimators = all_estimators(include_meta_estimators=True)
    classifier = LDA()

    for name, Estimator in estimators:
        # some can just not be sensibly default constructed
        if name in dont_test:
            continue
        # test default-constructibility
        # get rid of deprecation warnings
        with warnings.catch_warnings(record=True):
            if name in meta_estimators:
                estimator = Estimator(classifier)
            else:
                estimator = Estimator()
            # test cloning
            clone(estimator)
            # test __repr__
            repr(estimator)
            # test that set_params returns self
            assert_true(isinstance(estimator.set_params(), Estimator))

            # test if init does nothing but set parameters
            # this is important for grid_search etc.
            # We get the default parameters from init and then
            # compare these against the actual values of the attributes.

            # this comes from getattr. Gets rid of deprecation decorator.
            init = getattr(estimator.__init__, 'deprecated_original',
                           estimator.__init__)
            try:
                args, varargs, kws, defaults = inspect.getargspec(init)
            except TypeError:
                # init is not a python function.
                # true for mixins
                continue
            params = estimator.get_params()
            if name in meta_estimators:
                # they need a non-default argument
                args = args[2:]
            else:
                args = args[1:]
            if args:
                # non-empty list
                assert_equal(len(args), len(defaults))
            else:
                continue
            for arg, default in zip(args, defaults):
                if arg not in params.keys():
                    # deprecated parameter, not in get_params
                    assert_true(default is None)
                    continue
                if isinstance(params[arg], np.ndarray):
                    assert_array_equal(params[arg], default)
                else:
                    assert_equal(params[arg], default)
def test_cluster_overwrite_params():
    # test whether any clusterer overwrites its init parameters during fit
    clusterers = all_estimators(type_filter="cluster")
    X, y = make_blobs(random_state=0, n_samples=9)
    # some want non-negative input
    X -= X.min()
    for name, Clustering in clusterers:
        yield check_cluster_overwrite_params, name, Clustering, X, y
def test_estimators_sparse_data():
    # All estimators should either deal with sparse data or raise an
    # exception with type TypeError and an intelligible error message
    estimators = all_estimators()
    estimators = [(name, Estimator) for name, Estimator in estimators
                  if issubclass(Estimator,
                                (ClassifierMixin, RegressorMixin))]
    for name, Estimator in estimators:
        yield check_regressors_classifiers_sparse_data, name, Estimator
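# check_regressors_classifiers_sparse_data is defined elsewhere in the
# original test module. A hedged sketch of what such a check plausibly does
# (an assumption, not the real helper):
import numpy as np
from scipy import sparse


def check_regressors_classifiers_sparse_data(name, Estimator):
    rng = np.random.RandomState(0)
    X = sparse.csr_matrix(rng.rand(40, 10))
    y = (4 * rng.rand(40)).astype(int)
    try:
        Estimator().fit(X, y)
    except TypeError as e:
        # failing is acceptable, but the message should mention sparse input
        assert 'sparse' in repr(e), name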
def _tested_linear_classifiers():
    classifiers = all_estimators(type_filter='classifier')

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        for name, clazz in classifiers:
            if ('class_weight' in clazz().get_params().keys() and
                    issubclass(clazz, LinearClassifierMixin)):
                yield name, clazz
def test_classifiers_data_not_an_array():
    classifiers = all_estimators(type_filter="classifier")
    X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]])
    y = [1, 1, 1, 2, 2, 2]
    for name, Classifier in classifiers:
        if name in dont_test:
            continue
        yield (check_estimators_data_not_an_array, name, Classifier, X,
               multioutput_estimator_convert_y_2d(name, y))
def test_estimators_overwrite_params():
    # test whether any estimator overwrites its init parameters during fit
    for est_type in ["classifier", "regressor", "transformer"]:
        estimators = all_estimators(type_filter=est_type)
        for name, Estimator in estimators:
            if name not in ["CCA", "_CCA", "PLSCanonical", "PLSRegression",
                            "PLSSVD", "GaussianProcess"]:
                # FIXME!
                # in particular GaussianProcess!
                yield check_estimators_overwrite_params, name, Estimator
def test_get_params_invariance():
    # Test that, for estimators supporting get_params,
    # get_params(deep=False) is a subset of get_params(deep=True).
    # Related to issue #4465
    estimators = all_estimators(include_meta_estimators=False,
                                include_other=True)
    for name, Estimator in estimators:
        if hasattr(Estimator, 'get_params'):
            yield check_get_params_invariance, name, Estimator
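# Hedged standalone sketch (not from the original source) of the invariance
# itself, on a real composite estimator: every shallow parameter must appear
# unchanged among the deep parameters.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([("scale", StandardScaler())])
shallow = pipe.get_params(deep=False)
deep = pipe.get_params(deep=True)
assert all(item in deep.items() for item in shallow.items())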
from sklearn.utils import IS_PYPY
from sklearn.utils.estimator_checks import (
    _yield_all_checks,
    _safe_tags,
    set_checking_parameters,
    check_parameters_default_constructible,
    check_no_attributes_set_in_init,
    check_class_weight_balanced_linear_classifier)


def test_all_estimator_no_base_class():
    # test that all_estimators doesn't find abstract classes.
    for name, Estimator in all_estimators():
        msg = ("Base estimators such as {0} should not be included"
               " in all_estimators").format(name)
        assert not name.lower().startswith('base'), msg


@pytest.mark.parametrize('name, Estimator', all_estimators())
def test_parameters_default_constructible(name, Estimator):
    # Test that estimators are default-constructible
    check_parameters_default_constructible(name, Estimator)


def _tested_estimators():
    for name, Estimator in all_estimators():
        if issubclass(Estimator, BiclusterMixin):
            continue
        if name.startswith("_"):
            continue
        # FIXME _skip_test should be used here (if we could)
        required_parameters = getattr(Estimator, "_required_parameters", [])
        if len(required_parameters):
import warnings
warnings.filterwarnings('ignore')

# 1. Data
iris = pd.read_csv('./data/csv/iris.csv', header=0)
x = iris.iloc[:, 0:4]
y = iris.iloc[:, 4]
print(x)
print(y)

# x_train, x_test, y_train, y_test = train_test_split(
#     x, y, test_size=0.2, random_state=6)

kfold = KFold(n_splits=5, shuffle=True)

# list every classification model
allAlgorithms = all_estimators(type_filter='classifier')

for (name, algorithm) in allAlgorithms:
    model = algorithm()
    scores = cross_val_score(model, x, y, cv=kfold)
    print(name, "accuracy =")
    print(scores)
    # model.fit(x, y)

import sklearn
print(sklearn.__version__)
def test_estimators_nan_inf():
    # Test that all estimators check their input for NaN's and infs
    rnd = np.random.RandomState(0)
    X_train_finite = rnd.uniform(size=(10, 3))
    X_train_nan = rnd.uniform(size=(10, 3))
    X_train_nan[0, 0] = np.nan
    X_train_inf = rnd.uniform(size=(10, 3))
    X_train_inf[0, 0] = np.inf
    y = np.ones(10)
    y[:5] = 0
    estimators = all_estimators()
    estimators = [(name, E) for name, E in estimators
                  if (issubclass(E, ClassifierMixin) or
                      issubclass(E, RegressorMixin) or
                      issubclass(E, TransformerMixin) or
                      issubclass(E, ClusterMixin))]
    error_string_fit = "Estimator doesn't check for NaN and inf in fit."
    error_string_predict = ("Estimator doesn't check for NaN and inf in"
                            " predict.")
    error_string_transform = ("Estimator doesn't check for NaN and inf in"
                              " transform.")
    for X_train in [X_train_nan, X_train_inf]:
        for name, Estimator in estimators:
            if name in dont_test:
                continue
            if name in ('_PLS', 'PLSCanonical', 'PLSRegression', 'CCA',
                        'PLSSVD'):
                continue

            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                estimator = Estimator()
                if name in ['GaussianRandomProjection',
                            'SparseRandomProjection']:
                    # Due to the jl lemma and very few samples, the number
                    # of components of the random matrix projection will be
                    # greater than the number of features.
                    # So we impose a smaller number (avoid "auto" mode)
                    estimator = Estimator(n_components=1)

                set_random_state(estimator, 1)

                # try to fit
                try:
                    if issubclass(Estimator, ClusterMixin):
                        estimator.fit(X_train)
                    else:
                        estimator.fit(X_train, y)
                except ValueError as e:
                    if 'inf' not in repr(e) and 'NaN' not in repr(e):
                        print(error_string_fit, Estimator, e)
                        traceback.print_exc(file=sys.stdout)
                        raise e
                except Exception as exc:
                    print(error_string_fit, Estimator, exc)
                    traceback.print_exc(file=sys.stdout)
                    raise exc
                else:
                    raise AssertionError(error_string_fit, Estimator)

                # actually fit
                if issubclass(Estimator, ClusterMixin):
                    # All estimators except clustering algorithms
                    # support fitting with (optional) y
                    estimator.fit(X_train_finite)
                else:
                    estimator.fit(X_train_finite, y)

                # predict
                if hasattr(estimator, "predict"):
                    try:
                        estimator.predict(X_train)
                    except ValueError as e:
                        if 'inf' not in repr(e) and 'NaN' not in repr(e):
                            print(error_string_predict, Estimator, e)
                            traceback.print_exc(file=sys.stdout)
                            raise e
                    except Exception as exc:
                        print(error_string_predict, Estimator, exc)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        raise AssertionError(error_string_predict,
                                             Estimator)

                # transform
                if hasattr(estimator, "transform"):
                    try:
                        estimator.transform(X_train)
                    except ValueError as e:
                        if 'inf' not in repr(e) and 'NaN' not in repr(e):
                            print(error_string_transform, Estimator, e)
                            traceback.print_exc(file=sys.stdout)
                            raise e
                    except Exception as exc:
                        print(error_string_transform, Estimator, exc)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        raise AssertionError(error_string_transform,
                                             Estimator)
def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        if not hasattr(transformer, 'transform'):
            continue
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10,
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be
            # greater than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1

        # fit
        if name in ('_PLS', 'PLSCanonical', 'PLSRegression', 'CCA',
                    'PLSSVD'):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y,
                            2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        transformer.fit(X, y_)
        X_pred = transformer.fit(X, y_).transform(X)
        pickled_transformer = pickle.dumps(transformer)
        unpickled_transformer = pickle.loads(pickled_transformer)
        pickled_X_pred = unpickled_transformer.transform(X)

        try:
            assert_array_almost_equal(pickled_X_pred, X_pred)
        except Exception as exc:
            succeeded = False
            print("Transformer %s doesn't predict the same value "
                  "after pickling" % name)
            raise exc

    assert_true(succeeded)
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    classifiers = all_estimators(type_filter='classifier')
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        for name, Classifier in classifiers:
            if name in dont_test:
                continue
            if name in ['MultinomialNB', 'BernoulliNB']:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                classifier = Classifier()
            # raises error on malformed input for fit
            assert_raises(ValueError, classifier.fit, X, y[:-1])

            # fit
            classifier.fit(X, y)
            assert_true(hasattr(classifier, "classes_"))
            y_pred = classifier.predict(X)
            assert_equal(y_pred.shape, (n_samples,))
            # training set performance
            assert_greater(accuracy_score(y, y_pred), 0.85)

            # raises error on malformed input for predict
            assert_raises(ValueError, classifier.predict, X.T)
            if hasattr(classifier, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = classifier.decision_function(X)
                    if n_classes == 2:
                        assert_equal(decision.ravel().shape, (n_samples,))
                        dec_pred = (decision.ravel() > 0).astype(int)
                        assert_array_equal(dec_pred, y_pred)
                    if (n_classes == 3 and
                            not isinstance(classifier, BaseLibSVM)):
                        # one-vs-one of LibSVM works differently
                        assert_equal(decision.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(decision, axis=1),
                                           y_pred)
                    # raises error on malformed input
                    assert_raises(ValueError,
                                  classifier.decision_function, X.T)
                    # raises error on malformed input for decision_function
                    assert_raises(ValueError,
                                  classifier.decision_function, X.T)
                except NotImplementedError:
                    pass
            if hasattr(classifier, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = classifier.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # check that probas for all classes sum to one
                    assert_array_almost_equal(np.sum(y_prob, axis=1),
                                              np.ones(n_samples))
                    # raises error on malformed input
                    assert_raises(ValueError, classifier.predict_proba,
                                  X.T)
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, classifier.predict_proba,
                                  X.T)
                except NotImplementedError:
                    pass
            str(e) for e in descriptions)
    else:
        completeDescription = str(type_) + "\n \n" + " ".join(
            str(e) for e in descriptions)

    # add into the dict, at the key (name of the param), a tuple
    # (instance of type, default value of the param, description)
    dico[name] = (type_map.get(types.group()), classifierTemp[name],
                  completeDescription)
    return dico


# ---------------------------------------------------------------------------
# -------------------------- retrieve all estimators -------------------------
# ---------------------------------------------------------------------------

for name, class_ in all_estimators():
    # Retrieve the type of the current estimator
    typeclass = str(getattr(class_, "_estimator_type", None))
    # Skip private estimators and keep only classifiers
    if "_" not in name and typeclass == "classifier":
        # Retrieve the module path of the current estimator from scikit-learn
        modulePath = str(class_).split("'")[1]
        # If the classifier's name appears in the path, remove it from
        # the module path
        if name in modulePath:
            # Strip the name from the module path and store a just-in-time
            # import under the key (name of the classifier)
            dictEstimator[name] = getattr(
    roc_auc_score,
    f1_score,
    r2_score,
    mean_squared_error,
)
import warnings

import xgboost
# import catboost
import lightgbm

warnings.filterwarnings("ignore")

pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

CLASSIFIERS = [est for est in all_estimators()
               if issubclass(est[1], ClassifierMixin)]
REGRESSORS = [est for est in all_estimators()
              if issubclass(est[1], RegressorMixin)]

removed_classifiers = [
    ("CheckingClassifier", sklearn.utils._mocking.CheckingClassifier),
    ("ClassifierChain", sklearn.multioutput.ClassifierChain),
    ("ComplementNB", sklearn.naive_bayes.ComplementNB),
    (
        "GradientBoostingClassifier",
        sklearn.ensemble.gradient_boosting.GradientBoostingClassifier,
    ),
    (
        "GaussianProcessClassifier",
        sklearn.gaussian_process.gpc.GaussianProcessClassifier,
    ),
    (
from sklearn.compose import ColumnTransformer
from sklearn.utils.testing import all_estimators
from sklearn.base import RegressorMixin
from sklearn.base import ClassifierMixin
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             roc_auc_score, f1_score, r2_score,
                             mean_squared_error)
import warnings

import xgboost
# import catboost
import lightgbm

warnings.filterwarnings("ignore")

pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

CLASSIFIERS = [
    est for est in all_estimators() if issubclass(est[1], ClassifierMixin)
]
REGRESSORS = [
    est for est in all_estimators() if issubclass(est[1], RegressorMixin)
]

removed_classifiers = [
    ('ClassifierChain', sklearn.multioutput.ClassifierChain),
    ('ComplementNB', sklearn.naive_bayes.ComplementNB),
    ('GradientBoostingClassifier',
     sklearn.ensemble.gradient_boosting.GradientBoostingClassifier),
    ('GaussianProcessClassifier',
     sklearn.gaussian_process.gpc.GaussianProcessClassifier),
    ('HistGradientBoostingClassifier',
     sklearn.ensemble._hist_gradient_boosting.gradient_boosting.
     HistGradientBoostingClassifier),
    ('MLPClassifier',
from sklearn.model_selection import KFold
import warnings
from sklearn.model_selection import cross_val_score

# Load the iris data
iris_data = pd.read_csv("iris.csv", encoding="utf-8")

# Split the iris data into labels and input features
y = iris_data.loc[:, "Name"]
x = iris_data.loc[:, ["SepalLength", "SepalWidth", "PetalLength",
                      "PetalWidth"]]

# Split into training and test sets
# x_train, x_test, y_train, y_test = train_test_split(
#     x, y, test_size=0.2, train_size=0.8, shuffle=True)

# Get all classifier algorithms
allAlgorithms = all_estimators(type_filter="classifier")
# warnings.simplefilter("error")

# K-fold cross-validation object
kfold_cv = KFold(n_splits=5, shuffle=True)

warnings.filterwarnings('ignore')

for (name, algorithm) in allAlgorithms:
    try:
        # Create an object for each algorithm
        if name == "LinearSVC":
            clf = algorithm(max_iter=10000)
        else:
            clf = algorithm()

        # Only target classes that have a score method
def test_all_estimator_no_base_class():
    # test that all_estimators doesn't find abstract classes.
    for name, Estimator in all_estimators():
        msg = ("Base estimators such as {0} should not be included"
               " in all_estimators").format(name)
        assert_false(name.lower().startswith('base'), msg=msg)
    for name, Estimator in all_estimators():
        msg = ("Base estimators such as {0} should not be included"
               " in all_estimators").format(name)
        assert not name.lower().startswith('base'), msg


def test_all_estimators():
    estimators = all_estimators(include_meta_estimators=True)

    # Meta sanity-check to make sure that the estimator introspection runs
    # properly
    assert_greater(len(estimators), 0)


@pytest.mark.parametrize('name, Estimator',
                         all_estimators(include_meta_estimators=True))
def test_parameters_default_constructible(name, Estimator):
    # Test that estimators are default-constructible
    check_parameters_default_constructible(name, Estimator)


def _tested_non_meta_estimators():
    for name, Estimator in all_estimators():
        if issubclass(Estimator, BiclusterMixin):
            continue
        if name.startswith("_"):
            continue
        yield name, Estimator


def _generate_checks_per_estimator(check_generator, estimators):
def test_estimators_sparse_data():
    # All estimators should either deal with sparse data or raise an
    # exception with type TypeError and an intelligible error message
    estimators = all_estimators(type_filter=['classifier', 'regressor'])
    for name, Estimator in estimators:
        yield check_regressors_classifiers_sparse_data, name, Estimator
start = time.time()

# Load the data
dataset = pd.read_csv(
    "C:/Users/yunom/Desktop/Output_python/Dataset_amedas.csv")
dataset = dataset.drop('Unnamed: 0', axis=1)

# Split the data into labels and input features
target_col = 'MORE_AMP'
feature_col = dataset.columns[1:]
feature_col = np.array(feature_col)
y = np.array(dataset[target_col])
x = np.array(dataset[feature_col])

# Get all regressor algorithms
allAlgorithms = all_estimators(type_filter="regressor")

list_name = []

for i in range(100):
    list_rmse_1 = []
    list_rmse_2 = []
    list_score_train = []
    list_score_test = []
    list_stdev = []

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.3,
                                                        random_state=i)
    start = time.time()
def get_model_configs(
        my_models: Union[str, List[str]],
        class_key="CLASS",
        fit_key="FIT",
        meta_key="META",
) -> Union[dict, List[dict]]:
    """Build sklearn model configuration parameters.

    Take the (full) class name of a scikit-learn model and retrieve its
    `class` and `fit` parameters and their default values. Also returns
    some useful metadata values for the class.
    """
    # get a list of all sklearn estimators
    estimators = all_estimators()

    def _get_estimator(pkg_class):
        """Find a specific class in a list of sklearn estimators."""
        my_class = pkg_class.split('.')[-1]
        return list(filter(lambda x: x[0] == my_class, estimators))[0]

    # find estimators corresponding to the my_models list
    my_estimators = []
    my_models = [my_models] if isinstance(my_models, str) else my_models
    for model in my_models:
        estimator_name, estimator_class = _get_estimator(model)
        my_estimators.append((estimator_name, estimator_class))

    # get class and fit specs
    estimator_specs = []
    for an_estimator in my_estimators:
        estimator_specs.append((
            an_estimator[0],                      # model name only
            getfullargspec(an_estimator[1]),      # class params
            getfullargspec(an_estimator[1].fit),  # fit params
            an_estimator[1]))                     # package.module.model

    model_configs = []
    for estimator in estimator_specs:
        model_json = {class_key: {}, fit_key: {}}
        for i, key in enumerate(model_json.keys()):
            f = estimator[i + 1]
            args_paired = []
            defs_paired = []

            # reverse the args since there are fewer defaults than args
            args = f.args
            args.reverse()
            n_args = len(args)

            defs = f.defaults
            if defs is None:
                defs = [defs]
            defs = list(defs)
            defs.reverse()
            n_defs = len(defs)

            n_smallest = min(n_args, n_defs)
            n_largest = max(n_args, n_defs)

            # build 2 lists that can be concatenated
            for ix in range(n_smallest):
                if args[ix] != "self":
                    args_paired.append(args[ix])
                    defs_paired.append(defs[ix])
            for ix in range(n_smallest, n_largest):
                if ix != 0 and args[ix] != "self":
                    args_paired.append(args[ix])
                    defs_paired.append(None)

            # concatenate lists into appropriate structure
            model_json[key] = dict(
                zip(reversed(args_paired), reversed(defs_paired)))

        model_json[meta_key] = {}
        model_json[meta_key]['sklearn_version'] = skversion
        model_json[meta_key]['class'] = '.'.join(
            [estimator[3].__module__, estimator[0]])
        model_configs.append(model_json)

    if len(model_configs) == 1:
        # do we want to log this modified model as an artifact?
        return model_configs[0]
    # do we want to log this modified model as an artifact?
    return model_configs
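# Hedged usage sketch (not from the original source); the printed defaults
# depend on the installed scikit-learn version.
config = get_model_configs("sklearn.linear_model.LogisticRegression")
print(config["META"]["class"])         # full class path
print(config["CLASS"].get("penalty"))  # an __init__ default, e.g. 'l2'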
def test_all_estimators():
    estimators = all_estimators(include_meta_estimators=True)

    # Meta sanity-check to make sure that the estimator introspection runs
    # properly
    assert_greater(len(estimators), 0)
def discover_clustering():
    return [
        make_module(name, Est, "clustering")
        for (name, Est) in all_estimators(type_filter="cluster")
    ]
def test_transformers():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == 'SelectKBest':
            # SelectKBest has a default of k=10,
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be
            # greater than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1
        elif name == "MiniBatchDictionaryLearning":
            transformer.set_params(n_iter=5)  # default = 1000
        elif name == "KernelPCA":
            transformer.remove_zero_eig = False

        # fit
        if name in ('_PLS', 'PLSCanonical', 'PLSRegression', 'CCA',
                    'PLSSVD'):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y,
                            2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            transformer.fit(X, y_)
            X_pred = transformer.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(transformer)
            print(e)
            print()
            succeeded = False
            continue

        if hasattr(transformer, 'transform'):
            if name in ('_PLS', 'PLSCanonical', 'PLSRegression', 'CCA',
                        'PLSSVD'):
                X_pred2 = transformer.transform(X, y_)
                X_pred3 = transformer.fit_transform(X, y=y_)
            else:
                X_pred2 = transformer.transform(X)
                X_pred3 = transformer.fit_transform(X, y=y_)

            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2,
                                                    X_pred3):
                    assert_array_almost_equal(
                        x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Transformer)
                    assert_array_almost_equal(
                        x_pred3, x_pred2, 2,
                        "fit_transform not correct in %s" % Transformer)
            else:
                assert_array_almost_equal(
                    X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Transformer)
                assert_array_almost_equal(
                    X_pred3, X_pred2, 2,
                    "fit_transform not correct in %s" % Transformer)

            # raises error on malformed input for transform
            assert_raises(ValueError, transformer.transform, X.T)

    assert_true(succeeded)