예제 #1
def check_classifiers_classes(name, classifier_orig):
    # Run check_classifiers_predictions on a binary problem (and on a
    # multiclass one unless the estimator is tagged 'binary_only') using
    # string labels, object-dtype labels and finally [-1, 1] labels.
    #
    # name: estimator name; selects the dataset branch below and is forwarded
    #     to the helper checks.
    # classifier_orig: classifier instance under test (cloned in the
    #     'ShapeletModel' branch before max_iter is overridden).
    #
    # Raises SkipTest when name is 'SerializableShapeletModel'.
    #
    # Case of shapelet models
    if name == 'SerializableShapeletModel':
        raise SkipTest('Skipping check_classifiers_classes for shapelets'
                       ' due to convergence issues...')
    elif name == 'ShapeletModel':
        X_multiclass, y_multiclass = _create_large_ts_dataset()
        classifier_orig = clone(classifier_orig)
        classifier_orig.max_iter = 1000
        # NOTE(review): as written, the next line overwrites the large dataset
        # loaded above and no dataset is created for other names; an `else:`
        # line appears to be missing from this snippet -- confirm upstream.
        X_multiclass, y_multiclass = _create_small_ts_dataset()

    # NOTE(review): the shuffle(...) call below is never closed in this
    # snippet; its remaining arguments are not visible here.
    X_multiclass, y_multiclass = shuffle(X_multiclass,

    scaler = TimeSeriesScalerMeanVariance()
    X_multiclass = scaler.fit_transform(X_multiclass)

    # Keep only the first two axes of the scaled data (2D array).
    X_multiclass = np.reshape(X_multiclass,
                              (X_multiclass.shape[0], X_multiclass.shape[1]))

    # Build the binary problem by dropping every sample labelled 2.
    X_binary = X_multiclass[y_multiclass != 2]
    y_binary = y_multiclass[y_multiclass != 2]

    X_multiclass = pairwise_estimator_convert_X(X_multiclass, classifier_orig)
    X_binary = pairwise_estimator_convert_X(X_binary, classifier_orig)

    labels_multiclass = ["one", "two", "three"]
    labels_binary = ["one", "two"]

    # Map the integer targets onto the string label names.
    y_names_multiclass = np.take(labels_multiclass, y_multiclass)
    y_names_binary = np.take(labels_binary, y_binary)

    problems = [(X_binary, y_binary, y_names_binary)]

    if not classifier_orig._get_tags()['binary_only']:
        problems.append((X_multiclass, y_multiclass, y_names_multiclass))

    for X, y, y_names in problems:
        # Try both the native string dtype and an object-dtype copy.
        for y_names_i in [y_names, y_names.astype('O')]:
            y_ = choose_check_classifiers_labels(name, y, y_names_i)
            check_classifiers_predictions(X, y_, name, classifier_orig)

    # Repeat the binary check with numeric labels -1 / 1.
    labels_binary = [-1, 1]
    y_names_binary = np.take(labels_binary, y_binary)
    y_binary = choose_check_classifiers_labels(name, y_binary, y_names_binary)
    check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)
예제 #2
def check_estimators_overwrite_params(name, estimator_orig):
    # Fit a clone of the estimator and, for every parameter returned by
    # get_params(), compute joblib hashes of the value before and after fit.
    #
    # name: estimator name, used in the failure message.
    # estimator_orig: estimator instance under test; only a clone is fitted.
    X, y = make_blobs(random_state=0, n_samples=9)
    # Collapse the blob labels into a 0/1 target.
    y = (y > 1).astype(int)
    # some want non-negative input
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)


    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        #
        # NOTE(review): the lines below are the arguments of a call whose
        # opening line is missing from this snippet (presumably an equality
        # assertion on the two hashes) -- confirm against the upstream file.
            _joblib.hash(new_value), _joblib.hash(original_value),
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit." %
            (name, param_name, original_value, new_value))
예제 #3
def check_classifiers_classes(name, classifier_orig):
    # Run check_classifiers_predictions on a binary problem using string
    # labels, object-dtype labels and finally [-1, 1] labels.
    #
    # name: estimator name, forwarded to the helper checks.
    # classifier_orig: classifier instance under test.
    #
    # NOTE(review): the make_blobs(...) and shuffle(...) calls on the next two
    # lines are never closed in this snippet; their remaining arguments are
    # not visible here.
    X_multiclass, y_multiclass = make_blobs(n_samples=30,
    X_multiclass, y_multiclass = shuffle(X_multiclass,
    X_multiclass = StandardScaler().fit_transform(X_multiclass)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X_multiclass -= X_multiclass.min() - .1

    # Build the binary problem by dropping every sample labelled 2.
    X_binary = X_multiclass[y_multiclass != 2]
    y_binary = y_multiclass[y_multiclass != 2]

    X_binary = pairwise_estimator_convert_X(X_binary, classifier_orig)

    labels_binary = ["one", "two"]

    # Map the integer targets onto the string label names.
    y_names_binary = np.take(labels_binary, y_binary)

    for X, y, y_names in [(X_binary, y_binary, y_names_binary)]:
        # Try both the native string dtype and an object-dtype copy.
        for y_names_i in [y_names, y_names.astype('O')]:
            y_ = choose_check_classifiers_labels(name, y, y_names_i)
            check_classifiers_predictions(X, y_, name, classifier_orig)

    # Repeat the binary check with numeric labels -1 / 1.
    labels_binary = [-1, 1]
    y_names_binary = np.take(labels_binary, y_binary)
    y_binary = choose_check_classifiers_labels(name, y_binary, y_names_binary)
    check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)
예제 #4
def check_supervised_y_2d(name, estimator_orig):
    """Check that a column-vector ``y`` warns and yields the same predictions.

    A clone of ``estimator_orig`` is fitted once with a 1d target and once
    with the same target reshaped to a single column. The second fit must
    emit at least one warning whose text mentions the column-vector
    ``DataConversionWarning``, and both fits must predict the same values.

    Parameters
    ----------
    name : str
        Estimator name (not used by this check).
    estimator_orig : estimator
        Estimator instance under test; only a clone is fitted.
    """
    rng = np.random.RandomState(0)
    features = pairwise_estimator_convert_X(rng.uniform(size=(10, 3)),
                                            estimator_orig)
    targets = np.arange(10) % 2
    model = clone(estimator_orig)

    # Reference run with the 1d target.
    model.fit(features, targets)
    pred_1d = model.predict(features)

    # Second run with a column vector, recording every warning except
    # RuntimeWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        model.fit(features, targets[:, np.newaxis])
    pred_2d = model.predict(features)

    caught_text = ", ".join(str(entry) for entry in caught)
    msg = "expected 1 DataConversionWarning, got: %s" % caught_text

    # At least one warning must have been raised, and it must be the
    # column-vector conversion warning.
    assert_greater(len(caught), 0, msg)
    expected_fragment = ("DataConversionWarning('A column-vector y"
                         " was passed when a 1d array was expected")
    assert expected_fragment in msg
    assert_allclose(pred_1d.ravel(), pred_2d.ravel())
예제 #5
def check_regressor_data_not_an_array(name, estimator_orig):
    # Run check_estimators_data_not_an_array for a regressor on a 50-sample
    # subset returned by _boston_subset.
    #
    # name: estimator name, tested against the list below and forwarded.
    # estimator_orig: regressor instance under test.
    #
    # NOTE(review): the body of the `if` below is missing from this snippet,
    # so what happens for 'TimeSeriesSVR' cannot be told from here.
    if name in ['TimeSeriesSVR']:
    X, y = _boston_subset(n_samples=50)
    X = pairwise_estimator_convert_X(X, estimator_orig)
    y = enforce_estimator_tags_y(estimator_orig, y)
    check_estimators_data_not_an_array(name, estimator_orig, X, y)
예제 #6
def check_fit_score_takes_y(name, estimator_orig):
    """Check that fit-like and score methods take ``y`` as second argument.

    Every one of ``fit``, ``score``, ``partial_fit``, ``fit_predict`` and
    ``fit_transform`` that the estimator exposes is called with ``(X, y)``
    and its signature is inspected, so the estimator can be used in
    pipelines.

    Parameters
    ----------
    name : str
        Estimator name (not used by this check).
    estimator_orig : estimator
        Estimator instance under test; only a clone is used.
    """
    rng = np.random.RandomState(0)
    data = pairwise_estimator_convert_X(rng.uniform(size=(10, 3)),
                                        estimator_orig)
    estimator = clone(estimator_orig)
    target = multioutput_estimator_convert_y_2d(estimator, np.arange(10) % 2)

    for func_name in ("fit", "score", "partial_fit", "fit_predict",
                      "fit_transform"):
        method = getattr(estimator, func_name, None)
        if method is None:
            continue

        method(data, target)
        args = [param.name
                for param in signature(method).parameters.values()]
        # if_delegate_has_method makes methods into functions
        # with an explicit "self", so need to shift arguments
        if args[0] == "self":
            args = args[1:]
        assert args[1] in ["y", "Y"], (
            "Expected y or Y as second argument for method "
            "%s of %s. Got arguments: %r." %
            (func_name, type(estimator).__name__, args))
예제 #7
def check_methods_subset_invariance(name, estimator_orig):
    # check that method gives invariant results if applied
    # on mini batches or the whole set
    #
    # name: estimator name, used in the failure message.
    # estimator_orig: estimator instance under test; only a clone is fitted.
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = pairwise_estimator_convert_X(X, estimator_orig)
    # NOTE(review): np.int is a deprecated NumPy alias (removed in recent
    # NumPy releases) -- confirm the NumPy version this file targets.
    y = (X[:, 0] > 1).astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    # Force a single component / cluster when the estimator exposes one.
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    estimator.fit(X, y)

    # NOTE(review): the method list below is never closed in this snippet;
    # its remaining entries and the closing `]:` are not visible here.
    for method in [
            "predict", "transform", "decision_function", "score_samples",

        msg = ("{method} of {name} is not invariant when applied "
               "to a subset.").format(method=method, name=name)

        if hasattr(estimator, method):
            # NOTE(review): the comparison of the two results is not part of
            # this snippet; `msg` is presumably used there -- confirm.
            result_full, result_by_batch = _apply_on_subsets(
                getattr(estimator, method), X)
예제 #8
def check_dtype_object(name, estimator_orig):
    # check that estimators treat dtype object as numeric if possible
    #
    # name: estimator name (not used in the visible lines).
    # estimator_orig: estimator instance under test; only a clone is fitted.
    rng = np.random.RandomState(0)
    X = pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig)
    X = X.astype(object)
    # NOTE(review): np.int is a deprecated NumPy alias (removed in recent
    # NumPy releases) -- confirm the NumPy version this file targets.
    y = (X[:, 0] * 2).astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    estimator.fit(X, y)
    # NOTE(review): the bodies of the two `if` statements below are missing
    # from this snippet.
    if hasattr(estimator, "predict"):

    if hasattr(estimator, "transform"):

    # NOTE(review): the `try:` line that pairs with the `except` below is
    # missing from this snippet, as is the body of the inner `if`.
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):

    # A dict entry cannot be converted to a number, so fit must raise a
    # TypeError whose message matches the pattern below.
    X[0, 0] = {'foo': 'bar'}
    msg = "argument must be a string.* number"
    assert_raises_regex(TypeError, msg, estimator.fit, X, y)
예제 #9
def check_pipeline_consistency(name, estimator_orig):
    """Check that ``make_pipeline(est)`` gives the same results as ``est``.

    The estimator clone and a pipeline wrapping it are both fitted on the
    same blobs; ``score`` and ``fit_transform`` (when the estimator has
    them) must then return matching results.

    Parameters
    ----------
    name : str
        Estimator name, used in the skip message.
    estimator_orig : estimator
        Estimator instance under test; only a clone is fitted.

    Raises
    ------
    SkipTest
        If the estimator is tagged ``non_deterministic``.
    """
    if estimator_orig._get_tags()['non_deterministic']:
        raise SkipTest(name + ' is non deterministic')

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    # Shift the data in place so its minimum is zero.
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    for func_name in ("score", "fit_transform"):
        estimator_func = getattr(estimator, func_name, None)
        if estimator_func is None:
            continue
        pipeline_func = getattr(pipeline, func_name)
        direct_result = estimator_func(X, y)
        piped_result = pipeline_func(X, y)
        assert_allclose_dense_sparse(direct_result, piped_result)
예제 #10
def check_fit_idempotent(name, estimator_orig):
    # Check that est.fit(X) is the same as est.fit(X).fit(X). Ideally we would
    # check that the estimated parameters during training (e.g. coefs_) are
    # the same, but having a universal comparison function for those
    # attributes is difficult and full of edge cases. So instead we check that
    # predict(), predict_proba(), decision_function() and transform() return
    # the same results.
    #
    # name: estimator name, used in the skip message.
    # estimator_orig: estimator instance under test; only a clone is fitted.
    #
    # Raises SkipTest when the estimator is tagged 'non_deterministic'.

    # NOTE(review): the list below is never closed in this snippet; its
    # remaining entries are not visible here.
    check_methods = ["predict", "transform", "decision_function",
    rng = np.random.RandomState(0)

    if estimator_orig._get_tags()['non_deterministic']:
        msg = name + ' is non deterministic'
        raise SkipTest(msg)

    estimator = clone(estimator_orig)
    # NOTE(review): the body of this `if` is missing from this snippet.
    if 'warm_start' in estimator.get_params().keys():

    n_samples = 100
    X, _ = _create_small_ts_dataset()
    # Keep only the first two axes of the dataset (2D array).
    X = X.reshape((X.shape[0], X.shape[1]))
    X = pairwise_estimator_convert_X(X, estimator)
    if is_regressor(estimator_orig):
        y = rng.normal(size=n_samples)
        # NOTE(review): as written this overwrites the regression target and
        # leaves y undefined for non-regressors; an `else:` line appears to
        # be missing from this snippet -- confirm upstream.
        y = rng.randint(low=0, high=2, size=n_samples)

    train, test = next(ShuffleSplit(test_size=.2, random_state=rng).split(X))
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Fit for the first time
    estimator.fit(X_train, y_train)

    # Record the output of every available method after the first fit.
    result = {method: getattr(estimator, method)(X_test)
              for method in check_methods
              if hasattr(estimator, method)}

    # Fit again
    estimator.fit(X_train, y_train)

    for method in check_methods:
        if hasattr(estimator, method):
            new_result = getattr(estimator, method)(X_test)
            if np.issubdtype(new_result.dtype, np.floating):
                tol = 2*np.finfo(new_result.dtype).eps
                # NOTE(review): an `else:` line appears to be missing before
                # the next assignment, and the remaining lines are arguments
                # of a comparison call whose opening line and closing
                # parenthesis are missing from this snippet.
                tol = 2*np.finfo(np.float64).eps
                result[method], new_result,
                atol=max(tol, 1e-9), rtol=max(tol, 1e-7),
                err_msg="Idempotency check failed for method {}".format(method)
예제 #11
def check_dont_overwrite_parameters(name, estimator_orig):
    # check that fit method only changes or sets private attributes
    #
    # name: estimator name (not used in the visible lines).
    # estimator_orig: estimator instance under test; only a clone is fitted.
    if hasattr(estimator_orig.__init__, "deprecated_original"):
        # to not check deprecated classes
        # NOTE(review): the statement that belongs in this `if` body is
        # missing from this snippet.
    estimator = clone(estimator_orig)
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = pairwise_estimator_convert_X(X, estimator_orig)
    # NOTE(review): np.int is a deprecated NumPy alias (removed in recent
    # NumPy releases) -- confirm the NumPy version this file targets.
    y = (X[:, 0] < 2).astype(np.int)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    # Force a single component / cluster when the estimator exposes one.
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    # Shallow copy, so values can later be compared by identity.
    dict_before_fit = estimator.__dict__.copy()
    estimator.fit(X, y)

    dict_after_fit = estimator.__dict__

    # NOTE(review): the closing `]` of this list comprehension (and of the
    # two below) is missing from this snippet.
    public_keys_after_fit = [
        key for key in dict_after_fit.keys() if is_public_parameter(key)

    attrs_added_by_fit = [
        key for key in public_keys_after_fit
        if key not in dict_before_fit.keys()

    # check that fit doesn't add any public attribute
    assert not attrs_added_by_fit, (
        'Estimator adds public attribute(s) during'
        ' the fit method.'
        ' Estimators are only allowed to add private attributes'
        ' either started with _ or ended'
        ' with _ but %s added' % ', '.join(attrs_added_by_fit))

    # check that fit doesn't change any public attribute
    # (identity comparison: a public attribute rebound to a new object counts
    # as changed)
    attrs_changed_by_fit = [
        key for key in public_keys_after_fit
        if (dict_before_fit[key] is not dict_after_fit[key])

    assert not attrs_changed_by_fit, (
        'Estimator changes public attribute(s) during'
        ' the fit method. Estimators are only allowed'
        ' to change attributes started'
        ' or ended with _, but'
        ' %s changed' % ', '.join(attrs_changed_by_fit))
예제 #12
def check_regressor_data_not_an_array(name, estimator_orig):
    # Run check_estimators_data_not_an_array for a regressor once per object
    # type ("NotAnArray" and "PandasDataframe") on a 50-sample subset returned
    # by _boston_subset.
    #
    # name: estimator name, tested against the list below and forwarded.
    # estimator_orig: regressor instance under test.
    #
    # NOTE(review): the body of the `if` below is missing from this snippet,
    # so what happens for 'TimeSeriesSVR' cannot be told from here.
    if name in ['TimeSeriesSVR']:
    X, y = _boston_subset(n_samples=50)
    X = pairwise_estimator_convert_X(X, estimator_orig)
    y = enforce_estimator_tags_y(estimator_orig, y)
    for obj_type in ["NotAnArray", "PandasDataframe"]:
        if obj_type == "PandasDataframe":
            X_ = X[:, :, 0]  # pandas df cant be 3d
            # NOTE(review): as written the next line overwrites the 2D slice;
            # an `else:` line appears to be missing from this snippet.
            X_ = X
        # NOTE(review): this call is never closed in this snippet; its
        # remaining arguments are not visible here.
        check_estimators_data_not_an_array(name, estimator_orig, X_, y,
예제 #13
def check_sample_weights_list(name, estimator_orig):
    # check that estimators will accept a 'sample_weight' parameter of
    # type list in the 'fit' function.
    #
    # name: estimator name (not used in the visible lines).
    # estimator_orig: estimator instance under test; only a clone is fitted.
    #
    # Estimators whose fit has no 'sample_weight' parameter are not exercised.
    if has_fit_parameter(estimator_orig, "sample_weight"):
        estimator = clone(estimator_orig)
        rnd = np.random.RandomState(0)
        # NOTE(review): this call is never closed in this snippet; its second
        # argument is not visible here.
        X = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
        y = np.arange(10) % 2
        y = multioutput_estimator_convert_y_2d(estimator, y)
        # One weight per sample, given as a plain Python list.
        sample_weight = [3] * 10
        # Test that estimators don't raise any exception
        estimator.fit(X, y, sample_weight=sample_weight)
예제 #14
def check_estimators_fit_returns_self(name, estimator_orig,
    # NOTE(review): the end of the signature is missing from this snippet;
    # `readonly_memmap`, used below, is presumably declared there -- confirm.
    """Check if self is returned when calling fit"""
    # name: estimator name (not used in the visible lines).
    # estimator_orig: estimator instance under test; only a clone is fitted.
    X, y = make_blobs(random_state=0, n_samples=9, n_features=4)
    # Collapse the blob labels into a 0/1 target.
    y = (y > 1).astype(int)
    # some want non-negative input
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig)

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    # Optionally replace the inputs with memmap-backed data.
    if readonly_memmap:
        X, y = create_memmap_backed_data([X, y])

    # fit must return the very same object it was called on.
    assert estimator.fit(X, y) is estimator
예제 #15
def check_fit2d_predict1d(name, estimator_orig):
    # check by fitting a 2d array and predicting with a 1d array
    #
    # name: estimator name (not used in the visible lines).
    # estimator_orig: estimator instance under test; only a clone is fitted.
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    X = pairwise_estimator_convert_X(X, estimator_orig)
    # NOTE(review): np.int is a deprecated NumPy alias (removed in recent
    # NumPy releases) -- confirm the NumPy version this file targets.
    y = (X[:, 0] > 1).astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    # Force a single component / cluster when the estimator exposes one.
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    estimator.fit(X, y)

    # NOTE(review): the method list below is never closed in this snippet;
    # its remaining entries and the closing `]:` are not visible here.
    for method in ["predict", "transform", "decision_function",
        if hasattr(estimator, method):
            # Passing a single row (X[0]) must raise a ValueError whose
            # message contains "Reshape your data".
            assert_raise_message(ValueError, "Reshape your data",
                                 getattr(estimator, method), X[0])
예제 #16
def check_classifiers_train(name, classifier_orig, readonly_memmap=False,
    # Train a classifier on a binary and a multiclass time-series problem and
    # check the shapes and mutual consistency of predict, decision_function,
    # predict_proba and predict_log_proba, plus error handling on malformed
    # input (skipped when the estimator is tagged 'no_validation').
    #
    # name: estimator name; selects the dataset branch and appears in
    #     failure messages.
    # classifier_orig: classifier instance under test; clones are fitted.
    # readonly_memmap: not used in the visible lines.
    #
    # NOTE(review): the end of the signature is missing from this snippet;
    # `X_dtype`, used below, is presumably declared there -- confirm. Several
    # other lines (`else:`, `try:`, call bodies, closing brackets) are also
    # missing further down; they are flagged where they occur.
    #
    # Case of shapelet models
    if name in ['LearningShapelets', 'TimeSeriesMLPClassifier']:
        X_m, y_m = _create_large_ts_dataset()
        classifier_orig = clone(classifier_orig)
        classifier_orig.max_iter = 1000
        # NOTE(review): as written the next line overwrites the large dataset
        # and no dataset is created for other names; an `else:` line appears
        # to be missing from this snippet.
        X_m, y_m = _create_small_ts_dataset()
    X_m = X_m.astype(X_dtype)

    X_m, y_m = shuffle(X_m, y_m, random_state=7)

    X_m = TimeSeriesScalerMeanVariance().fit_transform(X_m)

    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]

    # We will test for both binary and multiclass case
    problems = [(X_b, y_b), (X_m, y_m)]

    tags = classifier_orig._get_tags()

    for (X, y) in problems:
        classes = np.unique(y)
        n_classes = len(classes)
        # X is unpacked as three axes here, i.e. it is a 3D array.
        n_samples, n_features, dim = X.shape
        classifier = clone(classifier_orig)
        X = pairwise_estimator_convert_X(X, classifier)

        # raises error on malformed input for fit
        if not tags["no_validation"]:
            # NOTE(review): the first argument line of assert_raises (the
            # expected exception type) is missing from this snippet.
            with assert_raises(
                msg="The classifier {} does not "
                    "raise an error when incorrect/malformed input "
                    "data for fit is passed. The number of training "
                    "examples is not the same as the number of labels. "
                    "Perhaps use check_X_y in fit.".format(name)):
                classifier.fit(X, y[:-1])

        # fit with lists
        classifier.fit(X.tolist(), y.tolist())
        assert hasattr(classifier, "classes_")
        y_pred = classifier.predict(X)

        assert y_pred.shape == (n_samples,)

        # training set performance
        if not tags['poor_score']:
            assert accuracy_score(y, y_pred) > 0.83

        # raises error on malformed input for predict
        msg_pairwise = (
            "The classifier {} does not raise an error when shape of X in "
            " {} is not equal to (n_test_samples, n_training_samples)")
        # NOTE(review): the end of this message literal and its closing
        # parenthesis are missing from this snippet.
        msg = ("The classifier {} does not raise an error when the number of "
               "features in {} is different from the number of features in "

        if not tags["no_validation"]:
            if bool(getattr(classifier, "_pairwise", False)):
                with assert_raises(ValueError,
                                   msg=msg_pairwise.format(name, "predict")):
                    classifier.predict(X.reshape(-1, 1))
                # NOTE(review): an `else:` line and the body of the first
                # `with` below appear to be missing from this snippet.
                if not tags["allow_variable_length"]:
                    with assert_raises(ValueError,
                                       msg=msg.format(name, "predict")):
                    with assert_raises(ValueError,
                                       msg=msg.format(name, "predict")):
                        classifier.predict(X.reshape((-1, 5, 2)))
        if hasattr(classifier, "decision_function"):
                # NOTE(review): the `try:` line pairing with the `except`
                # further down is missing from this snippet.
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    if not tags["multioutput_only"]:
                        assert decision.shape == (n_samples,)
                        # NOTE(review): `else:` lines appear to be missing
                        # before the next assertion and before the
                        # multiclass assertions below.
                        assert decision.shape == (n_samples, 1)
                    # NOTE(review): np.int is a deprecated NumPy alias.
                    dec_pred = (decision.ravel() > 0).astype(np.int)
                    assert_array_equal(dec_pred, y_pred)
                    assert decision.shape == (n_samples, n_classes)
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                if not tags["no_validation"]:
                    error_msg = msg_pairwise.format(name, "decision_function")
                    if bool(getattr(classifier, "_pairwise", False)):
                        with assert_raises(ValueError, msg=error_msg):
                            classifier.decision_function(X.reshape(-1, 1))
                        # NOTE(review): an `else:` line and the calls inside
                        # the `with` blocks below are missing from this
                        # snippet; only one argument expression survives.
                        if not tags["allow_variable_length"]:
                            with assert_raises(ValueError, msg=error_msg):
                            with assert_raises(ValueError, msg=error_msg):
                                    X.reshape((-1, 5, 2))
            # NOTE(review): the body of this handler is missing from this
            # snippet.
            except NotImplementedError:

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert y_prob.shape == (n_samples, n_classes)
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            # NOTE(review): this call is never closed in this snippet; its
            # second argument is not visible here.
            assert_array_almost_equal(np.sum(y_prob, axis=1),
            if not tags["no_validation"]:
                # raises error on malformed input for predict_proba
                if bool(getattr(classifier_orig, "_pairwise", False)):
                    with assert_raises(ValueError, msg=msg_pairwise.format(
                            name, "predict_proba")):
                        classifier.predict_proba(X.reshape(-1, 1))
                    # NOTE(review): an `else:` line and the calls inside the
                    # `with` blocks below are missing from this snippet.
                    if not tags["allow_variable_length"]:
                        with assert_raises(ValueError, msg=msg.format(
                                name, "predict_proba")):
                        with assert_raises(ValueError, msg=msg.format(
                                name, "predict_proba")):
                                X.reshape((-1, 5, 2))
            if hasattr(classifier, "predict_log_proba"):
                # predict_log_proba is a transformation of predict_proba
                y_log_prob = classifier.predict_log_proba(X)
                assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9)
                assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
def test_pairwise_estimator_convert_X():
    """Calling pairwise_estimator_convert_X must emit a DeprecationWarning.

    The warning message has to match "removed in version 0.24".
    """
    expected_pattern = "removed in version 0.24"
    with pytest.warns(DeprecationWarning, match=expected_pattern):
        pairwise_estimator_convert_X([[1, 2]], DummyClassifier())
예제 #18
def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
    # Train a classifier on a binary blobs problem and check the shapes and
    # mutual consistency of predict, decision_function, predict_proba and
    # predict_log_proba, plus error handling on malformed input.
    #
    # name: estimator name, used in failure messages.
    # classifier_orig: classifier instance under test; a clone is fitted.
    # readonly_memmap: when true, the binary data is replaced with
    #     memmap-backed data before fitting.
    #
    # NOTE(review): several lines (`try:`, `else:`, call bodies, closing
    # brackets) are missing from this snippet; they are flagged where they
    # occur.
    X_m, y_m = make_blobs(n_samples=300, random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]

    if readonly_memmap:
        X_b, y_b = create_memmap_backed_data([X_b, y_b])

    for (X, y) in [(X_b, y_b)]:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, _ = X.shape
        classifier = clone(classifier_orig)
        X = pairwise_estimator_convert_X(X, classifier)
        y = multioutput_estimator_convert_y_2d(classifier, y)

        # raises error on malformed input for fit
        with assert_raises(ValueError,
                           msg="The classifier {} does not "
                           "raise an error when incorrect/malformed input "
                           "data for fit is passed. The number of training "
                           "examples is not the same as the number of labels. "
                           "Perhaps use check_X_y in fit.".format(name)):
            classifier.fit(X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert hasattr(classifier, "classes_")
        y_pred = classifier.predict(X)

        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(accuracy_score(y, y_pred), 0.83)

        # raises error on malformed input for predict
        # NOTE(review): the end of this message literal and its closing
        # parenthesis are missing from this snippet.
        msg = ("The classifier {} does not raise an error when the number of "
               "features in {} is different from the number of features in "

        # NOTE(review): the body of this `with` is missing from this snippet.
        with assert_raises(ValueError, msg=msg.format(name, "predict")):
        if hasattr(classifier, "decision_function"):
                # NOTE(review): the `try:` line pairing with the `except`
                # further down is missing from this snippet.
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples, 1))
                    # NOTE(review): np.int is a deprecated NumPy alias.
                    dec_pred = (decision.ravel() > 0).astype(np.int)
                    assert_array_equal(dec_pred, y_pred)
                    # NOTE(review): an `else:` line appears to be missing
                    # before the multiclass assertions below.
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                # NOTE(review): the body of this `with` is missing from this
                # snippet.
                with assert_raises(ValueError,
                                   msg=msg.format(name, "decision_function")):
            # NOTE(review): the body of this handler is missing from this
            # snippet.
            except NotImplementedError:

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            # NOTE(review): this call is never closed in this snippet; its
            # second argument is not visible here.
            assert_array_almost_equal(np.sum(y_prob, axis=1),
            # raises error on malformed input for predict_proba
            # NOTE(review): the body of this `with` is missing from this
            # snippet.
            with assert_raises(ValueError,
                               msg=msg.format(name, "predict_proba")):
            if hasattr(classifier, "predict_log_proba"):
                # predict_log_proba is a transformation of predict_proba
                y_log_prob = classifier.predict_log_proba(X)
                assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9)
                assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))