Example #1
def test_cross_validation_with_scorer_object_regressor():
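    # Passing a scorer object from SCORERS must behave exactly like passing its string name.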
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    forest = RandomForestRegressor(n_estimators=10, random_state=123)
    result1 = cross_validation(forest,
                               X,
                               y,
                               scoring=SCORERS["neg_mean_absolute_error"],
                               cv=10)
    assert result1.shape[0] == 10
    assert isinstance(result1, pd.DataFrame)

    forest = RandomForestRegressor(n_estimators=10, random_state=123)
    result2 = cross_validation(forest,
                               X,
                               y,
                               scoring="neg_mean_absolute_error",
                               cv=10)
    assert result2.shape[0] == 10
    assert isinstance(result2, pd.DataFrame)

    assert np.abs(result1.iloc[:, 0] - result2.iloc[:, 0]).max() <= 10**(-5)
    assert np.abs(result1.iloc[:, 1] - result2.iloc[:, 1]).max() <= 10**(-5)
Example #2
def test_cross_validation_regressor_multi_output(cast_data_frame):
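    # Multi-output regression: y is duplicated into two columns; predictions must keep the 2-D shape.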

    estimator = RandomForestRegressor(n_estimators=10, random_state=123)

    X, y = make_regression(n_samples=10)
    yd2 = np.concatenate((y.reshape((-1, 1)), y.reshape((-1, 1))), axis=1)

    if cast_data_frame:
        yd2 = pd.DataFrame(yd2)

    cv_res = cross_validation(estimator, X, yd2, cv=2, scoring="r2")
    assert cv_res.shape[0] == 2
    assert isinstance(cv_res, pd.DataFrame)
    assert "test_r2" in cv_res.columns
    assert "train_r2" in cv_res.columns

    cv_res, yhat = cross_validation(estimator,
                                    X,
                                    yd2,
                                    cv=2,
                                    scoring="r2",
                                    return_predict=True,
                                    method="predict")

    assert cv_res.shape[0] == 2
    assert isinstance(cv_res, pd.DataFrame)
    assert "test_r2" in cv_res.columns
    assert "train_r2" in cv_res.columns
    assert isinstance(yhat, np.ndarray)
    assert yhat.shape == yd2.shape
Example #3
def test_cross_validation_with_max_proba_accuracy():
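    # A custom group-aware scorer used with GroupKFold; the resulting accuracy-like scores must stay in [0, 1].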
    np.random.seed(123)
    cv = GroupKFold(n_splits=4)

    max_proba_scorer = _GroupProbaScorer(score_func=max_proba_group_accuracy, sign=1, kwargs={})

    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)
    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)

    estimator = LogisticRegression(solver="lbfgs", random_state=123)

    cv_res = cross_validation(estimator, X, y, groups, scoring=max_proba_scorer, cv=cv)

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape == (4, 6)

    cv_res = cross_validation(estimator, X, y, groups, scoring={"mp_acc": max_proba_scorer}, cv=cv)

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape == (4, 6)
    assert "train_mp_acc" in cv_res.columns
    assert "test_mp_acc" in cv_res.columns
    assert cv_res["train_mp_acc"].max() <= 1
    assert cv_res["train_mp_acc"].min() >= 0

    assert cv_res["test_mp_acc"].max() <= 1
    assert cv_res["test_mp_acc"].min() >= 0
Example #4
def test_approx_cross_validation_dummy(approximate_cv):
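    # DummyModel simply predicts X[:, 0]; both the exact and the approximate CV paths must reproduce it.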

    X, y = make_classification(n_samples=100, random_state=123)

    estimator = DummyModel()
    cv_res, yhat = cross_validation(estimator,
                                    X,
                                    y,
                                    cv=10,
                                    no_scoring=True,
                                    return_predict=True,
                                    method="predict",
                                    approximate_cv=approximate_cv)

    assert yhat.ndim == 1
    assert np.abs(yhat - X[:, 0]).max() <= 10**(-5)

    estimator = DummyModel()
    cv_res, yhat = cross_validation(estimator,
                                    X,
                                    y,
                                    cv=10,
                                    no_scoring=False,
                                    return_predict=True,
                                    method="predict",
                                    approximate_cv=approximate_cv)

    assert yhat.ndim == 1
    assert np.abs(yhat - X[:, 0]).max() <= 10**(-5)
Example #5
def test_approx_cross_validation_fit_params(approximate_cv):
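    # DummyModelCheckFitParams raises when it doesn't receive its fit params, so this
    # checks that fit_params are actually forwarded to fit.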
    X, y = make_classification(n_samples=100, random_state=123)

    estimator = DummyModelCheckFitParams()
    with pytest.raises(AssertionError):
        cv_res, yhat = cross_validation(
            estimator,
            X,
            y,
            cv=10,
            no_scoring=True,
            return_predict=True,
            method="predict",
            approximate_cv=approximate_cv,
        )

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        cv=10,
        no_scoring=True,
        return_predict=True,
        method="predict",
        fit_params={"param": "value"},
        approximate_cv=approximate_cv,
    )
Example #6
def test_approx_cross_validation_raise_error(approximate_cv):
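    # Invalid argument combinations must raise instead of silently doing nothing.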

    X, y = make_classification(n_samples=100, random_state=123)

    estimator = DummyModel()
    # no_scoring=True AND return_predict=False => nothing to do ... error
    with pytest.raises(ValueError):
        cv_res, yhat = cross_validation(
            estimator,
            X,
            y,
            cv=10,
            no_scoring=True,
            return_predict=False,
            method="predict",
            approximate_cv=approximate_cv,
        )

    # method="transform" raises because DummyModel has no transform method
    estimator = DummyModel()
    with pytest.raises(AttributeError):
        cv_res, yhat = cross_validation(
            estimator,
            X,
            y,
            cv=10,
            no_scoring=True,
            return_predict=True,
            method="transform",
            approximate_cv=approximate_cv,
        )
Example #7
def test_approx_cross_validation_pass_kwargs():
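    # Extra keyword arguments must be forwarded to the estimator's own approximate-CV
    # method; DummyModelWithApprox(check_kwargs=True) raises if they are missing.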
    X, y = make_classification(n_samples=100, random_state=123)

    estimator = DummyModelWithApprox(check_kwargs=True)

    with pytest.raises(AssertionError):
        cv_res, yhat = cross_validation(
            estimator,
            X,
            y,
            cv=10,
            no_scoring=True,
            return_predict=True,
            method="predict",
            fit_params={"param": "value"},
            approximate_cv=True,
        )
        # error because kwargs not passed

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        cv=10,
        no_scoring=True,
        return_predict=True,
        method="predict",
        fit_params={"param": "value"},
        kwargs_param="kwargs_value",
        approximate_cv=True,
    )
Example #8
def test_cross_validation_with_scorer_object_classifier():
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.array(["A"] * 33 + ["B"] * 33 + ["C"] * 34)
    forest = RandomForestClassifier(n_estimators=10, random_state=123)

    result1 = cross_validation(forest, X, y, scoring=SCORERS["accuracy"], cv=10)
    assert result1.shape[0] == 10
    assert isinstance(result1, pd.DataFrame)

    result2 = cross_validation(forest, X, y, scoring="accuracy", cv=10)
    assert result2.shape[0] == 10
    assert isinstance(result2, pd.DataFrame)

    assert np.abs(result1.iloc[:, 0] - result2.iloc[:, 0]).max() <= 10 ** (-5)
    assert np.abs(result1.iloc[:, 1] - result2.iloc[:, 1]).max() <= 10 ** (-5)

    result1 = cross_validation(forest, X, y, scoring=SCORERS["neg_log_loss"], cv=10)
    assert result1.shape[0] == 10
    assert isinstance(result1, pd.DataFrame)

    result2 = cross_validation(forest, X, y, scoring="neg_log_loss", cv=10)
    assert result2.shape[0] == 10
    assert isinstance(result2, pd.DataFrame)

    assert np.abs(result1.iloc[:, 0] - result2.iloc[:, 0]).max() <= 10 ** (-5)
    assert np.abs(result1.iloc[:, 1] - result2.iloc[:, 1]).max() <= 10 ** (-5)
Example #9
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):
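    # Transformers can't be scored, but with no_scoring=True cross_validation returns the
    # out-of-fold transforms, preserving the input's type, shape and index.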

    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    ##################
    ### Score only ###
    ##################
    with pytest.raises(Exception):
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
        # shouldn't work since DebugPassThrough can't be scored

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert type(Xhat) == type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()

    if isinstance(X, pd.DataFrame):
        assert np.abs(Xhat - X).max().max() <= 10 ** (-10)
    else:
        assert np.max(np.abs(Xhat - X)) <= 10 ** (-10)
Example #10
def test_cross_validation_classifier_multi_output(add_third_class, cast_data_frame, cast_string):
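    # Multi-output classification: with method="predict_proba", one probability DataFrame
    # is returned per output, its columns being that output's sorted class labels.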

    estimator = RandomForestClassifier(n_estimators=10, random_state=123)

    X, y = make_classification(n_samples=10)
    yd2 = np.concatenate((y.reshape((-1, 1)), y.reshape((-1, 1))), axis=1)

    if add_third_class:
        yd2[0, 1] = 2

    if cast_string:
        yd2 = yd2.astype("str").astype("object")
        yd2[:, 0] = "cl_a_" + yd2[:, 0]
        yd2[:, 1] = "cl_b_" + yd2[:, 1]

    if cast_data_frame:
        yd2 = pd.DataFrame(yd2)

    cv_res = cross_validation(estimator, X, yd2, cv=3, scoring="log_loss_patched")
    assert cv_res.shape[0] == 3
    assert isinstance(cv_res, pd.DataFrame)
    assert "test_log_loss_patched" in cv_res.columns
    assert "train_log_loss_patched" in cv_res.columns

    cv_res, yhat = cross_validation(
        estimator, X, yd2, cv=3, scoring="log_loss_patched", return_predict=True, method="predict"
    )

    assert cv_res.shape[0] == 3
    assert isinstance(cv_res, pd.DataFrame)
    assert "test_log_loss_patched" in cv_res.columns
    assert "train_log_loss_patched" in cv_res.columns
    assert isinstance(yhat, np.ndarray)
    assert yhat.shape == yd2.shape

    cv_res, yhat_proba = cross_validation(
        estimator, X, yd2, cv=3, scoring="log_loss_patched", return_predict=True, method="predict_proba"
    )

    assert cv_res.shape[0] == 3
    assert isinstance(cv_res, pd.DataFrame)
    assert "test_log_loss_patched" in cv_res.columns
    assert "train_log_loss_patched" in cv_res.columns
    assert isinstance(yhat_proba, list)
    assert len(yhat_proba) == 2
    for j, p in enumerate(yhat_proba):
        assert p.shape == (yd2.shape[0], 2 + 1 * (j == 1) * (add_third_class))
        assert (p.sum(axis=1) - 1).abs().max() <= 10 ** (-10)
        assert isinstance(p, pd.DataFrame)
        assert p.min().min() >= 0
        assert p.max().max() <= 1

        if cast_data_frame:
            assert list(p.columns) == list(np.sort(np.unique(yd2.iloc[:, j])))
        else:
            assert list(p.columns) == list(np.sort(np.unique(yd2[:, j])))
Example #11
def test_cross_validation_few_sample_per_classes(with_groups):
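    # Class "DD" has a single sample, so StratifiedKFold cannot put it in every fold;
    # the probability output must still contain all four classes.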
    np.random.seed(123)
    X = np.random.randn(100, 2)

    y = np.array(["AA"] * 33 + ["BB"] * 33 + ["CC"] * 33 + ["DD"])
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    cv = StratifiedKFold(n_splits=10)

    logit = LogisticRegression()

    _, yhat_proba = cross_validation(logit,
                                     X,
                                     y,
                                     groups=groups,
                                     cv=cv,
                                     return_predict=True,
                                     no_scoring=True)
    assert (yhat_proba.max(axis=1) > 0).all()

    assert yhat_proba.shape == (100, 4)
    assert list(yhat_proba.columns) == ["AA", "BB", "CC", "DD"]
Example #12
def test_approx_cross_validation_cv(approximate_cv):
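    # Pass an explicit CV object with method="transform": the out-of-fold transform of a
    # pass-through must come back with the same shape as X.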
    X, y = make_classification()

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    estimator = DebugPassThrough()

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups=None,
        cv=cv,
        verbose=1,
        fit_params={},
        return_predict=True,
        method="transform",
        no_scoring=True,
        stopping_round=None,
        stopping_threshold=None,
        approximate_cv=approximate_cv,
    )
    assert cv_res is None
    assert yhat.ndim == 2
    assert yhat.shape == X.shape
Example #13
def test_approx_cross_validation_pass_to_method():
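    # DummyModelWithApprox implements its own approx_cross_validation; with
    # approximate_cv=True, cross_validation must delegate to it (it predicts X[:, 1]
    # and returns a result containing a "scoring" entry).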
    X, y = make_classification(n_samples=100, random_state=123)

    estimator = DummyModelWithApprox()
    cv_res, yhat = cross_validation(
        estimator, X, y, cv=10, no_scoring=True, return_predict=True, method="predict", approximate_cv=True
    )

    assert cv_res is None
    assert yhat.ndim == 1
    assert np.abs(yhat - X[:, 1]).max() <= 10 ** (-5)

    estimator = DummyModelWithApprox()
    cv_res, yhat = cross_validation(
        estimator, X, y, cv=10, no_scoring=False, return_predict=True, method="predict", approximate_cv=True
    )
    assert cv_res is not None
    assert "scoring" in cv_res

    assert yhat.ndim == 1
    assert np.abs(yhat - X[:, 1]).max() <= 10 ** (-5)

    estimator = DummyModelWithApprox()
    cv_res = cross_validation(
        estimator, X, y, cv=10, no_scoring=False, return_predict=False, method="predict", approximate_cv=True
    )
    assert cv_res is not None
    assert "scoring" in cv_res


    estimator = DummyModelWithApprox()
    cv_res = cross_validation(
        estimator,
        X,
        y,
        cv=10,
        scoring=["neg_mean_squared_error"],
        no_scoring=False,
        return_predict=False,
        method="predict",
        approximate_cv=True,
    )
    assert cv_res is not None
    assert "scoring" in cv_res
    assert cv_res["scoring"] == ["neg_mean_squared_error"]
Example #14
def test_cross_validation_time_serie_split():
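    # TimeSeriesSplit does not place every sample in a test fold, so out-of-fold
    # predictions cannot be assembled and yhat is None.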
    X, y = make_classification(n_samples=100, random_state=123)

    cv = TimeSeriesSplit(n_splits=10)

    model = RandomForestClassifier(n_estimators=10, random_state=123)
    cv_res, yhat = cross_validation(model, X, y, cv=cv, return_predict=True)

    assert yhat is None  # because I can't return predictions
    assert len(cv_res) == 10
    assert isinstance(cv_res, pd.DataFrame)
Example #15
    def approx_cross_validation(
        self,
        X,
        y,
        groups=None,
        scoring=None,
        cv=None,
        verbose=1,
        fit_params=None,
        return_predict=False,
        method=None,
        no_scoring=False,
        stopping_round=None,
        stopping_threshold=None,
        _save_outsample_predict=False,
        _use_saved_outsample_predict=False,
    ):
        """ cross validation of the blender of the stacker
        The fold to use to cross-validate the blender are the SAME as the one used to generate 'outsample prediction'
        """

        cv = create_cv(cv,
                       y,
                       classifier=self._is_classifier,
                       shuffle=True,
                       random_state=self.random_state)

        if _use_saved_outsample_predict:
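            # Reuse the out-of-fold predictions cached by a previous call with _save_outsample_predict=True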
            all_yhat_pred = self.all_yhat_pred
        else:
            all_yhat_pred = self.get_outsample(X,
                                               y,
                                               method=self._method,
                                               groups=groups,
                                               cv=cv)

            if _save_outsample_predict:
                self.all_yhat_pred = all_yhat_pred

        return cross_validation(
            self.blender,
            all_yhat_pred,
            y,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params=fit_params,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
        )
Example #16
def test_cross_validation_passing_of_groups():
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)
    groups = np.random.randint(0, 20, size=100)

    estimator = TransformerFailNoGroups()

    cv_res, yhat = cross_validation(estimator, X, y, groups, cv=10, no_scoring=True, return_predict=True)
    # Check that it doesn't fail, meaning the estimator had access to the groups

    assert cv_res is None
    assert (yhat == X).all()
Example #17
def test_RandomTrainTestCv_fail_with_cross_val_predict():
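    # cross_val_predict requires every sample to appear in exactly one test fold, which
    # RandomTrainTestCv does not guarantee; cross_validation returns (None, None) instead.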
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)
    
    cv = RandomTrainTestCv(test_size=0.1, random_state=123)
    
    estimator = DecisionTreeRegressor(max_depth=2, random_state=123)
    
    with pytest.raises(ValueError):
        cross_val_predict(estimator, X, y, cv=cv)
        
    res = cross_validation(estimator, X, y, cv=cv, no_scoring=True, return_predict=True)
    assert res == (None, None)
Example #18
def test_cross_validation_sample_weight():
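    # sample_weight is forwarded to fit through fit_params; DummyModelCheckSampleWeight
    # verifies whether fit actually received it.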
    X, y = make_classification(n_samples=100, random_state=123)
    sample_weight = np.ones(y.shape[0])
    
    estimator = DummyModelCheckSampleWeight()
    estimator.fit(X, y, sample_weight=sample_weight)

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        cv=10,
        no_scoring=True,
        return_predict=True,
        method="predict",
        fit_params={"sample_weight":sample_weight}
    )
    
    # I just need to check that it works
    assert yhat.shape[0] == y.shape[0]
    
    
    estimator = DummyModelCheckSampleWeight()
    estimator.fit(X, y)

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        cv=10,
        no_scoring=True,
        return_predict=True,
        method="predict"
    )
    
    # I just need to check that it works
    assert yhat.shape[0] == y.shape[0]
Example #19
    def approx_cross_validation(self,
                                X,
                                y,
                                groups=None,
                                scoring=None,
                                cv=None,
                                verbose=1,
                                fit_params=None,
                                return_predict=False,
                                method=None,
                                no_scoring=False,
                                stopping_round=None,
                                stopping_threshold=None,
                                nodes_not_to_crossvalidate=None,
                                **kwargs):
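        """ Approximate cross-validation of the pipeline: pre-compute every node that
        can safely be cross-transformed, then run a classical cross-validation on the
        sub-pipeline made of the remaining nodes.
        """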

        ###################
        ### Preparation ###
        ###################
        _orig_verbose = self.verbose

        self.verbose = verbose

        self._complete_init()

        if nodes_not_to_crossvalidate is None:
            nodes_not_to_crossvalidate = set()

        #################################################################
        ### Prepare the list of nodes that can't be 'cv_transformed' ####
        #################################################################
        nodes_cant_cv_transform = set()
        for node, m in self._models.items():
            cant = True
            if hasattr(m, "can_cv_transform"):
                if m.can_cv_transform():
                    cant = False

            if cant:
                nodes_cant_cv_transform.add(node)

        # verif:
        for node in nodes_cant_cv_transform:
            if node not in self._models:
                raise ValueError(
                    "the node (within nodes_cant_cv_transform) %s isn't a node of the model"
                    % node)

        cv = create_cv(
            cv,
            y,
            classifier=sklearn.base.is_classifier(self),
            shuffle=True,
            random_state=123)

        # Split fit_params into a 'step-by-step' dictionary
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        kwargs_step = {name: {} for name in self.complete_graph.nodes}
        if kwargs:
            for key, value in kwargs.items():
                step, param = key.split("__", 1)
                kwargs_step[step][param] = value

        ################################
        ### Pre-calculate everything ###
        ################################
        is_finished, data_dico, result = self._approx_cross_validation_pre_calculation(
            X=X,
            y=y,
            groups=groups,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params_step=fit_params_step,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
            nodes_not_to_crossvalidate=nodes_not_to_crossvalidate,
            nodes_cant_cv_transform=nodes_cant_cv_transform,
            kwargs_step=kwargs_step,
        )

        if is_finished:
            if verbose:
                print("CV is finished")
            self.verbose = _orig_verbose
            return result

        ###########################################################
        ### Create a new graphpipeline with the remaining nodes ###
        ###########################################################
        new_graph_pipeline, new_data_dtm = self._approx_cross_validation_create_sub_graph_pipeline(
            data_dico, X)

        if verbose:
            print("here is a new GraphPipeline")
            print(new_graph_pipeline)

            print("")
            print("new_data_dtm")
            print(type(new_data_dtm))

        ############################################################################
        ### Now do a 'classical cross-validation' on the remaining GraphPipeline ###
        ############################################################################
        result = cross_validation(new_graph_pipeline,
                                  new_data_dtm,
                                  y,
                                  groups=groups,
                                  scoring=scoring,
                                  cv=cv,
                                  verbose=verbose,
                                  fit_params=fit_params,
                                  return_predict=return_predict,
                                  method=method,
                                  no_scoring=no_scoring,
                                  stopping_round=stopping_round,
                                  stopping_threshold=stopping_threshold,
                                  approximate_cv=False,
                                  **kwargs)

        self.verbose = _orig_verbose

        return result
Example #20
def test_approx_cross_validation_early_stop(
    add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups
):
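    # stopping_round=1 with stopping_threshold=1.01 can never be satisfied, so the CV
    # stops after two folds; with stopping_threshold=0.0 all 10 folds are run.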

    X, y = make_classification(n_samples=100, random_state=123)

    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    if add_third_class:
        y[0:2] = 2

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": LogisticRegression(C=1, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = LogisticRegression(C=1, random_state=123)

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=1.01,  # so that accuracy is sure to be below
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 2
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat is None

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=0.0,
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
Example #21
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):
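    # End-to-end check of cross_validation: scores, probabilities and predictions, across
    # data types, shuffling, groups, and GraphPipeline wrapping.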

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    ##################
    ### Only score ###
    ##################

    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #####################
    ### Score + Proba ###
    #####################
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()

    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #######################
    ### Score + Predict ###
    #######################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    assert yhat.shape[0] == y.shape[0]

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )

    assert yhat.shape[0] == y.shape[0]

    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    with pytest.raises(NotFittedError):
        estimator.predict(X)
Example #22
    def _approx_cross_validation_pre_calculation(
        self,
        X,
        y,
        groups,
        scoring,
        cv,
        verbose,
        fit_params_step,
        return_predict,
        method,
        no_scoring,
        stopping_round,
        stopping_threshold,
        nodes_not_to_crossvalidate,
        nodes_cant_cv_transform,
        kwargs_step,
    ):
        """ sub-method to loop through the nodes of the pipeline and pre-compute everything that can be pre-computed """

        data_dico = {}  # Will contain transformed blocks at each node

        nodes_done = set()
        for node in self._nodes_order:

            concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes
            if not concat_at_this_node:
                raise NotImplementedError(
                    "Approx cross-validation does't work if no concatenation (node %s)"
                    % str(node))

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            ### Debugging Help ###
            # if getattr(self,"_return_before_node",None) is not None and getattr(self,"_return_before_node",None) == node:
            #    return data_dico

            model = self._models[node]

            predecessors = list(self.complete_graph.predecessors(node))
            # Careful: the predecessors are not necessarily always in the same order.
            # The concatenation order is fixed below: the order in which the edges
            # were given, with ties broken alphabetically.

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################

                # ==> Apply on original data
                lastX = X

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################

                # ==> Apply on data coming out of last node
                lastX = data_dico[predecessors[0]]
                # data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

            elif len(predecessors) > 1:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors,
                                      key=lambda p:
                                      (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]

                if self.verbose:
                    print("start aggregation...")

                # if do_fit:
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
                # else:
                #    output_type = self._all_concat_type[node]
                # If any predecessor output is None, we can't concatenate
                has_none = any(x is None for x in all_lastX)

                if has_none:
                    lastX = None
                else:
                    lastX = generic_hstack(all_lastX, output_type=output_type)

            if node != self._terminal_node and lastX is not None:
                # This is not the end of the graph

                if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
                    ### 1) Node should BE cross-validated ...
                    ### 2) ... and we CAN use 'cv_transform'

                    if self.verbose:
                        print("do crossvalidation on %s" % node)

                    _, data_dico[node] = cross_validation(
                        model,
                        lastX,
                        y,
                        groups=groups,
                        cv=cv,
                        verbose=verbose,
                        fit_params=fit_params_step[node],
                        return_predict=True,
                        method="transform",
                        no_scoring=True,
                        stopping_round=None,
                        stopping_threshold=None,
                        **kwargs_step[node])

                elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
                    ### 1) Node should BE cross-validated ...
                    ### 2) ... but we can't use 'cv_transform'

                    if self.verbose:
                        print("can't do node %s" % node)
                    data_dico[node] = None  # Can't compute this node

                else:
                    ### Node that shouldn't be cross-validated ###

                    if self.verbose:
                        print("skip crossvalidation on %s" % node)
                    cloned_model = clone(model)
                    if groups is not None and function_has_named_argument(
                            cloned_model.fit_transform, "groups"):
                        data_dico[node] = cloned_model.fit_transform(
                            lastX, y, groups, **fit_params_step[node])
                    else:
                        data_dico[node] = cloned_model.fit_transform(
                            lastX, y, **fit_params_step[node])

            elif lastX is not None:

                ### CV no matter what at the last node ###

                # This is the last node of the Graph
                result = cross_validation(
                    model,
                    lastX,
                    y,
                    groups=groups,
                    scoring=scoring,
                    cv=cv,
                    verbose=verbose,
                    fit_params=fit_params_step[node],
                    return_predict=return_predict,
                    method=method,
                    no_scoring=no_scoring,
                    stopping_round=stopping_round,
                    stopping_threshold=stopping_threshold,
                    **kwargs_step[node])

                # Remark: the fit_time columns are then misleading: they only account for the time spent in the last node

                return True, data_dico, result

            else:
                if self.verbose:
                    print("can't compute node %s because lastX is None" % node)
                data_dico[node] = None

        return False, data_dico, None  # None : no result yet
Example #23
def test_cross_validation0(with_groups):
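    # Basic regression and classification runs: checks the exact result columns and
    # the shape of the returned predictions.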
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    forest = RandomForestRegressor(n_estimators=10)
    result = cross_validation(forest, X, y, groups=groups, scoring=["neg_mean_squared_error", "r2"], cv=10)

    with pytest.raises(sklearn.exceptions.NotFittedError):
        forest.predict(X)

    assert isinstance(result, pd.DataFrame)
    assert list(result.columns) == [
        "test_neg_mean_squared_error",
        "test_r2",
        "train_neg_mean_squared_error",
        "train_r2",
        "fit_time",
        "score_time",
        "n_test_samples",
        "fold_nb",
    ]
    assert len(result) == 10

    forest = RandomForestRegressor(n_estimators=10, random_state=123)
    result, yhat = cross_validation(
        forest, X, y, groups, scoring=["neg_mean_squared_error", "r2"], cv=10, return_predict=True
    )
    with pytest.raises(sklearn.exceptions.NotFittedError):
        forest.predict(X)

    assert isinstance(result, pd.DataFrame)
    assert list(result.columns) == [
        "test_neg_mean_squared_error",
        "test_r2",
        "train_neg_mean_squared_error",
        "train_r2",
        "fit_time",
        "score_time",
        "n_test_samples",
        "fold_nb",
    ]

    assert len(result) == 10
    assert yhat.shape == (100,)

    X = np.random.randn(100, 10)
    y = np.array(["A"] * 33 + ["B"] * 33 + ["C"] * 34)
    forest = RandomForestClassifier(n_estimators=10, random_state=123)

    result = cross_validation(forest, X, y, groups, scoring=["accuracy", "neg_log_loss"], cv=10)
    with pytest.raises(sklearn.exceptions.NotFittedError):
        forest.predict(X)

    assert isinstance(result, pd.DataFrame)
    assert list(result.columns) == [
        "test_accuracy",
        "test_neg_log_loss",
        "train_accuracy",
        "train_neg_log_loss",
        "fit_time",
        "score_time",
        "n_test_samples",
        "fold_nb",
    ]

    assert len(result) == 10

    forest = RandomForestClassifier(random_state=123, n_estimators=10)
    result, yhat = cross_validation(
        forest, X, y, groups, scoring=["accuracy", "neg_log_loss"], cv=10, return_predict=True, method="predict"
    )
    with pytest.raises(sklearn.exceptions.NotFittedError):
        forest.predict(X)

    assert yhat.shape == (100,)
    assert set(np.unique(yhat)) == set(("A", "B", "C"))

    forest = RandomForestClassifier(random_state=123, n_estimators=10)
    result, yhat = cross_validation(
        forest, X, y, groups, scoring=["accuracy", "neg_log_loss"], cv=10, return_predict=True, method="predict_proba"
    )

    with pytest.raises(sklearn.exceptions.NotFittedError):
        forest.predict(X)

    assert yhat.shape == (100, 3)
    assert isinstance(yhat, pd.DataFrame)
    assert list(yhat.columns) == ["A", "B", "C"]