def test_create_cv():
    """Check the splitters returned by ``create_cv``.

    Covers three situations: an integer ``cv`` with ``classifier=True`` on a
    two-class target, an integer ``cv`` on a continuous target, and objects
    that are passed in as ``cv`` and must be returned unchanged.
    """
    y = np.array([0] * 10 + [1] * 10)
    X = np.random.randn(20, 3)

    cv1 = create_cv(cv=10, y=y, classifier=True)
    assert cv1.__class__.__name__ == "StratifiedKFold"
    assert len(list(cv1.split(X, y))) == 10
    # A splitter given as 'cv' must come back as the very same object.
    cv1b = create_cv(cv1)
    assert cv1b is cv1

    y2 = np.random.randn(20)
    cv2 = create_cv(cv=10, y=y2)
    assert cv2.__class__.__name__ == "KFold"
    # Split against y2, the target cv2 was built for (the binary 'y' above
    # belongs to the classification case).
    assert len(list(cv2.split(X, y2))) == 10

    class PersonalizedCV(object):
        """Minimal user-defined object exposing a ``split`` method."""

        def __init__(self):
            pass

        def split(self, X, y, groups=None):
            pass

    # A custom object exposing 'split' must also be passed through untouched.
    cv = PersonalizedCV()
    cv_res = create_cv(cv)
    assert cv is cv_res
示例#2
0
    def get_outsample(self, X, y, method, groups=None, cv=None):
        """ cross-validated predictions of every model, concatenated along axis 1

        When 'cv' is None the splitter already stored in 'self._cv' is used,
        otherwise a splitter is built from 'cv' with 'create_cv'.
        """

        if cv is not None:
            splitter = create_cv(cv,
                                 y,
                                 random_state=self.random_state,
                                 classifier=self._is_classifier,
                                 shuffle=True)
        else:
            splitter = self._cv

        # One block of cross-validated predictions per model, each forced to 2 dimensions
        prediction_blocks = [
            maketwodimensions(
                cross_val_predict(one_model,
                                  X,
                                  y,
                                  groups=groups,
                                  cv=splitter,
                                  method=method))
            for one_model in self.models
        ]

        return np.concatenate(prediction_blocks, axis=1)
示例#3
0
    def approx_cross_validation(
        self,
        X,
        y,
        groups=None,
        scoring=None,
        cv=None,
        verbose=1,
        fit_params=None,
        return_predict=False,
        method=None,
        no_scoring=False,
        stopping_round=None,
        stopping_threshold=None,
        _save_outsample_predict=False,
        _use_saved_outsample_predict=False,
    ):
        """ cross validation of the blender of the stacker

        The splitter built here is used twice: to generate the 'outsample
        prediction' of the models and to cross-validate the blender on them.

        '_use_saved_outsample_predict' reuses the predictions stored in
        'self.all_yhat_pred' instead of recomputing them;
        '_save_outsample_predict' stores freshly computed predictions there.
        """

        cv = create_cv(cv,
                       y,
                       classifier=self._is_classifier,
                       shuffle=True,
                       random_state=self.random_state)

        if not _use_saved_outsample_predict:
            blender_input = self.get_outsample(X,
                                               y,
                                               method=self._method,
                                               groups=groups,
                                               cv=cv)
            if _save_outsample_predict:
                self.all_yhat_pred = blender_input
        else:
            blender_input = self.all_yhat_pred

        blender_cv_result = cross_validation(
            self.blender,
            blender_input,
            y,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params=fit_params,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
        )
        return blender_cv_result
示例#4
0
    def fit_transform(self, X, y, groups=None):
        """ fit the model and return formatted predictions on the training data

        Without cv ('self.cv' is None) this is 'fit' followed by 'transform'.
        With a cv, predictions come from 'cross_val_predict', the model is
        then fitted on all the data and the cross-validated predictions are
        passed through 'self._format_predictions'.
        """

        self._already_fitted = True

        model_is_classifier = is_classifier(self.model)
        if not model_is_classifier and not is_regressor(self.model):
            raise ValueError(
                "model should either be a Classifier or a Regressor")
        self._is_classifier = bool(model_is_classifier)

        if self.cv is None:
            # No CV in that case
            self._cv = None
            return self.fit(X, y).transform(X)

        self._cv = create_cv(self.cv,
                             y,
                             random_state=self.random_state,
                             classifier=self._is_classifier,
                             shuffle=True)

        # Probabilities for a classifier, raw predictions for a regressor
        predict_method = "predict_proba" if self._is_classifier else "predict"
        predictions = cross_val_predict(self.model,
                                        X,
                                        y,
                                        groups=groups,
                                        cv=self._cv,
                                        method=predict_method)

        # Fit on everything before formatting: 'self._target_info' is read just below
        self.fit(X, y)

        return self._format_predictions(predictions,
                                        is_classifier=self._is_classifier,
                                        target_info=self._target_info)
示例#5
0
    def fit_transform(self, X, y):
        """Fit on (X, y) and return the encoded version of X.

        Without cross-validation ('self.cv' is None) the target aggregates are
        fitted and applied on the whole of X. With cross-validation the
        aggregates are fitted on each train fold and applied on the matching
        test fold; the pieces are then concatenated and put back in the row
        order of X.

        Parameters
        ----------
        X : DataFrame-like, indexed with '.iloc' and '.index'
        y : target, converted to a pandas Series when it is not one already

        Returns
        -------
        The result of 'self._transform_aggregat' (whole data or concatenation
        of the per-fold results).

        Raises
        ------
        ValueError
            if 'y' is None
        """

        if y is None:
            raise ValueError("I need a value for 'y'")

        sy = y if isinstance(y, pd.Series) else pd.Series(y)

        self.fit(X, sy)

        X = get_rid_of_categories(X)

        if self.cv is None:  # No Cross Validation ...
            # Use the Series version of the target, like 'fit' above and like
            # the cross-validated branch below (the raw 'y' may be a list/array).
            target_aggregat, target_aggregat_global = self._fit_aggregat(
                X, sy, noise_level=self.noise_level)
            return self._transform_aggregat(X, target_aggregat,
                                            target_aggregat_global)

        cv = create_cv(self.cv,
                       y=sy,
                       classifier=not self.is_regression,
                       random_state=123)

        all_results = []
        for train, test in cv.split(X, sy):
            # Aggregates are learnt on the train fold only ...
            target_aggregat, target_aggregat_global = self._fit_aggregat(
                X.iloc[train, :],
                sy.iloc[train],
                noise_level=self.noise_level)

            # ... and applied on the held-out fold
            sub_result = self._transform_aggregat(X.iloc[test, :],
                                                  target_aggregat,
                                                  target_aggregat_global)

            all_results.append(sub_result)

        all_results = pd.concat(all_results, axis=0)
        # Folds come back in split order: realign on the original index of X
        all_results = all_results.loc[X.index, :]

        assert len(all_results) == len(X)
        assert (all_results.index == X.index).all()
        assert all_results.shape[1] == len(self.get_feature_names())

        return all_results
示例#6
0
    def fit(self, X, y, groups=None):
        """ fit the blender on cross-validated predictions, then refit every model on all the data

        Returns 'self'.
        """

        self._cv = create_cv(
            self.cv,
            y,
            classifier=self._is_classifier,
            random_state=self.random_state,
        )

        # 'cv' is not passed, so get_outsample uses the 'self._cv' just built
        outsample_predictions = self.get_outsample(
            X, y, method=self._method, groups=groups)

        n_rows = y.shape[0]
        assert outsample_predictions.shape[0] == n_rows

        # The blender learns on the cross-validated predictions ...
        self.blender.fit(outsample_predictions, y)

        # ... and each model is refitted on the complete data
        for one_model in self.models:
            one_model.fit(X, y)

        return self
示例#7
0
    def approx_cross_validation(self,
                                X,
                                y,
                                groups=None,
                                scoring=None,
                                cv=None,
                                verbose=1,
                                fit_params=None,
                                return_predict=False,
                                method=None,
                                no_scoring=False,
                                stopping_round=None,
                                stopping_threshold=None,
                                nodes_not_to_crossvalidate=None,
                                **kwargs):
        """Approximate cross-validation of the pipeline.

        A pre-calculation step (``_approx_cross_validation_pre_calculation``)
        is run first. If it reports that everything is done, its result is
        returned. Otherwise a sub-pipeline is built from the remaining nodes
        and a regular ``cross_validation`` is run on it.

        ``fit_params`` and ``**kwargs`` keys use the ``<node>__<param>``
        convention and are dispatched per node for the pre-calculation step.

        ``self.verbose`` is set to ``verbose`` for the duration of the call and
        always restored afterwards, including when an exception is raised.

        Raises
        ------
        ValueError
            if a node listed in ``nodes_not_to_crossvalidate`` is not a node
            of the model.
        """

        ###################
        ### Preparation ###
        ###################
        _orig_verbose = self.verbose
        self.verbose = verbose

        try:
            self._complete_init()

            if nodes_not_to_crossvalidate is None:
                nodes_not_to_crossvalidate = set()

            # verif: only the caller-supplied nodes can be wrong; the nodes of
            # 'nodes_cant_cv_transform' below are taken from self._models itself
            for node in nodes_not_to_crossvalidate:
                if node not in self._models:
                    raise ValueError(
                        "the node (within nodes_not_to_crossvalidate) %s isn't in the node of the model"
                        % node)

            #################################################################
            ### Prepare the list of nodes that can't be 'cv_transformed' ####
            #################################################################
            nodes_cant_cv_transform = {
                node
                for node, m in self._models.items()
                if not (hasattr(m, "can_cv_transform") and m.can_cv_transform())
            }

            cv = create_cv(
                cv,
                y,
                classifier=sklearn.model_selection._validation.is_classifier(
                    self),
                shuffle=True,
                random_state=123)

            def _split_by_step(params):
                """Turn {'step__param': value} into {step: {param: value}}."""
                by_step = {name: {} for name in self.complete_graph.nodes}
                for key, value in (params or {}).items():
                    step, param = key.split("__", 1)
                    by_step[step][param] = value
                return by_step

            # Split fit_params / kwargs into 'step-by-step' dictionaries
            fit_params_step = _split_by_step(fit_params)
            kwargs_step = _split_by_step(kwargs)

            ################################
            ### Pre-calculate everything ###
            ################################
            is_finished, data_dico, result = self._approx_cross_validation_pre_calculation(
                X=X,
                y=y,
                groups=groups,
                scoring=scoring,
                cv=cv,
                verbose=verbose,
                fit_params_step=fit_params_step,
                return_predict=return_predict,
                method=method,
                no_scoring=no_scoring,
                stopping_round=stopping_round,
                stopping_threshold=stopping_threshold,
                nodes_not_to_crossvalidate=nodes_not_to_crossvalidate,
                nodes_cant_cv_transform=nodes_cant_cv_transform,
                kwargs_step=kwargs_step,
            )

            if is_finished:
                if verbose:
                    print("CV is finished")
                return result

            ###########################################################
            ### Create a new graphpipeline with the remaining nodes ###
            ###########################################################
            new_graph_pipeline, new_data_dtm = self._approx_cross_validation_create_sub_graph_pipeline(
                data_dico, X)

            if verbose:
                print("here is a new GraphPipeline")
                print(new_graph_pipeline)

                print("")
                print("new_data_dtm")
                print(type(new_data_dtm))

            ############################################################################
            ### Now do a 'classical cross-validation' on the remaining GraphPipeline ###
            ############################################################################
            return cross_validation(new_graph_pipeline,
                                    new_data_dtm,
                                    y,
                                    groups=groups,
                                    scoring=scoring,
                                    cv=cv,
                                    verbose=verbose,
                                    fit_params=fit_params,
                                    return_predict=return_predict,
                                    method=method,
                                    no_scoring=no_scoring,
                                    stopping_round=stopping_round,
                                    stopping_threshold=stopping_threshold,
                                    approximate_cv=False,
                                    **kwargs)
        finally:
            # Restore the verbosity on every exit path, errors included
            self.verbose = _orig_verbose
示例#8
0
    def approx_cross_validation(
        self,
        X,
        y,
        groups=None,
        scoring=None,
        cv=None,
        verbose=1,
        fit_params=None,
        return_predict=False,
        method="transform",
        no_scoring=True,
        stopping_round=None,
        stopping_threshold=None,
    ):
        """Cross-validated 'transform' of the wrapped model.

        The predictions of 'self.model' are computed with 'cross_val_predict'
        ('predict_proba' for a classifier, 'predict' for a regressor) and
        formatted with 'self._format_predictions'.

        Returns
        -------
        (None, result) : there is no score since this is a transformer.

        Raises
        ------
        ValueError
            if the model is neither a classifier nor a regressor, if 'cv' is
            None, if 'no_scoring' is false or if 'method' isn't "transform".
        """

        if is_classifier(self.model):
            _is_classifier = True

        elif is_regressor(self.model):
            _is_classifier = False

        else:
            raise ValueError(
                "model should either be a Classifier or a Regressor")

        if cv is None:
            # There is no fallback on the cv of the object: the caller must give one
            raise ValueError("I need a cv do cross-validate")

        # Use the boolean computed above, not the 'is_classifier' function
        # (a function object is always truthy).
        cv = create_cv(cv,
                       y,
                       random_state=123,
                       classifier=_is_classifier,
                       shuffle=True)

        target_info = self._get_target_info(y, _is_classifier)
        if not no_scoring:
            raise ValueError("no scoring should be True for a transformer")

        if method != "transform":
            raise ValueError("method should be 'transform' for a transformer")

        # Probabilities for a classifier, raw predictions for a regressor
        predict_method = "predict_proba" if _is_classifier else "predict"
        predictions = cross_val_predict(self.model,
                                        X,
                                        y,
                                        groups=groups,
                                        cv=cv,
                                        method=predict_method)

        result = self._format_predictions(predictions,
                                          is_classifier=_is_classifier,
                                          target_info=target_info)

        # None : no scoring, this is a transformer
        return None, result
示例#9
0
    def approx_cross_validation(
        self,
        X,
        y,
        groups=None,
        scoring=None,
        cv=None,
        verbose=1,
        fit_params=None,
        return_predict=False,
        method="transform",
        no_scoring=True,
        stopping_round=None,
        stopping_threshold=None,
    ):
        """ cross-validated predictions of 'self.model', returned as a 'transform' result

        Returns (None, predictions): no score is computed for a transformer.
        For a classifier with exactly 2 distinct values in 'y' only column 1
        of 'predict_proba' is kept.
        """

        model_is_classifier = is_classifier(self.model)
        if not model_is_classifier and not is_regressor(self.model):
            raise ValueError(
                "model should either be a Classifier or a Regressor")
        _is_classifier = bool(model_is_classifier)

        if cv is None:
            raise ValueError("I need a cv do cross-validate")

        cv = create_cv(cv,
                       y,
                       random_state=123,
                       classifier=_is_classifier,
                       shuffle=True)

        if not no_scoring:
            raise ValueError("no scoring should be True for a transformer")

        if method != "transform":
            raise ValueError("method should be 'transform' for a transformer")

        if _is_classifier:
            keep_second_column_only = len(np.unique(y)) == 2
            predict_method = "predict_proba"
        else:
            keep_second_column_only = False
            predict_method = "predict"

        raw_predictions = cross_val_predict(self.model,
                                            X,
                                            y,
                                            groups=groups,
                                            cv=cv,
                                            method=predict_method)
        if keep_second_column_only:
            raw_predictions = raw_predictions[:, 1]

        # None : no scoring, this is a transformer
        return None, maketwodimensions(raw_predictions)
示例#10
0
    def fit_transform(self, X, y, groups=None):
        """ fit the model and return its cross-validated predictions on the training data

        Without cv ('self.cv' is None) this is 'fit' followed by 'transform'.
        Otherwise predictions come from 'cross_val_predict', the model is
        refitted on all the data and the feature names are stored in
        'self._feature_names'.
        """

        self._already_fitted = True

        model_is_classifier = is_classifier(self.model)
        if not model_is_classifier and not is_regressor(self.model):
            raise ValueError(
                "model should either be a Classifier or a Regressor")
        self._is_classifier = bool(model_is_classifier)

        if self.cv is None:
            # No CV in that case
            self._cv = None
            return self.fit(X, y).transform(X)

        self._cv = create_cv(self.cv,
                             y,
                             random_state=self.random_state,
                             classifier=self._is_classifier,
                             shuffle=True)

        if self._is_classifier:
            self._nby = len(np.unique(y))
            predict_method = "predict_proba"
        else:
            predict_method = "predict"

        raw_predictions = cross_val_predict(self.model,
                                            X,
                                            y,
                                            groups=groups,
                                            cv=self._cv,
                                            method=predict_method)
        # With 2 distinct values in y, only column 1 of predict_proba is kept
        if self._is_classifier and self._nby == 2:
            raw_predictions = raw_predictions[:, 1]
        all_yhat_pred = maketwodimensions(raw_predictions)

        self.model.fit(X, y)

        model_name = self.model.__class__.__name__
        if not self._is_classifier:
            # Regression model
            names = ["%s__target" % model_name]
        elif self._nby == 2:
            names = ["%s__%s" % (model_name, self.model.classes_[1])]
        else:
            names = [
                "%s__%s" % (model_name, one_class)
                for one_class in self.model.classes_
            ]

        if self.columns_prefix is not None:
            names = ["%s__%s" % (self.columns_prefix, name) for name in names]

        self._feature_names = names

        # Only objects exposing 'columns' (e.g. a DataFrame) get the names attached
        if hasattr(all_yhat_pred, "columns"):
            all_yhat_pred.columns = self.get_feature_names()

        return all_yhat_pred