Example #1
def _fit_and_score_grid(estimator,
                        X,
                        y,
                        scorer,
                        train,
                        test,
                        grid,
                        fit_params,
                        error_score='raise'):
    '''Fit ``estimator`` once on the train split, then score every parameter
    setting in ``grid`` on the test split, reusing the single fit.'''

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            scores = [error_score] * len(grid)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        origParams = estimator.get_params()
        scores = [
            _score(estimator.set_params(**params), X_test, y_test, scorer)
            for params in grid
        ]
        estimator.set_params(**origParams)

    return scores
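A minimal driver sketch for the helper above, assuming scikit-learn is installed; the estimator, fold indices, and parameter grid are illustrative, and `_fit_and_score_grid` is the snippet's own private helper.

# Hypothetical usage (illustrative names and values).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import KFold, ParameterGrid

X, y = load_iris(return_X_y=True)
estimator = LogisticRegression(max_iter=1000)
scorer = check_scoring(estimator, scoring="accuracy")
grid = list(ParameterGrid({"C": [0.1, 1.0, 10.0]}))
train, test = next(KFold(n_splits=5).split(X))

# One score per parameter setting, all evaluated on the same fold.
scores = _fit_and_score_grid(estimator, X, y, scorer, train, test,
                             grid, fit_params={})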
Example #2
    def _gen_train_val(self, X, y, cv_split_method):
        X, y, groups = indexable(X, y, None)
        Xs_tr, ys_tr, Xs_cv, ys_cv = [], [], [], []

        if isinstance(cv_split_method, BaseCrossValidator):
            for tr, cv in cv_split_method.split(X, y, groups):
                X_tr, y_tr = _safe_split(self, X, y, tr)
                X_cv, y_cv = _safe_split(self, X, y, cv, tr)
                Xs_tr.append(X_tr)
                Xs_cv.append(X_cv)
                ys_tr.append(y_tr)
                ys_cv.append(y_cv)
        elif cv_split_method.__name__ == 'train_test_split':
            X_tr, X_cv, y_tr, y_cv = train_test_split(
                X,
                y,
                random_state=self._random_state,
                test_size=self.validation_fraction)
            Xs_tr.append(X_tr)
            Xs_cv.append(X_cv)
            ys_tr.append(y_tr)
            ys_cv.append(y_cv)
        else:
            raise ValueError("Split method should be a "
                             "sklearn.model_selection spliter class...")

        return Xs_tr, ys_tr, Xs_cv, ys_cv
Example #3
File: helpers.py Project: kant/lale
def cross_val_score(estimator, X, y=None, scoring=accuracy_score, cv=5):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv' and compute the given score on
    each of the splits.
    :param estimator: A valid sklearn_wrapper estimator
    :param X, y: Valid data and target values that work with the estimator
    :param scoring: a scorer object from sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics)
             Default value is accuracy_score.
    :param cv: an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
        Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.
    :return: cv_results: a list of scores corresponding to each cross validation fold
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    cv_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        trained_estimator = estimator.fit(X_train, y_train)
        predicted_values = trained_estimator.predict(X_test)
        cv_results.append(scoring(y_test, predicted_values))

    return cv_results
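A short usage sketch of this helper, assuming a plain scikit-learn classifier stands in for the "sklearn_wrapper estimator" mentioned in the docstring; dataset and printed values are illustrative.

# Illustrative call; returns one accuracy_score per StratifiedKFold fold.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
fold_scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y, cv=3)
print(fold_scores)  # e.g. [0.98, 0.94, 0.98]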
Example #4
    def _train(self):
        """Trains one iteration of the model called when ``tune.run`` is called.

        Different routines are run depending on if the ``early_stopping`` attribute 
        is True or not. 
        - If ``self.early_stopping`` is True, each fold is fit with `partial_fit`, 
          which stops training the model if the validation score is not improving 
          for a particular fold. 
        - Otherwise, run the full cross-validation procedure.
        In both cases, the average test accuracy is returned over all folds, 
        and is returned as a singleton dictionary with "average_test_score" as the key.
        """
        if self.early_stopping:
            for i, (train, test) in enumerate(self.cv.split(self.X, self.y)):
                X_train, y_train = _safe_split(self.estimator, self.X, self.y,
                                               train)
                X_test, y_test = _safe_split(self.estimator,
                                             self.X,
                                             self.y,
                                             test,
                                             train_indices=train)
                self.estimator[i].partial_fit(X_train, y_train,
                                              np.unique(self.y))
                if self.return_train_score:
                    self.fold_train_scores[i] = self.scoring(
                        self.estimator[i], X_train, y_train)
                self.fold_scores[i] = self.scoring(self.estimator[i], X_test,
                                                   y_test)

            self.mean_scores = sum(self.fold_scores) / len(self.fold_scores)

            if self.return_train_score:
                self.mean_train_scores = sum(self.fold_train_scores) / len(
                    self.fold_train_scores)
                return {
                    "average_test_score": self.mean_scores,
                    "average_train_score": self.mean_train_scores
                }

            return {"average_test_score": self.mean_scores}
        else:
            scores = cross_validate(self.estimator,
                                    self.X,
                                    self.y,
                                    cv=self.cv,
                                    fit_params=self.fit_params,
                                    groups=self.groups,
                                    scoring=self.scoring)
            self.test_accuracy = sum(scores["test_score"]) / len(
                scores["test_score"])
            if self.return_train_score:
                self.train_accuracy = sum(scores["train_score"]) / len(
                    scores["train_score"])
                return {
                    "average_test_score": self.test_accuracy,
                    "average_train_score": self.train_accuracy
                }
            return {"average_test_score": self.test_accuracy}
Example #5
def _permutation_test_score(estimator, X, y, groups, cv, scorer):
    """Auxiliary function for permutation_test_score"""
    avg_score = []
    for train, test in cv.split(X, y, groups):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        estimator.fit(X_train, y_train)
        avg_score.append(scorer(estimator, X_test, y_test))
    return np.mean(avg_score)
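For context, a hedged sketch of how a snippet like this gets called; the scorer comes from scikit-learn's public check_scoring, and the dataset and splitter are illustrative.

# Hypothetical call computing the mean CV score for one (possibly permuted) y.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import StratifiedKFold

X, y = load_iris(return_X_y=True)
est = LogisticRegression(max_iter=1000)
mean_score = _permutation_test_score(
    est, X, y, groups=None, cv=StratifiedKFold(3),
    scorer=check_scoring(est, scoring="accuracy"))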
Example #7
def _baf_single_fit(train, test, baf, estimator, X, y, scorer, random_state):
    """"""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    baf_i = clone(baf)
    baf_i.random_state = random_state
    baf_i._fit(X_train, y_train)
    return baf_i.support_, _score(baf_i.estimator_, baf_i.transform(X_test),
                                  y_test, scorer), baf_i.score_
Example #8
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """
    Return the score for a fit across one fold.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfe._fit(
        X_train, y_train, lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_
Example #9
def _patch_split(estimator, X, y, indices, train_indices=None):
    if isinstance(y, dict):
        mixed_y = {}
        for key, _y in y.items():
            X_subset, y_subset = _safe_split(estimator, X, _y, indices, train_indices=train_indices)
            mixed_y[key] = y_subset
        return X_subset, mixed_y

    else:
        return _safe_split(estimator, X, y, indices, train_indices=train_indices)
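A toy call showing the dict-of-targets path; all names and values here are illustrative, and estimator=None works because `_safe_split` only inspects the estimator for precomputed-kernel handling.

# Each entry of the dict is sliced to the same rows as X.
import numpy as np

X = np.arange(20).reshape(10, 2)
y = {"coarse": np.arange(10) % 2, "fine": np.arange(10) % 5}
train = np.arange(7)

X_tr, y_tr = _patch_split(None, X, y, train)
assert X_tr.shape == (7, 2) and set(y_tr) == {"coarse", "fine"}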
Example #10
def _permutations(estimator, X, y, cv, scorer):
    """Auxiliary function for permutations"""
    avg_score = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        # X_train, X_test = impute_data(X_train, X_test)
        estimator.fit(X_train, y_train)
        avg_score.append(scorer(estimator, X_test, y_test))
    return np.mean(avg_score)
Example #11
    def _partial_fit_and_score(
            self,
            estimator,  # type: BaseEstimator
            train,  # type: List[int]
            test,  # type: List[int]
            partial_fit_params,  # type: Dict[str, Any]
    ):
        # type: (...) -> List[Number]

        X_train, y_train = _safe_split(estimator, self.X, self.y, train)
        X_test, y_test = _safe_split(estimator,
                                     self.X,
                                     self.y,
                                     test,
                                     train_indices=train)

        start_time = time()

        try:
            estimator.partial_fit(X_train, y_train, **partial_fit_params)

        except Exception as e:
            if self.error_score == "raise":
                raise e

            elif isinstance(self.error_score, Number):
                fit_time = time() - start_time
                test_score = self.error_score
                score_time = 0.0

                if self.return_train_score:
                    train_score = self.error_score

            else:
                raise ValueError("error_score must be 'raise' or numeric.")

        else:
            fit_time = time() - start_time
            test_score = self.scoring(estimator, X_test, y_test)
            score_time = time() - fit_time - start_time

            if self.return_train_score:
                train_score = self.scoring(estimator, X_train, y_train)

        # Required for type checking but is never expected to fail.
        assert isinstance(fit_time, Number)
        assert isinstance(score_time, Number)

        ret = [test_score, fit_time, score_time]

        if self.return_train_score:
            ret.insert(0, train_score)

        return ret
Example #12
def cross_val_score_track_trials(estimator,
                                 X,
                                 y=None,
                                 scoring=accuracy_score,
                                 cv=5):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv' and compute the given score on 
    each of the splits.

    Parameters
    ----------

    estimator: A valid sklearn_wrapper estimator
    X, y: Valid data and target values that work with the estimator
    scoring: string or a scorer object created using 
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
        A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created from one of 
        sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
        A completely custom scorer object can be created from a python function following the example at 
        https://scikit-learn.org/stable/modules/model_evaluation.html
        The metric has to return a scalar value.
    cv: an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
        Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.

    :return: a tuple of (mean score, mean log loss, mean execution time) across the cross-validation folds
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    scorer = check_scoring(estimator, scoring=scoring)
    cv_results: List[float] = []
    log_loss_results = []
    time_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        start = time.time()
        trained = estimator.fit(X_train, y_train)
        score_value = scorer(trained, X_test, y_test)
        execution_time = time.time() - start
        # not all estimators have predict probability
        try:
            y_pred_proba = trained.predict_proba(X_test)
            logloss = log_loss(y_true=y_test, y_pred=y_pred_proba)
            log_loss_results.append(logloss)
        except BaseException:
            logger.debug("Warning, log loss cannot be computed")
        cv_results.append(score_value)
        time_results.append(execution_time)
    return np.array(cv_results).mean(), np.array(
        log_loss_results).mean(), np.array(time_results).mean()
Example #13
def _safe_split_multi(estimator, X, y, train, test):
    X_train, y_train, X_test, y_test = [], [], [], []
    for x_, y_, tr_, ts_ in zip(X, y, train, test):
        out = [_safe_split(estimator, x__, y_, tr_) for x__ in x_]
        X_tr, y_tr = zip(*out)
        X_train.append(np.array(X_tr))
        y_train.append(y_tr[0])  # they are all equal

        out = [_safe_split(estimator, x__, y_, ts_, tr_) for x__ in x_]
        X_ts, y_ts = zip(*out)
        X_test.append(np.array(X_ts))
        y_test.append(y_ts[0])  # they are all equal
    return X_train, y_train, X_test, y_test
Example #15
    def _partial_fit_and_score(
            self,
            estimator,  # type: BaseEstimator
            train,  # type: List[int]
            test,  # type: List[int]
            partial_fit_params  # type: Dict[str, Any]
    ):
        # type: (...) -> List[float]

        X_train, y_train = _safe_split(estimator, self.X, self.y, train)
        X_test, y_test = _safe_split(estimator,
                                     self.X,
                                     self.y,
                                     test,
                                     train_indices=train)

        start_time = time()

        try:
            estimator.partial_fit(X_train, y_train, **partial_fit_params)

        except Exception as e:
            if self.error_score == 'raise':
                raise e

            elif isinstance(self.error_score, Number):
                fit_time = time() - start_time
                test_score = self.error_score
                score_time = 0.0

                if self.return_train_score:
                    train_score = self.error_score

            else:
                raise ValueError('error_score must be \'raise\' or numeric.')

        else:
            fit_time = time() - start_time
            test_score = self.scoring(estimator, X_test, y_test)
            score_time = time() - fit_time - start_time

            if self.return_train_score:
                train_score = self.scoring(estimator, X_train, y_train)

        ret = [test_score, fit_time, score_time]

        if self.return_train_score:
            ret.insert(0, train_score)

        return ret
Example #16
def _rfe_single_fit(rfe, estimator, x_data, y_data, train, test, scorer,
                    **fit_kwargs):
    """Return the score for a fit across one fold."""
    (x_train, y_train) = _safe_split(estimator, x_data, y_data, train)
    (x_test, y_test) = _safe_split(estimator, x_data, y_data, test, train)
    (fit_kwargs_train, _) = _split_fit_kwargs(fit_kwargs, train, test)

    def step_score(estimator, features):
        """Score for a single step in the recursive feature elimination."""
        return _score(estimator, x_test[:, features], y_test, scorer)

    return rfe._fit(x_train,
                    y_train,
                    step_score=step_score,
                    **fit_kwargs_train).scores_
Example #17
    def transform(self, X, y=None):
        if self.time_based:
            x_mask = _create_mask(X, self.data_range)
            return X[x_mask]
        else:
            X_train, _ = _safe_split(None, X, None, self.data_range)
            return X_train
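With ``time_based=False``, the ``_safe_split(None, ...)`` call above degrades to plain positional row selection, since no estimator (and hence no precomputed-kernel handling) is involved. A quick sanity sketch with illustrative data:

# Passing estimator=None makes _safe_split behave like row indexing.
import numpy as np

X = np.arange(12).reshape(6, 2)
X_sel, _ = _safe_split(None, X, None, [0, 2])
assert np.array_equal(X_sel, X[[0, 2]])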
Example #18
    def fit_transform(self, X, y=None):
        # TODO make checks if the data_range is set correctly
        if self.time_based:
            x_mask = _create_mask(X, self.data_range)
            if y is not None:
                assert len(X) == len(y)
                y_mask = _create_mask(y, self.data_range)
                return X[x_mask], y[y_mask]
            else:
                return X[x_mask]
        else:
            if y is not None:
                X, y = _safe_split(None, X, y, self.data_range)
                return X, y
            else:
                X, _ = _safe_split(None, X, y, self.data_range)
                return X
Example #19
    def _partial_fit_and_score(self, estimator, train, test,
                               partial_fit_params):
        # type: (...) -> List[float]

        X_train, y_train = _safe_split(estimator, self.X, self.y, train)
        X_test, y_test = _safe_split(estimator,
                                     self.X,
                                     self.y,
                                     test,
                                     train_indices=train)

        start_time = perf_counter()

        try:
            estimator.partial_fit(X_train, y_train, **partial_fit_params)

        except Exception as e:
            if self.error_score == 'raise':
                raise e

            elif isinstance(self.error_score, Number):
                fit_time = perf_counter() - start_time
                test_score = self.error_score
                score_time = 0.0

                if self.return_train_score:
                    train_score = self.error_score

            else:
                raise ValueError("error_score must be 'raise' or numeric.")

        else:
            fit_time = perf_counter() - start_time
            test_score = self.scoring(estimator, X_test, y_test)
            score_time = perf_counter() - fit_time - start_time

            if self.return_train_score:
                train_score = self.scoring(estimator, X_train, y_train)

        ret = [test_score, fit_time, score_time]

        if self.return_train_score:
            ret.insert(0, train_score)

        return ret
Example #20
    def test_kernel_precomputed(gbsg2):
        x, y = gbsg2
        from sklearn.metrics.pairwise import pairwise_kernels
        from sklearn.utils.metaestimators import _safe_split

        m = MinlipSurvivalAnalysis(kernel="precomputed", solver="ecos")
        K = pairwise_kernels(x, metric="rbf", gamma=1. / 32)

        train_idx = numpy.arange(50, x.shape[0])
        test_idx = numpy.arange(50)
        X_fit, y_fit = _safe_split(m, K, y, train_idx)
        X_test, y_test = _safe_split(m, K, y, test_idx, train_idx)

        m.fit(X_fit, y_fit)

        p = m.predict(X_test)
        assert_cindex_almost_equal(y_test['cens'], y_test['time'], p,
                                   (0.626514131897712, 457, 269, 17, 0))
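The precomputed-kernel tests above rely on ``_safe_split`` slicing the Gram matrix two ways: rows by the indices being evaluated, columns by the training indices. A small sketch of the equivalent NumPy indexing, with illustrative data:

# Equivalent slicing for a precomputed kernel.
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(6, 3)
K = X @ X.T                               # Gram matrix over all samples
train_idx, test_idx = np.arange(2, 6), np.arange(2)

K_fit = K[np.ix_(train_idx, train_idx)]   # square train-vs-train block
K_test = K[np.ix_(test_idx, train_idx)]   # test rows vs. train columns
assert K_fit.shape == (4, 4) and K_test.shape == (2, 4)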
Example #21
File: helpers.py Project: kant/lale
def cross_val_score_track_trials(estimator,
                                 X,
                                 y=None,
                                 scoring=accuracy_score,
                                 cv=5):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv' and compute the given score on 
    each of the splits.

    :param estimator: A valid sklearn_wrapper estimator
    :param X, y: Valid data and target values that work with the estimator
    :param scoring: a scorer object from sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics)
             Default value is accuracy_score.
    :param cv: an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
        Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.

    :return: a tuple of (mean score, mean log loss, mean execution time) across the cross-validation folds
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    cv_results = []
    log_loss_results = []
    time_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        start = time.time()
        trained_estimator = estimator.fit(X_train, y_train)
        predicted_values = trained_estimator.predict(X_test)
        execution_time = time.time() - start
        # not all estimators have predict probability
        try:
            y_pred_proba = trained_estimator.predict_proba(X_test)
            logloss = log_loss(y_true=y_test, y_pred=y_pred_proba)
            log_loss_results.append(logloss)
        except BaseException:
            logger.debug("Warning, log loss cannot be computed")
        cv_results.append(scoring(y_test, predicted_values))
        time_results.append(execution_time)

    return np.array(cv_results).mean(), np.array(
        log_loss_results).mean(), np.array(time_results).mean()
Example #22
def _incremental_fit_estimator(estimator, X, y, classes, train, test,
                               train_sizes, scorer, verbose):
    """Train estimator on training subsets incrementally and compute scores."""
    train_scores, test_scores = [], []
    partitions = zip(train_sizes, np.split(train, train_sizes)[:-1])
    for n_train_samples, partial_train in partitions:
        train_subset = train[:n_train_samples]
        X_train, y_train = _safe_split(estimator, X, y, train_subset)
        X_partial_train, y_partial_train = _safe_split(estimator, X, y,
                                                       partial_train)
        X_test, y_test = _safe_split(estimator, X, y, test, train_subset)
        if y_partial_train is None:
            estimator.partial_fit(X_partial_train, classes=classes)
        else:
            estimator.partial_fit(X_partial_train, y_partial_train,
                                  classes=classes)
        train_scores.append(_score(estimator, X_train, y_train, scorer))
        test_scores.append(_score(estimator, X_test, y_test, scorer))
    return np.array((train_scores, test_scores)).T
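The ``partitions`` zip above pairs each cumulative training size with only the newly added chunk of indices, so each ``partial_fit`` call sees just the new data while scoring uses the cumulative subset. A small trace with illustrative values:

# How np.split(train, train_sizes)[:-1] yields the incremental chunks.
import numpy as np

train = np.arange(10)
train_sizes = np.array([2, 5, 8])
chunks = np.split(train, train_sizes)[:-1]   # [[0 1], [2 3 4], [5 6 7]]
for n_train_samples, partial_train in zip(train_sizes, chunks):
    print(n_train_samples, partial_train)    # 2 [0 1] / 5 [2 3 4] / 8 [5 6 7]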
Example #23
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)
Example #24
    def test_kernel_precomputed(self):
        from sklearn.metrics.pairwise import pairwise_kernels
        from sklearn.utils.metaestimators import _safe_split

        m = MinlipSurvivalAnalysis(kernel="precomputed", solver="cvxpy")
        K = pairwise_kernels(self.x, metric="rbf")

        train_idx = numpy.arange(50, self.x.shape[0])
        test_idx = numpy.arange(50)
        X_fit, y_fit = _safe_split(m, K, self.y, train_idx)
        X_test, y_test = _safe_split(m, K, self.y, test_idx, train_idx)

        m.fit(X_fit, y_fit)

        p = m.predict(X_test)
        v = concordance_index_censored(y_test['cens'], y_test['time'], p)

        expected = numpy.array([0.508748, 378, 365, 0, 0])

        assert_array_almost_equal(expected, v)
Example #26
def _fit_and_score(est, x, y, scorer, train_index, test_index, parameters,
                   fit_params, predict_params):
    """Train survival model on given data and return its score on test data"""
    X_train, y_train = _safe_split(est, x, y, train_index)
    train_params = fit_params.copy()

    # Training
    est.set_params(**parameters)
    est.fit(X_train, y_train, **train_params)

    # Testing
    test_predict_params = predict_params.copy()
    X_test, y_test = _safe_split(est, x, y, test_index, train_index)

    score = scorer(est, X_test, y_test, **test_predict_params)
    if not isinstance(score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s) instead." %
                         (str(score), type(score)))

    return score
Example #27
def _fit_ovo_binary(estimator, X, y, i, j):
    """Fit a single binary estimator (one-vs-one)."""
    cond = np.logical_or(y == i, y == j)
    y = y[cond]
    y_binary = np.empty(y.shape, dtype=int)
    y_binary[y == i] = 0
    y_binary[y == j] = 1
    indcond = np.arange(X.shape[0])[cond.reshape(-1,)]
    return _fit_binary(estimator,
                       _safe_split(estimator, X, None, indices=indcond)[0],
                       y_binary, classes=[i, j]), indcond
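A toy trace of the masking logic above; the values are illustrative.

# Building the (i=0, j=2) binary sub-problem from a 3-class target.
import numpy as np

y = np.array([0, 2, 1, 2, 0, 1])
cond = np.logical_or(y == 0, y == 2)   # [T, T, F, T, T, F]
y_binary = (y[cond] == 2).astype(int)  # [0, 1, 1, 0]
indcond = np.arange(len(y))[cond]      # [0, 1, 3, 4]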
Example #28
def _predict_proba(estimator, X, y, train, test,
                   verbose, parameters, fit_params):
    '''
    Fit an estimator to the training set and output probability predictions
    (and true labels, if applicable) for the test set.

    Adapted from mne.decoding.base._fit_and_score()
    '''

    from mne.fixes import _check_fit_params
    from sklearn.utils.metaestimators import _safe_split
    
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    if parameters is not None:
        estimator.set_params(**parameters)

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    
    y_hat = estimator.predict_proba(X_test)
    
    if y_train is None:
        return y_hat
    else:
        return y_hat, y_test
示例#29
0
文件: train.py 项目: stenpiren/churnr
def _fit_and_predict(estimator, X, y, train, test, class_ratio, verbose,
                     fit_params, method):
    from sklearn.utils.metaestimators import _safe_split
    from sklearn.model_selection._validation import _index_param_value
    from imblearn.under_sampling import RandomUnderSampler

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    rus = RandomUnderSampler(ratio=class_ratio,
                             return_indices=True,
                             random_state=42)
    if len(X.shape) < 2:
        X0 = X[0][train]
        y_train = y[train]

        _, _, idxs = rus.fit_sample(X0, y_train)
        X_train = np.empty(shape=(len(idxs), X.shape[0], X0.shape[1]))
        X_test = np.empty(shape=(len(test), X.shape[0], X0.shape[1]))
        for i in range(X.shape[0]):
            X_train[:, i, :] = X[i][train][idxs].toarray()
            X_test[:, i, :] = X[i][test].toarray()

        y_train = to_categorical(y_train[idxs])
    else:
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, _ = _safe_split(estimator, X, y, test, train)

        _, _, idxs = rus.fit_sample(X_train, y_train)
        X_train = X_train[idxs]
        y_train = y_train[idxs]

    clf = estimator.fit(X_train, y_train, **fit_params)

    func = getattr(estimator, method)

    logger.info('-- predict_proba()')
    predictions = func(X_test)
    return predictions, test, pd.DataFrame(clf.cv_results_)
Example #30
    def test_kernel_precomputed(gbsg2):
        x, y = gbsg2
        from sklearn.metrics.pairwise import pairwise_kernels
        from sklearn.utils.metaestimators import _safe_split

        m = MinlipSurvivalAnalysis(kernel="precomputed",
                                   solver="osqp",
                                   max_iter=25000)
        xt = scale(x)
        K = pairwise_kernels(xt, metric="rbf", gamma=0.1)

        train_idx = numpy.arange(200, x.shape[0])
        test_idx = numpy.arange(200)
        X_fit, y_fit = _safe_split(m, K, y, train_idx)
        X_test, y_test = _safe_split(m, K, y, test_idx, train_idx)

        m.fit(X_fit, y_fit)

        p = m.predict(X_test)
        assert_cindex_almost_equal(y_test['cens'], y_test['time'], p,
                                   (0.6518928901200369, 8472, 4524, 0, 3))
Example #31
def _fit_and_predict(estimator, X, y, train, test, predict_proba):
    """
    Fit the estimator on the train samples and predict on the test samples.
    Args:
        estimator(object): sklearn object
        X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
        y (pd.Series): Observed outcome of size (num_subjects,).
        train: Indices of the training samples.
        test: Indices of the test samples.
        predict_proba (bool): If True, the treatment model is a classifier
                                and use 'predict_proba',
                              If False, use 'predict'.

    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, _ = _safe_split(estimator, X, y, test, train)
    estimator.fit(X_train, y_train)
    if predict_proba:
        pred = estimator.predict_proba(X_test)[:, 1]
    else:
        pred = estimator.predict(X_test)

    return pd.Series(pred, index=X_test.index), estimator
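A hedged usage sketch with pandas inputs to match the docstring's types; the estimator, data, and split are illustrative.

# Out-of-fold probability predictions for the test indices.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(10, 3))
y = pd.Series(np.arange(10) % 2)
train, test = np.arange(7), np.arange(7, 10)

pred, fitted = _fit_and_predict(LogisticRegression(), X, y, train, test,
                                predict_proba=True)
print(pred)  # pd.Series indexed like the test rows of X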
Example #32
def cross_val_train_predict(estimator,
                            x,
                            y,
                            predict_method: str = "predict",
                            cv: int = 5):
    """ Return fit estimators and predictions of each (Stratified) fold. """
    from sklearn.base import clone, is_classifier
    from sklearn.model_selection._split import check_cv
    from sklearn.utils.metaestimators import _safe_split
    import numpy as np

    splitter = check_cv(cv, y, classifier=is_classifier(estimator))

    estimators = []
    predictions = None
    for train, test in splitter.split(x, y):
        x_train, y_train = _safe_split(estimator, x, y, train)
        x_test, _ = _safe_split(estimator, x, y, test, train)

        fold_estimator = clone(estimator)
        fold_predict = getattr(fold_estimator, predict_method)

        fold_estimator.fit(x_train, y_train)
        estimators.append(fold_estimator)
        fold_prediction = fold_predict(x_test)

        if predictions is None:
            if fold_prediction.ndim == 2:
                predictions = np.empty(shape=(len(y),
                                              fold_prediction.shape[1]))
            else:
                predictions = np.empty(shape=(len(y), ))

        predictions[test] = fold_prediction

    return predictions, estimators
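A short illustrative call of the helper above; any scikit-learn classifier works, and the dataset is illustrative.

# Collect out-of-fold class probabilities plus the per-fold models.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
preds, models = cross_val_train_predict(
    LogisticRegression(max_iter=1000), X, y,
    predict_method="predict_proba", cv=5)
print(preds.shape, len(models))  # (150, 3) 5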
Example #33
File: helpers.py Project: krprls/lale
def split_with_schemas(estimator, all_X, all_y, indices, train_indices=None):
    subset_X, subset_y = _safe_split(
        estimator, all_X, all_y, indices, train_indices)
    if hasattr(all_X, 'json_schema'):
        n_rows = subset_X.shape[0]
        schema = {
            'type': 'array', 'minItems': n_rows, 'maxItems': n_rows,
            'items': all_X.json_schema['items']}
        lale.datasets.data_schemas.add_schema(subset_X, schema)
    if hasattr(all_y, 'json_schema'):
        n_rows = subset_y.shape[0]
        schema = {
            'type': 'array', 'minItems': n_rows, 'maxItems': n_rows,
            'items': all_y.json_schema['items']}
        lale.datasets.data_schemas.add_schema(subset_y, schema)
    return subset_X, subset_y
Example #34
def _worker(estimator_, i, X, y, train, test):
    """Implement the worker resubmission in case of errors."""
    # custom_name = "{}_p_{}_i_{}".format(
    #     ("permutation" if is_permutation_test else "regular"), RANK, i)
    # tmp_name_base = 'tmp_' + custom_name

    worker_logger = logging.getLogger('worker')

    experiment_resubmissions = 0
    experiment_completed = False

    worker_logger.info("{}{} executing job {}".format(NAME, RANK, i))

    while not experiment_completed and \
            experiment_resubmissions <= MAX_RESUBMISSIONS:
        try:

            if experiment_resubmissions > 0:
                worker_logger.warning("{}{} resubmitting experiment {}".format(NAME, RANK, i))

            # tmp_name = tmp_name_base + '_submission_{}'.format(
            #     experiment_resubmissions + 1)
            # run_experiment(data, labels, None, config,
            #                is_permutation_test, experiments_folder_path,
            #                tmp_name)
            # TODO necessary?
            estimator = clone(estimator_.estimator)

            # need to get the deepest estimator to use _safe_split
            estimator__ = clone(estimator)
            while hasattr(estimator__, 'estimator'):
                estimator__ = clone(estimator__.estimator)

            X_train, y_train = _safe_split(estimator__, X, y, train)
            X_test, y_test = _safe_split(estimator__, X, y, test, train)

            if estimator_.shuffle_y:
                random_state = check_random_state(estimator_.random_state)
                y_train = _shuffle(y_train, estimator_.groups, random_state)

            worker_logger.info("{}{} fitting experiment {} - starting".format(NAME, RANK, i))
            estimator.fit(X_train, y_train)
            worker_logger.info("{}{} fitting experiment {} - completed".format(NAME, RANK, i))

            worker_logger.debug("{}{} scoring experiment {} - starting".format(NAME, RANK, i))
            yts_pred = estimator.predict(X_test)
            ytr_pred = estimator.predict(X_train)
            lr_score = estimator_.scorer_(estimator, X_train, y_train)
            ts_score = estimator_.scorer_(estimator, X_test, y_test)
            worker_logger.debug("{}{} scoring experiment {} - complete".format(NAME, RANK, i))

            if hasattr(estimator, 'cv_results_'):
                # In case in which the estimator is a CV object
                cv_results = estimator.cv_results_
            else:
                cv_results = None

            cv_results_ = {
                'split_i': i,
                'learn_score': lr_score,
                'test_score': ts_score,
                'cv_results_': cv_results,
                'ytr_pred': ytr_pred,
                'yts_pred': yts_pred,
                'test_index': test,
                'train_index': train,
                'estimator': estimator
            }

            experiment_completed = True

            # ### Dump partial results
            if estimator_.experiments_folder is not None:
                worker_logger.debug("{}{} saving results for experiment {}".format(NAME, RANK, i))
                pkl_name = (
                    'permutation' if estimator_.shuffle_y else 'regular') + \
                    '_%d.pkl' % i

                pkl.dump(cv_results_, gzip.open(os.path.join(
                    estimator_.experiments_folder, pkl_name), 'wb'))

        except Exception as error:
            # If somethings out of the ordinary happens,
            # resubmit the job
            experiment_resubmissions += 1
            warnings.warn(
                "[{}_{}] failed experiment {}, resubmission #{}\n"
                "Exception raised: {}".format(
                    NAME, RANK, i, experiment_resubmissions, error))

    if not experiment_completed:
        warnings.warn(
            "[{}_{}] failed to complete experiment {}, "
            "max resubmissions limit reached".format(NAME, RANK, i))
        return {}
    else:
        if not IS_MPI_JOB and estimator_.verbose:

            worker_logger.info("[{}{}]: {} job {} completed".format(NAME, RANK, ('permutation' if estimator_.shuffle_y else 'regular'), i))

        return cv_results_
Example #35
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        msg += ", score=".join(('%f' % ts for ts in test_score))
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret