Example #1
def test_check_fit_params(indices):
    X = np.random.randn(4, 2)
    fit_params = {
        'list': [1, 2, 3, 4],
        'array': np.array([1, 2, 3, 4]),
        'sparse-col': sp.csc_matrix([1, 2, 3, 4]).T,
        'sparse-row': sp.csc_matrix([1, 2, 3, 4]),
        'scalar-int': 1,
        'scalar-str': 'xxx',
        'None': None,
    }
    result = _check_fit_params(X, fit_params, indices)
    indices_ = indices if indices is not None else list(range(X.shape[0]))

    for key in ['sparse-row', 'scalar-int', 'scalar-str', 'None']:
        assert result[key] is fit_params[key]

    assert result['list'] == _safe_indexing(fit_params['list'], indices_)
    assert_array_equal(
        result['array'], _safe_indexing(fit_params['array'], indices_)
    )
    assert_allclose_dense_sparse(
        result['sparse-col'],
        _safe_indexing(fit_params['sparse-col'], indices_)
    )
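
A minimal sketch (not from the scraped sources) of the behaviour the test above asserts, assuming a scikit-learn version that still exposes the private helper as sklearn.utils.validation._check_fit_params (later releases renamed it _check_method_params): fit params whose first dimension matches X are subset to the given indices, while scalars, None and row-wise values pass through unchanged.

import numpy as np
from sklearn.utils.validation import _check_fit_params

X = np.random.randn(4, 2)
fit_params = {"sample_weight": np.array([0.1, 0.2, 0.3, 0.4]), "verbose": True}

# Keep only rows 0 and 2 of the indexable params; the scalar is left alone.
subset = _check_fit_params(X, fit_params, indices=[0, 2])
print(subset["sample_weight"])  # [0.1 0.3]
print(subset["verbose"])        # True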
Example #2
    def fit(self, X, Y, sample_weight=None, **fit_params):
        """Fit the model to data matrix X and targets Y.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Input data.

        Y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like of shape (n_samples,) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying classifier supports sample
            weights.

        **fit_params : dict of string -> object
            Parameters passed to the ``estimator.fit`` method of each estimator.

        Returns
        -------
        self : object
        """
        self._validate_estimators()

        for _, est in self.estimators:
            if not hasattr(est, 'fit'):
                raise AttributeError(
                    'Every base estimator should implement a fit method.')

        X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)

        if is_classifier(self):
            check_classification_targets(Y)

        if Y.ndim == 1:
            raise ValueError(
                'Output Y must have at least two dimensions for '
                'multi-output classification but has only one.'
            )

        if sample_weight is not None and any([
                not has_fit_parameter(clf, 'sample_weight')
                for _, clf in self.estimators
        ]):
            raise ValueError(
                'One of base estimators does not support sample weights.')

        fit_params_validated = _check_fit_params(X, fit_params)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(clf, X, Y[:, i], sample_weight,
                                    **fit_params_validated)
            for i, (_, clf) in zip(range(Y.shape[1]), self.estimators))

        self.classes_ = [est.classes_ for est in self.estimators_]

        return self
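
The fit() above follows the same pattern as scikit-learn's stock multi-output meta-estimators: fit params are validated once with _check_fit_params and then forwarded to every per-output estimator. A hedged usage sketch with MultiOutputClassifier (standing in for whatever class the snippet belongs to):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

X = np.random.randn(20, 3)
Y = np.column_stack([np.arange(20) % 2, (np.arange(20) // 2) % 2])  # two binary outputs
w = np.random.rand(20)                                              # per-sample weights

clf = MultiOutputClassifier(LogisticRegression())
clf.fit(X, Y, sample_weight=w)  # forwarded to each underlying LogisticRegression
print([est.classes_ for est in clf.estimators_])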
Example #3
    def _base_est_fit_predict(self, X, y, **fit_params):
        """Fit the base estimators on CV training folds, and return their
        out-of-sample predictions on the test folds as features for the
        meta-estimator. Also return the fit_params for the meta-estimator.
        """
        y = y.squeeze()
        # Construct CV iterator
        cv = self._check_cv(y=y)
        # Extract CV indices since we need them twice, and un-seeded CV
        # generators with `shuffle=True` split differently each time.
        train_inds = []
        test_inds = []
        for train, test in cv.split(X, y):
            train_inds.append(train)
            test_inds.append(test)

        fit_params_ests = self._extract_fit_params(**fit_params)
        _fit_predict = self._get_fit_predict_function()

        _jobs = []

        # Loop over CV folds to get out-of-sample predictions, which become the
        # features for the meta-estimator.
        for train, test in zip(train_inds, test_inds):
            for name, est in self.estimator_list[:-1]:
                # adapted from sklearn.model_selection._fit_and_predict
                # Adjust length of sample weights
                fit_params_est_adjusted = _check_fit_params(
                    X, fit_params_ests[name], train)

                # Fit estimator on training set and score out-of-sample
                _jobs.append(
                    delayed(_fit_predict)(clone(est), X[train], y[train],
                                          X[test], **fit_params_est_adjusted))

        _out = Parallel(n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch)(_jobs)

        # Extract the results from joblib
        Xmeta, ymeta = None, None
        for test in test_inds:
            ybase = np.empty((y[test].shape[0], 0))
            for name, est in self.estimator_list[:-1]:
                # Build design matrix out of out-of-sample predictions
                ybase = np.hstack((ybase, _out.pop(0)))

            # Append the test outputs to what will eventually be the features
            # for the meta-estimator.
            if Xmeta is not None:
                ymeta = np.concatenate((ymeta, y[test]))
                Xmeta = np.vstack((Xmeta, ybase))
            else:
                Xmeta = ybase
                ymeta = y[test]

        return Xmeta, ymeta, fit_params_ests[self.meta_estimator_name]
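
Stripped of the meta-estimator machinery, the per-fold pattern above (and in the _fit_and_score variants further down) is simply: cut the fit params down to the training indices before fitting on that fold. An illustrative sketch with stand-in estimator and data:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.utils.validation import _check_fit_params

X, y = np.random.randn(20, 3), np.random.randn(20)
fit_params = {"sample_weight": np.random.rand(20)}

for train, test in KFold(n_splits=4).split(X):
    # sample_weight is reduced to len(train) entries for this fold
    fold_params = _check_fit_params(X, fit_params, train)
    est = Ridge().fit(X[train], y[train], **fold_params)
    print(round(est.score(X[test], y[test]), 3))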
Example #4
def test_check_fit_params(indices):
    X = np.random.randn(4, 2)
    fit_params = {
        "list": [1, 2, 3, 4],
        "array": np.array([1, 2, 3, 4]),
        "sparse-col": sp.csc_matrix([1, 2, 3, 4]).T,
        "sparse-row": sp.csc_matrix([1, 2, 3, 4]),
        "scalar-int": 1,
        "scalar-str": "xxx",
        "None": None,
    }
    result = _check_fit_params(X, fit_params, indices)
    indices_ = indices if indices is not None else list(range(X.shape[0]))

    for key in ["sparse-row", "scalar-int", "scalar-str", "None"]:
        assert result[key] is fit_params[key]

    assert result["list"] == _safe_indexing(fit_params["list"], indices_)
    assert_array_equal(result["array"], _safe_indexing(fit_params["array"], indices_))
    assert_allclose_dense_sparse(
        result["sparse-col"], _safe_indexing(fit_params["sparse-col"], indices_)
    )
Example #5
def fit_and_score_te_oracle(estimator,
                            X,
                            y,
                            w,
                            p,
                            t,
                            scorer,
                            train,
                            test,
                            parameters=None,
                            fit_params=None,
                            return_train_score=False,
                            return_parameters=False,
                            return_times=False,
                            return_estimator=False,
                            error_score=np.nan,
                            return_test_score_only=False):
    """Fit estimator and compute scores for a given dataset split, using oracle knowledge of
    treatment effects. Based on sklearn.model_selection._validation _fit_and_score, adapted to
    allow more inputs (treatments and treatment effects)

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape (n_samples, n_features)
        The features to fit to.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The outcome variable.
    w : array-like of shape (n_samples,)
        The treatment indicator.
    p : array-like of shape (n_samples,)
        The treatment propensity.
    t : array-like of shape (n_samples,)
        The true treatment effect to evaluate against.
    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.
        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.
        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.
    train : array-like of shape (n_train_samples,)
        Indices of training samples.
    test : array-like of shape (n_test_samples,)
        Indices of test samples.
    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    return_train_score : bool, default=False
        Compute and return score on training set.
    return_parameters : bool, default=False
        Return the parameters that have been used for the estimator.
    return_times : bool, default=False
        Whether to return the fit/score times.
    return_estimator : bool, default=False
        Whether to return the fitted estimator.
    return_test_score_only : bool, default=False
        Whether to return only the test score.

    Returns
    -------
    train_scores : dict of scorer name -> float
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.
    test_scores : float or dict of scorer name -> float
        If `return_test_score_only` is True and `scorer` is a string, only the test score
        is returned. Otherwise, scores on the testing set (for all the scorers).
    n_test_samples : int
        Number of test samples.
    fit_time : float
        Time spent for fitting in seconds.
    score_time : float
        Time spent for scoring in seconds.
    parameters : dict or None
        The parameters that have been evaluated.
    estimator : estimator object
        The fitted estimator
    """
    if not isinstance(estimator, BaseTEModel):
        raise ValueError("This method works only for BaseTEModel")

    scorers, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # Adjust length of sample weights (if any)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    train_scores = {}
    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    X_train, y_train, w_train, p_train, t_train = _safe_split_te(
        X, y, w, p, t, train)
    X_test, y_test, w_test, p_test, t_test = _safe_split_te(
        X, y, w, p, t, test)

    try:
        estimator.fit(X_train, y_train, w_train, p_train, **fit_params)

    except Exception as e:
        if return_test_score_only:
            if error_score == 'raise':
                raise
            else:
                return np.nan
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time

        try:
            test_scores = _score(estimator, X_test, t_test, scorers)
        except Exception:
            if return_test_score_only:
                if error_score == 'raise':
                    raise
                else:
                    return np.nan
            # Re-raise when full results are requested so that test_scores is
            # never left undefined below.
            raise

        score_time = time.time() - start_time - fit_time

        if return_test_score_only:
            if type(scorer) == str:
                return test_scores['score']
            else:
                return test_scores

        if return_train_score:
            train_scores = _score(estimator, X_train, t_train, scorers)

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    return ret
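
Throughout these examples a "scorer" is a callable with signature scorer(estimator, X, y), as the docstring above states. A hedged sketch using the public scikit-learn helpers (the snippet itself goes through the private _check_multimetric_scoring, whose return value changed across versions):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer

X, y = make_classification(n_samples=40, random_state=0)
est = LogisticRegression(max_iter=1000).fit(X, y)

scorer = get_scorer("accuracy")  # callable with signature scorer(estimator, X, y)
print(scorer(est, X, y))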
Example #6
    def fit(self, X, y, groups=None, **fit_params):
        # sklearn prep
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        refit_metric = "score"

        if callable(self.scoring):
            scorers = self.scoring
        elif self.scoring is None or isinstance(self.scoring, str):
            scorers = check_scoring(self.estimator, self.scoring)
        else:
            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
            # sklearn < 0.24.0 compatibility
            if isinstance(scorers, tuple):
                scorers = scorers[0]

            self._check_refit_for_multimetric(scorers)
            refit_metric = self.refit

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)
        n_splits = cv.get_n_splits(X, y, groups)
        base_estimator = clone(self.estimator)
        rng = check_random_state(self.random_state)
        np.random.set_state(rng.get_state(legacy=True))
        np_random_seed = rng.get_state(legacy=True)[1][0]

        n_jobs, actual_iterations = self._calculate_n_jobs_and_actual_iters()

        # A nameserver is required (default port 9090); BOHB workers communicate through it, even locally.
        run_id = f"HpBandSterSearchCV_{time.time()}"
        _nameserver = hpns.NameServer(run_id=run_id,
                                      host=self.nameserver_host,
                                      port=self.nameserver_port)

        gc.collect()

        if self.verbose > 1:
            _logger.setLevel(logging.DEBUG)
        elif self.verbose > 0:
            _logger.setLevel(logging.INFO)
        else:
            _logger.setLevel(logging.ERROR)

        if "logger" in self.bohb_kwargs:
            self.bohb_kwargs.pop("logger")

        with NameServerContext(_nameserver):
            workers = []
            # each worker is a separate thread
            for i in range(n_jobs):
                # SklearnWorker clones the estimator
                w = SklearnWorker(
                    min_budget=self.min_budget,
                    max_budget=self.max_budget,
                    base_estimator=self.estimator,
                    X=X,
                    y=y,
                    cv=cv,
                    cv_n_splits=n_splits,
                    groups=groups,
                    scoring=scorers,
                    metric=refit_metric,
                    fit_params=fit_params,
                    nameserver=self.nameserver_host,
                    nameserver_port=self.nameserver_port,
                    run_id=run_id,
                    id=i,
                    return_train_score=self.return_train_score,
                    error_score=self.error_score,
                    resource_name=self.resource_name,
                    resource_type=self.resource_type,
                    random_state=rng,
                    logger=_logger,
                )
                w.run(background=True)
                workers.append(w)

            converted_min_budget = float(workers[0].min_budget)
            converted_max_budget = float(workers[0].max_budget)
            self.resource_name_ = workers[0].resource_name

            if (self.resource_name_
                    in self.param_distributions.get_hyperparameter_names()):
                _logger.warning(
                    f"Found hyperparameter with name '{self.resource_name_}', same as resource_name_. Removing it from ConfigurationSpace."
                )
                param_distributions = CS.ConfigurationSpace(
                    name=self.param_distributions.name,
                    meta=self.param_distributions.meta,
                )
                param_distributions.add_hyperparameters([
                    x for x in self.param_distributions.get_hyperparameters()
                    if x.name != self.resource_name_
                ])
            else:
                param_distributions = deepcopy(self.param_distributions)
            param_distributions.seed = np_random_seed

            # sleep for a moment to make sure all workers are initialized
            sleep(0.2)

            # BOHB by default
            if isinstance(self.optimizer, str):
                optimizer = self._optimizer_dict[self.optimizer.lower()](
                    configspace=param_distributions,
                    run_id=run_id,
                    min_budget=converted_min_budget,
                    max_budget=converted_max_budget,
                    logger=_logger,
                    **self.bohb_kwargs,
                )
            else:
                optimizer = self.optimizer(
                    configspace=param_distributions,
                    run_id=run_id,
                    min_budget=converted_min_budget,
                    max_budget=converted_max_budget,
                    logger=_logger,
                    **self.bohb_kwargs,
                )
            with OptimizerContext(
                    optimizer,
                    n_iterations=actual_iterations,
            ) as res:
                self._res = res

        id2config = self._res.get_id2config_mapping()
        incumbent = self._res.get_incumbent_id()
        runs_all = self._res.get_all_runs()
        self.best_params_ = id2config[incumbent]["config"]

        resource_type = workers[0].resource_type
        self.n_resources_ = [resource_type(x) for x in optimizer.budgets]
        self.min_resources_ = self.n_resources_[0]
        self.max_resources_ = self.n_resources_[-1]

        results, new_refit_metric = self._runs_to_results(
            runs_all, id2config, scorers, n_splits, self.n_resources_)

        if new_refit_metric is not None:
            refit_metric = new_refit_metric

        iter_counter = sorted(Counter(results["iter"]).items())
        self.n_candidates_ = [x[1] for x in iter_counter]
        self.n_remaining_candidates_ = iter_counter[-1][1]
        self.n_iterations_ = iter_counter[-1][0] + 1

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError("best_index_ returned is not an integer")
                if self.best_index_ < 0 or self.best_index_ >= len(
                        results["params"]):
                    raise IndexError("best_index_ index out of range")
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        _logger.info(
            f"\nBest {refit_metric}: {self._res.get_runs_by_id(incumbent)[-1].info['test_score_mean']}"
        )
        _logger.info(f"Best found configuration: {self.best_params_}")
        _logger.info(
            f"A total of {len(id2config.keys())} unique configurations were sampled."
        )
        _logger.info(f"A total of {len(runs_all)} runs were executed.")
        _logger.info(
            f"Total budget of resource '{self.resource_name_}' corresponds to {sum([r.budget for r in runs_all]) / converted_max_budget} full function evaluations."
        )

        gc.collect()

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            refit_params = self.best_params_.copy()
            if self.resource_name_ != "n_samples":
                refit_params[self.resource_name_] = self.max_resources_
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**refit_params))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
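
For context, the param_distributions consumed by the fit() above is a ConfigSpace ConfigurationSpace: the object that gets filtered for the resource name and re-seeded with np_random_seed. A hedged sketch of building one, with illustrative hyperparameter names:

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

param_distributions = CS.ConfigurationSpace(name="sgd-space")
param_distributions.add_hyperparameters([
    CSH.UniformFloatHyperparameter("alpha", lower=1e-5, upper=1e-1, log=True),
    CSH.CategoricalHyperparameter("penalty", choices=["l2", "l1", "elasticnet"]),
])
print(param_distributions.get_hyperparameter_names())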
Example #7
def _fit_and_score_open_set(estimator,
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            scorer,
                            verbose,
                            parameters,
                            fit_params,
                            return_train_score=False,
                            return_parameters=False,
                            return_n_test_samples=False,
                            return_times=False,
                            return_estimator=False,
                            error_score=np.nan):
    """
    Adapts the method sklearn.model_selection._validation._fit_and_score to the
    open-set setting, where some labels correspond to unknown camera models.

    Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X_train : array-like of shape (n_samples, n_features)
        The data to fit.

    y_train : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The target variable to try to predict in the case of supervised learning.

    X_test : array-like of shape (n_samples, n_features)
        The test data

    y_test : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The test labels

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    verbose : int
        The verbosity level.

    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : bool, default=False
        Compute and return score on training set.

    return_parameters : bool, default=False
        Return the parameters that have been used for the estimator.

    return_n_test_samples : bool, default=False
        Whether to return the ``n_test_samples``

    return_times : bool, default=False
        Whether to return the fit/score times.

    return_estimator : bool, default=False
        Whether to return the fitted estimator.

    Returns
    -------
    train_scores : dict of scorer name -> float
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None
        The parameters that have been evaluated.

    estimator : estimator object
        The fitted estimator
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X_train, fit_params)

    train_scores = {}
    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_scores = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer)
    if verbose > 2:
        if isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                msg += ", %s=" % scorer_name
                if return_train_score:
                    msg += "(train=%.3f," % train_scores[scorer_name]
                    msg += " test=%.3f)" % test_scores[scorer_name]
                else:
                    msg += "%.3f" % test_scores[scorer_name]
        else:
            msg += ", score="
            msg += ("%.3f" % test_scores if not return_train_score else
                    "(train=%.3f, test=%.3f)" % (train_scores, test_scores))

    if verbose > 1:
        total_time = score_time + fit_time
        print(_message_with_time('CV', msg, total_time))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    return ret
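
Note that the open-set variant above calls _check_fit_params without indices, because X_train is already the training fold: the params are only validated and made indexable, not subset. A tiny illustration:

import numpy as np
from sklearn.utils.validation import _check_fit_params

X_train = np.random.randn(5, 2)
out = _check_fit_params(X_train, {"sample_weight": [1, 1, 2, 2, 3]})
print(out["sample_weight"])  # full length, unchanged (no indices given)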
Example #8
def fp_fit_and_score(estimator,
                     X_original,
                     y_original,
                     X_fingerprinted,
                     y_fingerprinted,
                     scorer,
                     train_original,
                     test_original,
                     train_fingerprinted,
                     test_fingerprinted,
                     verbose,
                     parameters,
                     fit_params,
                     return_train_score=False,
                     return_parameters=False,
                     return_n_test_samples=False,
                     return_times=False,
                     return_estimator=False,
                     split_progress=None,
                     candidate_progress=None,
                     error_score=np.nan):
    if not isinstance(error_score, numbers.Number) and error_score != 'raise':
        raise ValueError(
            "error_score must be the string 'raise' or a numeric value. "
            "(Hint: if using 'raise', please make sure that it has been "
            "spelled correctly.)")

    progress_msg = ""
    if verbose > 2:
        if split_progress is not None:
            progress_msg = f" {split_progress[0]+1}/{split_progress[1]}"
        if candidate_progress and verbose > 9:
            progress_msg += (f"; {candidate_progress[0]+1}/"
                             f"{candidate_progress[1]}")

    if verbose > 1:
        if parameters is None:
            params_msg = ''
        else:
            sorted_keys = sorted(parameters)  # Ensure deterministic o/p
            params_msg = (', '.join(f'{k}={parameters[k]}'
                                    for k in sorted_keys))
    if verbose > 9:
        start_msg = f"[CV{progress_msg}] START {params_msg}"
        print(f"{start_msg}{(80 - len(start_msg)) * '.'}")

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X_fingerprinted, fit_params,
                                   train_fingerprinted)

    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    # Split the fingerprinted data in exactly the same way as the original data.
    # The original training fold and the fingerprinted test fold remain unused.
    X_train_original, y_train_original = _safe_split(estimator, X_original,
                                                     y_original,
                                                     train_original)
    X_test_original, y_test_original = _safe_split(estimator, X_original,
                                                   y_original, test_original,
                                                   train_original)

    X_train_fingerprinted, y_train_fingerprinted = _safe_split(
        estimator, X_fingerprinted, y_fingerprinted, train_fingerprinted)
    X_test_fingerprinted, y_test_fingerprinted = _safe_split(
        estimator, X_fingerprinted, y_fingerprinted, test_fingerprinted,
        train_fingerprinted)

    result = {}
    # fit the model on FINGERPRINTED data
    try:
        if y_train_fingerprinted is None:
            estimator.fit(X_train_fingerprinted, **fit_params)
        else:
            estimator.fit(X_train_fingerprinted, y_train_fingerprinted,
                          **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        result["fit_failed"] = True
    else:
        result["fit_failed"] = False

        fit_time = time.time() - start_time
        # obtain test scores from testing ORIGINAL test data against ORIGINAL target
        test_scores = _score(estimator, X_test_original, y_test_original,
                             scorer, error_score)
        # VERIFICATION PRINTOUTS
        # print(len(X_train_original.index))
        # print(len(X_train_fingerprinted.index))
        # print(type(X_test_original.index))
        # print(X_train_original.index)
        # print(X_train_fingerprinted.index)
        # print(X_train_original.index.equals(X_train_fingerprinted.index))
        # print(X_train_original.columns[1])
        # print(X_train_original[X_train_original.columns[1]].compare
        #       (X_train_fingerprinted[X_train_fingerprinted.columns[1]]))
        # print('----------------')
        # print(X_test_original[X_test_original.columns[1]].compare(X_test_fingerprinted[X_test_fingerprinted.columns[1]]))
        # print('________________')
        # print('Target should look the same')
        # print(y_train_fingerprinted.compare(y_train_original))
        # print('________________')
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            # train scores are based on FINGERPRINTED data
            train_scores = _score(estimator, X_train_fingerprinted,
                                  y_train_fingerprinted, scorer, error_score)

    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = f"[CV{progress_msg}] END "
        result_msg = params_msg + (";" if params_msg else "")
        if verbose > 2 and isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                result_msg += f" {scorer_name}: ("
                if return_train_score:
                    scorer_scores = train_scores[scorer_name]
                    result_msg += f"train={scorer_scores:.3f}, "
                result_msg += f"test={test_scores[scorer_name]:.3f})"
        result_msg += f" total time={logger.short_format_time(total_time)}"

        # Right align the result_msg
        end_msg += "." * (80 - len(end_msg) - len(result_msg))
        end_msg += result_msg
        print(end_msg)

    result["test_scores"] = test_scores
    if return_train_score:
        result["train_scores"] = train_scores
    if return_n_test_samples:
        result["n_test_samples"] = _num_samples(X_test_original)
    if return_times:
        result["fit_time"] = fit_time
        result["score_time"] = score_time
    if return_parameters:
        result["parameters"] = parameters
    if return_estimator:
        result["estimator"] = estimator
    return result
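
The _safe_split calls above come from sklearn.utils.metaestimators: they index X and y by the given fold indices, with the training indices passed as well so that precomputed kernel matrices are sliced correctly. A minimal sketch, independent of the fingerprinting logic:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.metaestimators import _safe_split

X = np.random.randn(10, 4)
y = np.array([0, 1] * 5)
train, test = np.arange(7), np.arange(7, 10)

est = LogisticRegression()
X_train, y_train = _safe_split(est, X, y, train)
X_test, y_test = _safe_split(est, X, y, test, train)  # train indices for kernel slicing
print(est.fit(X_train, y_train).score(X_test, y_test))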
Example #9
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   return_n_test_samples=False,
                   return_times=False,
                   return_estimator=False,
                   split_progress=None,
                   candidate_progress=None,
                   error_score=np.nan,
                   online_train_val_split=False):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape (n_samples, n_features)
        The data to fit.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The target variable to try to predict in the case of
        supervised learning.
    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.
        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.
        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.
    train : array-like of shape (n_train_samples,)
        Indices of training samples.
    test : array-like of shape (n_test_samples,)
        Indices of test samples.
    verbose : int
        The verbosity level.
    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    return_train_score : bool, default=False
        Compute and return score on training set.
    return_parameters : bool, default=False
        Return the parameters that have been used for the estimator.
    split_progress : {list, tuple} of int, default=None
        A list or tuple of format (<current_split_id>, <total_num_of_splits>).
    candidate_progress : {list, tuple} of int, default=None
        A list or tuple of format
        (<current_candidate_id>, <total_number_of_candidates>).
    return_n_test_samples : bool, default=False
        Whether to return the ``n_test_samples``.
    return_times : bool, default=False
        Whether to return the fit/score times.
    return_estimator : bool, default=False
        Whether to return the fitted estimator.
    Returns
    -------
    result : dict with the following attributes
        train_scores : dict of scorer name -> float
            Score on training set (for all the scorers),
            returned only if `return_train_score` is `True`.
        test_scores : dict of scorer name -> float
            Score on testing set (for all the scorers).
        n_test_samples : int
            Number of test samples.
        fit_time : float
            Time spent for fitting in seconds.
        score_time : float
            Time spent for scoring in seconds.
        parameters : dict or None
            The parameters that have been evaluated.
        estimator : estimator object
            The fitted estimator.
        fit_failed : bool
            The estimator failed to fit.
    """
    if not isinstance(error_score, numbers.Number) and error_score != 'raise':
        raise ValueError(
            "error_score must be the string 'raise' or a numeric value. "
            "(Hint: if using 'raise', please make sure that it has been "
            "spelled correctly.)")

    progress_msg = ""
    if verbose > 2:
        if split_progress is not None:
            progress_msg = f" {split_progress[0]+1}/{split_progress[1]}"
        if candidate_progress and verbose > 9:
            progress_msg += (f"; {candidate_progress[0]+1}/"
                             f"{candidate_progress[1]}")

    if verbose > 1:
        if parameters is None:
            params_msg = ''
        else:
            sorted_keys = sorted(parameters)  # Ensure deterministic o/p
            params_msg = (', '.join(f'{k}={parameters[k]}'
                                    for k in sorted_keys))
    if verbose > 9:
        start_msg = f"[CV{progress_msg}] START {params_msg}"
        print(f"{start_msg}{(80 - len(start_msg)) * '.'}")

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    if online_train_val_split:
        # inject the train and test data into the corresponding Subset selectors
        set_train_estim = False
        set_test_estim = False
        for estim in estimator:
            if set_train_estim and set_test_estim:
                break
            if isinstance(estim, TrainSubset):
                estim.date_range = [train]
                set_train_estim = True

            if isinstance(estim, TestSubset):
                estim.date_range = [test]
                set_test_estim = True
            if isinstance(estim, CVSubset) and isinstance(
                    estim, EvalOnlyWrapper):
                estim.set_range(test)
                set_test_estim = True
            if isinstance(estim, CVSubset) and isinstance(
                    estim, TrainOnlyWrapper):
                estim.set_range(train)
                set_train_estim = True

        if not set_train_estim or not set_test_estim:
            raise ValueError(
                "When online learning is specified, a KeepTrain and a KeepTest "
                "step have to be present in the pipeline."
            )
    else:
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)

    result = {}
    try:
        if online_train_val_split:
            estimator = estimator.train()
            estimator.fit(X, y, **fit_params)
        else:
            if y_train is None:
                estimator.fit(X_train, **fit_params)
            else:
                estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        result["fit_failed"] = True
        y_sample_len = len(test)
    else:
        result["fit_failed"] = False

        fit_time = time.time() - start_time

        estimator.eval()
        if online_train_val_split:
            # select estimator without the classifier and transform x and y
            # to retrieve y_test
            _, y_prime = estimator[:-1].transform(X, y)
            if isinstance(y_prime, pd.DataFrame) and len(y_prime.columns) == 1:
                y_prime = y_prime.T.values.squeeze()

            y_sample_len = len(y_prime)
            test_scores = _score(estimator, X, y_prime, scorer)
        else:
            test_scores = _score(estimator, X_test, y_test, scorer,
                                 error_score)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            if online_train_val_split:
                estimator.train()

                _, y_prime = estimator[:-1].transform(X, y)
                if isinstance(y_prime, pd.DataFrame) and len(
                        y_prime.columns) == 1:
                    y_prime = y_prime.T.values.squeeze()
                train_scores = _score(estimator, X, y_prime, scorer)

                estimator.eval()
            else:
                train_scores = _score(estimator, X_train, y_train, scorer,
                                      error_score)

    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = f"[CV{progress_msg}] END "
        result_msg = params_msg + (";" if params_msg else "")
        if verbose > 2 and isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                result_msg += f" {scorer_name}: ("
                if return_train_score:
                    scorer_scores = train_scores[scorer_name]
                    result_msg += f"train={scorer_scores:.3f}, "
                result_msg += f"test={test_scores[scorer_name]:.3f})"
        result_msg += f" total time={logger.short_format_time(total_time)}"

        # Right align the result_msg
        end_msg += "." * (80 - len(end_msg) - len(result_msg))
        end_msg += result_msg
        print(end_msg)

    result["test_scores"] = test_scores
    if return_train_score:
        result["train_scores"] = train_scores
    if return_n_test_samples:
        if online_train_val_split:
            result["n_test_samples"] = y_sample_len
        else:
            result["n_test_samples"] = _num_samples(X_test)
    if return_times:
        result["fit_time"] = fit_time
        result["score_time"] = score_time
    if return_parameters:
        result["parameters"] = parameters
    if return_estimator:
        result["estimator"] = estimator
    return result
Example #10
def _fit_and_predict(estimator, X, y, sample_weight, sample_weight_steps, train, test, verbose, fit_params,
                     method):
    """Fit estimator and predict values for a given dataset split.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    sample_weight : array-like of shape (n_samples,), optional
        Sample weights; subset to the training indices before fitting.

    sample_weight_steps : list of str, optional
        Names of pipeline steps whose ``fit`` should receive the sample weights
        (passed as ``<step>__sample_weight``). If None, ``sample_weight`` is
        passed to the estimator directly.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    method : string
        Invokes the passed method name of the passed estimator.

    Returns
    -------
    predictions : sequence
        Result of calling 'estimator.method'

    test : array-like
        This is the value of the test parameter
    """
    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    X_train, y_train, sample_weight_train = weighted_safe_split(estimator, X, y, sample_weight, train)
    X_test, _, _ = weighted_safe_split(estimator, X, y, sample_weight, test, train)

    if y_train is None and sample_weight_train is None:
        estimator.fit(X_train, **fit_params)
    elif sample_weight_train is None:
        estimator.fit(X_train, y_train, **fit_params)
    else:
        if sample_weight_steps is not None:
            for step in sample_weight_steps:
                fit_params[step + '__sample_weight'] = sample_weight_train
        else:
            fit_params['sample_weight'] = sample_weight_train
        estimator.fit(X_train, y_train, **fit_params)
    func = getattr(estimator, method)
    predictions = func(X_test)
    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        n_classes = len(set(y))
        if n_classes != len(estimator.classes_):
            recommendation = (
                'To fix this, use a cross-validation '
                'technique resulting in properly '
                'stratified folds')
            warnings.warn('Number of classes in training fold ({}) does '
                          'not match total number of classes ({}). '
                          'Results may not be appropriate for your use case. '
                          '{}'.format(len(estimator.classes_),
                                      n_classes, recommendation),
                          RuntimeWarning)
            if method == 'decision_function':
                if (predictions.ndim == 2 and
                        predictions.shape[1] != len(estimator.classes_)):
                    # This handles the case when the shape of predictions
                    # does not match the number of classes used to train
                    # it with. This case is found when sklearn.svm.SVC is
                    # set to `decision_function_shape='ovo'`.
                    raise ValueError('Output shape {} of {} does not match '
                                     'number of classes ({}) in fold. '
                                     'Irregular decision_function outputs '
                                     'are not currently supported by '
                                     'cross_val_predict'.format(
                                        predictions.shape, method,
                                        len(estimator.classes_),
                                        recommendation))
                if len(estimator.classes_) <= 2:
                    # In this special case, `predictions` contains a 1D array.
                    raise ValueError('Only {} class/es in training fold, this '
                                     'is not supported for decision_function '
                                     'with imbalanced folds. {}'.format(
                                        len(estimator.classes_),
                                        recommendation))

            float_min = np.finfo(predictions.dtype).min
            default_values = {'decision_function': float_min,
                              'predict_log_proba': float_min,
                              'predict_proba': 0}
            predictions_for_all_classes = np.full((_num_samples(predictions),
                                                   n_classes),
                                                  default_values[method])
            predictions_for_all_classes[:, estimator.classes_] = predictions
            predictions = predictions_for_all_classes
    return predictions, test
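
The sample_weight_steps branch above relies on scikit-learn's step-prefixed fit-param routing: in a Pipeline (under the classic, non-metadata-routing API), a fit param named "<step>__sample_weight" is delivered to that step's fit(). A hedged sketch of that mechanism on its own:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = np.random.randn(30, 3), np.random.randn(30)
w = np.random.rand(30)

pipe = Pipeline([("scale", StandardScaler()), ("ridge", Ridge())])
pipe.fit(X, y, ridge__sample_weight=w)  # routed to Ridge.fit(..., sample_weight=w)
print(round(pipe.score(X, y), 3))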
Example #11
File: train.py Project: tjcreedy/tbaf-ml
def _fitnscore_cust(estimator,
                    X,
                    y,
                    scorer,
                    train,
                    test,
                    verbose,
                    parameters,
                    fit_params,
                    return_train_score=False,
                    return_parameters=False,
                    return_n_test_samples=False,
                    return_times=False,
                    return_n_iter=False,
                    return_cm=False,
                    return_roc=False,
                    return_prc=False,
                    return_threshc=False,
                    return_estimator=False,
                    error_score=np.nan):
    # estimator, scorer, return_train_score, return_parameters, return_n_test_samples, return_times, return_n_iter, return_cm, return_roc, return_prc, return_threshc, return_estimator, error_score, verbose = clone(base_estimator), scorers, True, True, True, True, True, True, True, True, True, True, np.nan, 5
    # parameters, (train, test) = next(product(candidate_params, cv.split(X, y, groups)))

    import numbers
    from traceback import format_exc
    from sklearn.utils.validation import _check_fit_params, _num_samples
    from sklearn.utils.metaestimators import _safe_split
    from sklearn.exceptions import FitFailedWarning
    from sklearn.model_selection._validation import _score
    from sklearn.utils import _message_with_time

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    train_scores = {}
    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=ConvergenceWarning)
            if y_train is None:
                estimator.fit(X_train, **fit_params)
            else:
                estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_scores = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer)
        if isinstance(estimator, pipeline.Pipeline):
            n_iter = estimator.steps[-1][1].n_iter_
        else:
            n_iter = estimator.n_iter_
        if return_cm:
            cm = dict(
                zip([
                    'true_negatives', 'false_positives', 'false_negatives',
                    'true_positives'
                ], [0] * 4))
            for true, pred in zip(y_test, estimator.predict(X_test)):
                # true, pred = next(zip(y_test, estimator.predict(X_test)))
                s = sum([true, pred])
                if s == 2:
                    cm['true_positives'] += 1
                elif s == 0:
                    cm['true_negatives'] += 1
                elif true == 1:
                    cm['false_negatives'] += 1
                elif pred == 1:
                    cm['false_positives'] += 1
                else:
                    raise Exception(f"Values {true} and {pred} not valid for"
                                    " computing confusion matrix")
        if return_roc or return_prc or return_threshc:
            df = estimator.decision_function(X_test)
            if return_roc:
                roc = metrics.roc_curve(y_test, df)
                auc = metrics.auc(roc[0], roc[1])  # roc_curve returns (fpr, tpr, thresholds)
            if return_threshc or return_prc:
                prc = metrics.precision_recall_curve(y_test, df)
                ap = metrics.average_precision_score(y_test, df)
            if return_roc:
                roc_out = {'fpr': roc[0], 'tpr': roc[1], 'auc': auc}
            if return_prc:
                prc_out = {'rcl': prc[1], 'pre': prc[0], 'ap': ap}
            if return_threshc:
                threshc_out = {
                    'thr': prc[2],
                    'rcl': prc[1][:-1],
                    'pre': prc[0][:-1]
                }

    if verbose > 2:
        if isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                msg += ", %s=" % scorer_name
                if return_train_score:
                    msg += "(train=%.3f," % train_scores[scorer_name]
                    msg += " test=%.3f)" % test_scores[scorer_name]
                else:
                    msg += "%.3f" % test_scores[scorer_name]
        else:
            msg += ", score="
            msg += ("%.3f" % test_scores if not return_train_score else
                    "(train=%.3f, test=%.3f)" % (train_scores, test_scores))

    if verbose > 1:
        total_time = score_time + fit_time
        print(_message_with_time('CV', msg, total_time))

    ret = {'test_scores': test_scores}
    if return_train_score:
        ret['train_scores'] = train_scores
    if return_n_test_samples:
        ret['n_test_samples'] = _num_samples(X_test)
    if return_times:
        ret['fit_time'] = fit_time
        ret['score_time'] = score_time
    if return_parameters:
        ret['parameters'] = parameters
    if return_n_iter:
        ret['n_iter'] = n_iter
    if return_estimator:
        ret['estimator'] = estimator
    if return_cm:
        ret['confusion_matrix'] = cm
    if return_roc:
        ret['roc_values'] = roc_out
    if return_prc:
        ret['prc_values'] = prc_out
    if return_threshc:
        ret['threshc_values'] = threshc_out
    return ret
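The helpers above all funnel per-sample fit parameters through `_check_fit_params` before fitting on a fold. A minimal sketch of what that slicing does, assuming the private helper is still importable from `sklearn.utils.validation` (newer scikit-learn releases rename it `_check_method_params`):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

try:
    from sklearn.utils.validation import _check_fit_params  # older releases
except ImportError:  # assumed rename in newer releases
    from sklearn.utils.validation import _check_method_params as _check_fit_params

X = np.random.randn(20, 3)
y = (X[:, 0] > 0).astype(int)
fit_params = {'sample_weight': np.ones(len(X)), 'scalar': 1}

clf = LogisticRegression()
for train, test in KFold(n_splits=4).split(X):
    # array-like params are indexed down to the training rows; scalars pass through
    fold_params = _check_fit_params(X, fit_params, train)
    assert len(fold_params['sample_weight']) == len(train)
    clf.fit(X[train], y[train], sample_weight=fold_params['sample_weight'])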
Example #12
File: train.py Project: tjcreedy/tbaf-ml
    def fit(self, X, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        
        Parameters
        ----------
        
        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        
        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.
        
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
        
        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        # self, X, y, groups, fit_params = gs, train, cls, None, {}
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed." %
                                 self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    return_n_iter=True,
                                    return_cm=True,
                                    return_roc=True,
                                    return_prc=True,
                                    return_threshc=True,
                                    return_estimator=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results = {}
        with parallel:
            all_candidate_params = []
            all_out = []

            def evaluate_candidates(candidate_params):
                # candidate_params = model_selection.ParameterGrid(self.param_grid)
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(
                    delayed(_fitnscore_cust)(clone(base_estimator),
                                             X,
                                             y,
                                             train=train,
                                             test=test,
                                             parameters=parameters,
                                             **fit_and_score_kwargs)
                    for parameters, (train, test) in product(
                        candidate_params, cv.split(X, y, groups)))

                if len(out) < 1:
                    raise ValueError('No fits were performed. '
                                     'Was the CV iterator empty? '
                                     'Were there no candidates?')
                elif len(out) != n_candidates * n_splits:
                    raise ValueError('cv.split and cv.get_n_splits returned '
                                     'inconsistent results. Expected {} '
                                     'splits, got {}'.format(
                                         n_splits,
                                         len(out) // n_candidates))

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                nonlocal results
                results = self._format_results(all_candidate_params, scorers,
                                               n_splits, all_out)
                return results

            self._run_search(evaluate_candidates)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
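The custom search classes in these examples mirror the public GridSearchCV contract, so the attributes set at the end of `fit` can be inspected the same way. For reference, a small round trip with the stock estimator:

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
gs = GridSearchCV(SVC(), {'C': [0.1, 1.0]}, cv=3).fit(X, y)
print(gs.best_params_, gs.best_score_)   # chosen C and its mean CV score
print(gs.refit_time_)                    # seconds spent refitting on all data
print(sorted(k for k in gs.cv_results_ if k.startswith('mean_test')))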
Example #13
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed." %
                                 self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        # handled in fit functions below
        # parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
        #                     pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)

        # sklearn code above
        if self.spark is None:
            fitting_function = self._run_sklearn_fit
        else:
            fitting_function = self._run_skspark_fit

        results = fitting_function(base_estimator=base_estimator,
                                   X=X,
                                   y=y,
                                   scorers=scorers,
                                   cv=cv,
                                   groups=groups,
                                   n_splits=n_splits,
                                   fit_and_score_kwargs=fit_and_score_kwargs)
        # sklearn code below

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
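The `groups` argument that every `fit` signature above accepts only has an effect with a group-aware splitter; a minimal illustration with GroupKFold:

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])

for train, test in GroupKFold(n_splits=3).split(X, y, groups):
    # no group index ever appears in both the train and the test fold
    print(groups[train], groups[test])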
Example #14
File: _search.py Project: TUD-STKS/PyRCN
    def fit(self,
            X: np.ndarray,
            y: Optional[np.ndarray] = None,
            *,
            groups: Optional[np.ndarray] = None,
            **fit_params: dict) -> SHGOSearchCV:
        """
        Run the optimization based on the parameters defined before.

        Parameters
        ----------
        X : np.ndarray of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : np.ndarray of shape (n_samples, n_output) or (n_samples,), \
            default=None
            Target relative to X for classification or regression; None for
            unsupervised learning.
        groups : np.ndarray of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator.

        Returns
        -------
        self : object
            Instance of fitted estimator.
        """
        estimator = self.estimator
        func = self.func

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
        n_splits = cv_orig.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)
        param_names = sorted(self.params)
        bounds = [self.params[name] for name in param_names]
        constraints = self.constraints
        train = [None] * n_splits
        test = [None] * n_splits
        for k, (tr, te) in enumerate(cv_orig.split(X, y, groups)):  # use the validated splitter
            train[k] = tr
            test[k] = te

        res = optimize.shgo(func=func,
                            bounds=bounds,
                            constraints=constraints,
                            args=(param_names, clone(base_estimator), X, y,
                                  train, test))

        result = {}
        for param_name, param_value in zip(param_names, res.x):
            result[param_name] = param_value
        self.best_params_ = result

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

            if hasattr(self.best_estimator_, "feature_names_in_"):
                self.feature_names_in_ = self.best_estimator_.feature_names_in_
        self.n_splits_ = n_splits

        return self
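The SHGO-based `fit` above hands the actual optimisation to `scipy.optimize.shgo`. A self-contained sketch of that call over box bounds, with a made-up objective standing in for the project's `func`:

from scipy import optimize

def objective(x):
    # toy stand-in for the cross-validation loss minimised in the example
    return (x[0] - 1.0) ** 2 + (x[1] + 2.0) ** 2

res = optimize.shgo(func=objective, bounds=[(-5.0, 5.0), (-5.0, 5.0)])
print(res.x, res.fun)  # close to [1, -2] and 0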
Example #15
    def fit(self, X, y=None, **fit_params):
        cv = check_cv(self.cv, y, classifier=is_classifier(self))
        fit_params = _check_fit_params(X, fit_params)

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self, scoring=self.scoring)
        best_scorer = list(scorers.keys())[0]

        self.results_ = {}
        self.summary = pd.DataFrame(columns=['model', 'params'] + [
            f'{prefix} {scoring}' for scoring in scorers.keys()
            for prefix in ('mean', 'std')
        ])

        self.estimators_ = []
        estimators, params = self._validate_estimators()
        for i in range(len(estimators)):
            prefix = ''
            _params = params[i].copy()
            _name = estimators[i].__class__.__name__

            if self.verbose:
                print(
                    _name, ''.join(f'\n  {name}: {str(v)}'
                                   for name, v in _params.items()))

            search = _params.pop('search', 'grid')
            n_iter = _params.pop('n_iter', 5)

            if is_classifier(self):
                prefix = 'base_estimator__'
                estimators[i] = CalibratedClassifierCV(
                    base_estimator=estimators[i])

            _prefixed_params = {
                prefix + name: v
                for name, v in _params.items()
            }

            if search == 'grid':
                estimator_ = GridSearchCV(estimators[i],
                                          _prefixed_params,
                                          scoring=self.scoring,
                                          refit=best_scorer,
                                          cv=self.cv,
                                          n_jobs=self.n_jobs).fit(X, y)
            else:
                estimator_ = RandomizedSearchCV(
                    estimators[i],
                    _prefixed_params,
                    scoring=self.scoring,
                    refit=best_scorer,
                    cv=self.cv,
                    n_jobs=self.n_jobs,
                    n_iter=n_iter,
                    random_state=self.random_state).fit(X, y)

            self.estimators_.append(estimator_.best_estimator_)

            for name, v in estimator_.cv_results_.items():
                v = v[estimator_.best_index_]
                v = {key[len(prefix):]: value
                     for key, value in v.items()} if name == 'params' else v

                if name in self.results_:
                    self.results_[name] = np.append(self.results_[name], v)

                    if name[:4] == 'rank':
                        score_rank = self.results_['mean' + name[4:]]
                        self.results_[name] = len(score_rank) - np.argsort(
                            score_rank)
                else:
                    self.results_[name] = np.array([v])

            summary = {
                'model':
                _name,
                'params':
                '; '.join(f'{name}: {v}'
                          for name, v in self.results_['params'][-1].items())
            }
            for scorer in scorers.keys():
                summary['mean ' + scorer] = self.results_['mean_test_' +
                                                          scorer][-1]
                summary['std ' + scorer] = self.results_['std_test_' +
                                                         scorer][-1]

            # DataFrame.append was removed in pandas 2.0; concat keeps this working
            self.summary = pd.concat(
                [self.summary, pd.DataFrame([summary])], ignore_index=True)

            self.best_index_ = np.argmin(self.results_['rank_test_' +
                                                       best_scorer])
            self.best_score_ = self.results_['mean_test_' +
                                             best_scorer][self.best_index_]
            self.best_estimator_ = self.estimators_[self.best_index_]

            if self.verbose:
                print('\nBest score:')
                for name, v in self.results_.items():
                    if name[:9] == 'mean_test':
                        print('  {}: {:.4f}'.format(name[10:], v[-1]), end=' ')
                    elif name[:8] == 'std_test':
                        print('+/- {:.4f}'.format(v[-1]))
                print('=' * 50, end='\n\n')

        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        if self.verbose:
            print('Best model:')
            print(f"  {self.summary['model'].iloc[self.best_index_]}")
            print('\n'.join(f'    {key}: {str(value)}' for key, value in
                            self.results_['params'][self.best_index_].items()))
            for key, value in self.results_.items():
                if key[:9] == 'mean_test':
                    print('  {}: {:.4f}'.format(key[10:],
                                                value[self.best_index_]),
                          end=' ')
                elif key[:8] == 'std_test':
                    print('+/- {:.4f}'.format(value[self.best_index_]))

        return self
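Example #15 wraps each classifier in CalibratedClassifierCV and prefixes the search space with `base_estimator__` so the grid reaches the wrapped model. A minimal sketch of that prefix routing; note that recent scikit-learn versions rename the argument to `estimator` (and the prefix to `estimator__`):

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=120, random_state=0)
# older sklearn spelling, matching the example above; newer releases use estimator=
wrapped = CalibratedClassifierCV(base_estimator=LinearSVC())
grid = {'base_estimator__C': [0.1, 1.0]}
gs = GridSearchCV(wrapped, grid, cv=3).fit(X, y)
print(gs.best_params_)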
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   return_n_test_samples=False,
                   return_times=False,
                   return_estimator=False,
                   split_progress=None,
                   candidate_progress=None,
                   error_score=np.nan):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape (n_samples, n_features)
        The data to fit.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The target variable to try to predict in the case of
        supervised learning.
    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.
        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.
        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.
    train : array-like of shape (n_train_samples,)
        Indices of training samples.
    test : array-like of shape (n_test_samples,)
        Indices of test samples.
    verbose : int
        The verbosity level.
    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    return_train_score : bool, default=False
        Compute and return score on training set.
    return_parameters : bool, default=False
        Return parameters that has been used for the estimator.
    split_progress : list or tuple, optional, default: None
        A list or tuple of format (<current_split_id>, <total_num_of_splits>)
    candidate_progress : list or tuple, optional, default: None
        A list or tuple of format
        (<current_candidate_id>, <total_number_of_candidates>)
    return_n_test_samples : bool, default=False
        Whether to return the ``n_test_samples``
    return_times : bool, default=False
        Whether to return the fit/score times.
    return_estimator : bool, default=False
        Whether to return the fitted estimator.
    Returns
    -------
    result : dict with the following attributes
        train_scores : dict of scorer name -> float
            Score on training set (for all the scorers),
            returned only if `return_train_score` is `True`.
        test_scores : dict of scorer name -> float
            Score on testing set (for all the scorers).
        n_test_samples : int
            Number of test samples.
        fit_time : float
            Time spent for fitting in seconds.
        score_time : float
            Time spent for scoring in seconds.
        parameters : dict or None
            The parameters that have been evaluated.
        estimator : estimator object
            The fitted estimator.
    """
    progress_msg = ""
    if verbose > 2:
        if split_progress is not None:
            progress_msg = f" {split_progress[0]+1}/{split_progress[1]}"
        if candidate_progress and verbose > 9:
            progress_msg += (f"; {candidate_progress[0]+1}/"
                             f"{candidate_progress[1]}")

    if verbose > 1:
        if parameters is None:
            params_msg = ''
        else:
            sorted_keys = sorted(parameters)  # Ensure deterministic o/p
            params_msg = (', '.join(f'{k}={parameters[k]}'
                                    for k in sorted_keys))
    if verbose > 9:
        start_msg = f"[CV{progress_msg}] START {params_msg}"
        print(f"{start_msg}{(80 - len(start_msg)) * '.'}")

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Fit the estimator
    # Let the custom bag scorer handle fitting of the estimator
    result = {}
    try:
        estimator = _BagScorer_estimator_fit(estimator, X_train, y_train,
                                             scorer, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" % (error_score, format_exc()),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        # The estimator is fitted to data correctly
        # Calculate scoring of estimator using custom scorer
        fit_time = time.time() - start_time
        test_scores = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer)

    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = f"[CV{progress_msg}] END "
        result_msg = params_msg + (";" if params_msg else "")
        if verbose > 2:
            if isinstance(test_scores, dict):
                for scorer_name in sorted(test_scores):
                    result_msg += f" {scorer_name}: ("
                    if return_train_score:
                        result_msg += (f"train="
                                       f"{train_scores[scorer_name]:.3f}, ")
                    result_msg += f"test={test_scores[scorer_name]:.3f})"
        result_msg += f" total time={logger.short_format_time(total_time)}"

        # Right align the result_msg
        end_msg += "." * (80 - len(end_msg) - len(result_msg))
        end_msg += result_msg
        print(end_msg)

    result["test_scores"] = test_scores
    if return_train_score:
        result["train_scores"] = train_scores
    if return_n_test_samples:
        # Return the number of bags
        result["n_test_samples"] = _num_samples(X_test)
    if return_times:
        result["fit_time"] = fit_time
        result["score_time"] = score_time
    if return_parameters:
        result["parameters"] = parameters
    if return_estimator:
        result["estimator"] = estimator

    return result
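Every `scorer` passed around in these helpers is a callable with the signature `scorer(estimator, X, y)`; the public way to obtain one is `sklearn.metrics.get_scorer`:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer

X, y = make_classification(n_samples=80, random_state=0)
clf = LogisticRegression().fit(X[:60], y[:60])
accuracy = get_scorer('accuracy')
print(accuracy(clf, X[60:], y[60:]))  # same call signature as the scorers above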
Example #17
    def fit(self, X, y=None, *, groups=None, **fit_params):
        self.initialize_fitting(X, y)

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, str) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers) and not callable(self.refit):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key or a "
                                 "callable to refit an estimator with the "
                                 "best parameter setting on the whole "
                                 "data and make the best_* attributes "
                                 "available for that metric. If this is "
                                 "not needed, refit should be set to "
                                 "False explicitly. %r was passed." %
                                 self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results = {}
        with parallel:
            all_candidate_params = []
            all_out = []

            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(
                    delayed(self._fit_score_and_log)(clone(base_estimator),
                                                     X,
                                                     y,
                                                     train=train,
                                                     test=test,
                                                     parameters=parameters,
                                                     **fit_and_score_kwargs)
                    for parameters, (train, test) in product(
                        candidate_params, cv.split(X, y, groups)))

                if len(out) < 1:
                    raise ValueError('No fits were performed. '
                                     'Was the CV iterator empty? '
                                     'Were there no candidates?')
                elif len(out) != n_candidates * n_splits:
                    raise ValueError('cv.split and cv.get_n_splits returned '
                                     'inconsistent results. Expected {} '
                                     'splits, got {}'.format(
                                         n_splits,
                                         len(out) // n_candidates))

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                nonlocal results
                results = self._format_results(all_candidate_params, scorers,
                                               n_splits, all_out)
                return results

            self._run_search(evaluate_candidates)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
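The `evaluate_candidates` closures above fan the per-fold fits out with joblib; the underlying Parallel/delayed pattern in isolation:

from joblib import Parallel, delayed

def square(i):
    return i * i

# one delayed call per (candidate, split) pair in the real searches
out = Parallel(n_jobs=2)(delayed(square)(i) for i in range(5))
print(out)  # [0, 1, 4, 9, 16]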
Example #18
    def fit(self, Xs, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        Parameters
        ----------
        Xs : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator.
        Returns
        -------
        self : object
            Instance of fitted estimator.
        """
        estimator = self.estimator
        refit_metric = "score"

        if callable(self.scoring):
            scorers = self.scoring
        elif self.scoring is None or isinstance(self.scoring, str):
            scorers = check_scoring(self.estimator, self.scoring)
        else:
            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
            self._check_refit_for_multimetric(scorers)
            refit_metric = self.refit

        fit_params = _check_fit_params(Xs[0], fit_params)

        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
        n_splits = cv_orig.get_n_splits(Xs[0], y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(
            scorer=scorers,
            fit_params=fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True,
            return_times=True,
            return_parameters=False,
            error_score=self.error_score,
            verbose=self.verbose,
        )
        results = {}
        with parallel:
            all_candidate_params = []
            all_out = []
            all_more_results = defaultdict(list)

            def evaluate_candidates(candidate_params,
                                    cv=None,
                                    more_results=None):
                cv = cv or cv_orig
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                X_transformed, _, _, n_features = check_Xs(
                    Xs, copy=True, return_dimensions=True)
                pipeline = Pipeline([
                    ("splitter", SimpleSplitter(n_features)),
                    ("estimator", clone(base_estimator)),
                ])
                pipeline.fit(np.hstack(Xs))
                out = parallel(
                    delayed(_fit_and_score)(
                        pipeline,
                        np.hstack(Xs),
                        y,
                        train=train,
                        test=test,
                        parameters={
                            f"estimator__{k}": v
                            for k, v in parameters.items()
                        },
                        split_progress=(split_idx, n_splits),
                        candidate_progress=(cand_idx, n_candidates),
                        **fit_and_score_kwargs,
                    ) for (cand_idx,
                           parameters), (split_idx, (train, test)) in product(
                               enumerate(candidate_params),
                               enumerate(cv.split(Xs[0], y, groups)),
                           ))

                if len(out) < 1:
                    raise ValueError("No fits were performed. "
                                     "Was the CV iterator empty? "
                                     "Were there no candidates?")
                elif len(out) != n_candidates * n_splits:
                    raise ValueError("cv.split and cv.get_n_splits returned "
                                     "inconsistent results. Expected {} "
                                     "splits, got {}".format(
                                         n_splits,
                                         len(out) // n_candidates))

                # For callable self.scoring, the return type is only known after
                # calling. If the return type is a dictionary, the error scores
                # can now be inserted with the correct key. The type checking
                # of out will be done in `_insert_error_scores`.
                if callable(self.scoring):
                    _insert_error_scores(out, self.error_score)

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                if more_results is not None:
                    for key, value in more_results.items():
                        all_more_results[key].extend(value)

                nonlocal results
                results = self._format_results(all_candidate_params, n_splits,
                                               all_out, all_more_results)

                return results

            self._run_search(evaluate_candidates)

            # multimetric is determined here because in the case of a callable
            # self.scoring the return type is only known after calling
            first_test_score = all_out[0]["test_scores"]
            self.multimetric_ = isinstance(first_test_score, dict)

            # check refit_metric now for a callable scorer that is multimetric
            if callable(self.scoring) and self.multimetric_:
                self._check_refit_for_multimetric(first_test_score)
                refit_metric = self.refit

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = self._select_best_index(self.refit,
                                                       refit_metric, results)
            if not callable(self.refit):
                # With a non-custom callable, we can select the best score
                # based on the best index
                self.best_score_ = results[f"mean_test_{refit_metric}"][
                    self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(Xs, y, **fit_params)
            else:
                self.best_estimator_.fit(Xs, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

            if hasattr(self.best_estimator_, "feature_names_in_"):
                self.feature_names_in_ = self.best_estimator_.feature_names_in_

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers

        self.cv_results_ = results
        self.n_splits_ = n_splits
        return self
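Example #18 handles multi-view input by stacking the views column-wise and letting a splitter step recover them inside the pipeline (`check_Xs` and `SimpleSplitter` are assumed to come from the surrounding multi-view project). The stacking itself is plain `np.hstack` plus the per-view widths:

import numpy as np

Xs = [np.random.randn(10, 3), np.random.randn(10, 5)]
n_features = [X.shape[1] for X in Xs]

stacked = np.hstack(Xs)                      # shape (10, 8)
bounds = np.cumsum([0] + n_features)
views = [stacked[:, bounds[i]:bounds[i + 1]] for i in range(len(Xs))]
assert all(np.array_equal(v, X) for v, X in zip(views, Xs))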
Example #19
    def fit(self, X, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        refit_metric = "score"

        if callable(self.scoring):
            scorers = self.scoring
        elif self.scoring is None or isinstance(self.scoring, str):
            scorers = check_scoring(self.estimator, self.scoring)
        else:
            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
            self._check_refit_for_multimetric(scorers)
            refit_metric = self.refit

        #X, y, groups = indexable(X, y, groups) # todo debug
        fit_params = _check_fit_params(X, fit_params)

        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
        n_splits = cv_orig.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results = {}
        with parallel:
            all_candidate_params = []
            all_out = []
            all_more_results = defaultdict(list)

            def evaluate_candidates(candidate_params,
                                    cv=None,
                                    more_results=None):
                cv = cv or cv_orig
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                if self.online_train_val_split:
                    can = enumerate(candidate_params)
                    spl = enumerate(cv.split(X, None, groups))
                    lst = []
                    for (cand_idx, parameters), (split_idx,
                                                 (train,
                                                  test)) in product(can, spl):
                        lst.append(
                            delayed(_fit_and_score)(
                                clone(base_estimator),
                                X,
                                y,
                                train=train,
                                test=test,
                                parameters=parameters,
                                online_train_val_split=True,
                                **fit_and_score_kwargs))
                    out = parallel(lst)
                else:
                    can = enumerate(candidate_params)
                    spl = enumerate(cv.split(X, y, groups))
                    lst = []
                    for (cand_idx, parameters), (split_idx,
                                                 (train,
                                                  test)) in product(can, spl):
                        lst.append(
                            delayed(_fit_and_score)(
                                clone(base_estimator),
                                X,
                                y,
                                train=train,
                                test=test,
                                parameters=parameters,
                                split_progress=(split_idx, n_splits),
                                candidate_progress=(cand_idx, n_candidates),
                                online_train_val_split=False,
                                **fit_and_score_kwargs))
                    out = parallel(lst)


                if len(out) < 1:
                    raise ValueError('No fits were performed. '
                                     'Was the CV iterator empty? '
                                     'Were there no candidates?')
                elif len(out) != n_candidates * n_splits:
                    raise ValueError('cv.split and cv.get_n_splits returned '
                                     'inconsistent results. Expected {} '
                                     'splits, got {}'.format(
                                         n_splits,
                                         len(out) // n_candidates))

                # For callable self.scoring, the return type is only known after
                # calling. If the return type is a dictionary, the error scores
                # can now be inserted with the correct key. The type checking
                # of out will be done in `_insert_error_scores`.
                if callable(self.scoring):
                    _insert_error_scores(out, self.error_score)
                all_candidate_params.extend(candidate_params)
                all_out.extend(out)
                if more_results is not None:
                    for key, value in more_results.items():
                        all_more_results[key].extend(value)

                nonlocal results
                results = self._format_results(all_candidate_params, n_splits,
                                               all_out, all_more_results)

                return results

            self._run_search(evaluate_candidates)

            # multimetric is determined here because in the case of a callable
            # self.scoring the return type is only known after calling
            first_test_score = all_out[0]['test_scores']
            self.multimetric_ = isinstance(first_test_score, dict)

            # check refit_metric now for a callable scorer that is multimetric
            if callable(self.scoring) and self.multimetric_:
                self._check_refit_for_multimetric(first_test_score)
                refit_metric = self.refit

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            # If callable, refit is expected to return the index of the best
            # parameter set.
            if callable(self.refit):
                self.best_index_ = self.refit(results)
                if not isinstance(self.best_index_, numbers.Integral):
                    raise TypeError('best_index_ returned is not an integer')
                if (self.best_index_ < 0
                        or self.best_index_ >= len(results["params"])):
                    raise IndexError('best_index_ index out of range')
            else:
                self.best_index_ = results["rank_test_%s" %
                                           refit_metric].argmin()
                self.best_score_ = results["mean_test_%s" %
                                           refit_metric][self.best_index_]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_))

            refit_start_time = time.time()
            if isinstance(self.best_estimator_, Pipeline):
                self.best_estimator_.train()
                # todo set train interval to whole dataset
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

            if isinstance(self.best_estimator_, Pipeline):
                self.best_estimator_.prod()
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
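The numeric `error_score` branch seen throughout these `_fit_and_score` variants matches the public behaviour: a candidate whose fit raises gets the sentinel score and a FitFailedWarning instead of aborting the whole search. For instance:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, random_state=0)
# C=-1 is invalid and makes every fit for that candidate fail
gs = GridSearchCV(SVC(), {'C': [-1.0, 1.0]}, cv=3, error_score=np.nan)
gs.fit(X, y)  # emits FitFailedWarning for the failing candidate
print(gs.cv_results_['mean_test_score'])  # [nan, <score for C=1.0>]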
Example #20
def _fit_and_score_weighted(estimator,
                            x_data,
                            y_data,
                            scorer,
                            train,
                            test,
                            verbose,
                            parameters,
                            fit_params,
                            error_score=np.nan,
                            sample_weights=None):
    """Expand :func:`sklearn.model_selection._validation._fit_and_score`."""
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(x_data, fit_params, train)

    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for (key, val) in parameters.items():
            cloned_parameters[key] = clone(val, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    x_train, y_train = _safe_split(estimator, x_data, y_data, train)
    x_test, y_test = _safe_split(estimator, x_data, y_data, test, train)
    if sample_weights is not None:
        sample_weights_test = sample_weights[test]
    else:
        sample_weights_test = None

    try:
        if y_train is None:
            estimator.fit(x_train, **fit_params)
        else:
            estimator.fit(x_train, y_train, **fit_params)

    except Exception as exc:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        if isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
            else:
                test_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" %
                (error_score, format_exception_only(type(exc), exc)[0]),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_scores = _score_weighted(estimator,
                                      x_test,
                                      y_test,
                                      scorer,
                                      sample_weights=sample_weights_test)
        score_time = time.time() - start_time - fit_time
    if verbose > 2:
        if isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                msg += ", %s=" % scorer_name
                msg += "%.3f" % test_scores[scorer_name]
        else:
            msg += ", score="
            msg += "%.3f" % test_scores

    if verbose > 1:
        total_time = score_time + fit_time
        print(_message_with_time('CV', msg, total_time))

    return [test_scores]
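A minimal sketch of driving ``_fit_and_score_weighted`` over the folds of a ``KFold`` splitter; the estimator, scorer and weights below are placeholders rather than part of the original example:

import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import check_scoring
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
sample_weights = np.random.RandomState(0).uniform(0.5, 2.0, size=len(y))
estimator = Ridge()
scorer = check_scoring(estimator, scoring="neg_mean_squared_error")

# One weighted test score per fold; each call returns a one-element list.
fold_scores = [
    _fit_and_score_weighted(clone(estimator), X, y, scorer, train, test,
                            verbose=0, parameters=None, fit_params=None,
                            sample_weights=sample_weights)[0]
    for train, test in KFold(n_splits=5).split(X)
]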
Example #21
def _compute_one_fold(
    fold_index,
    train,
    test,
    multi_output_proba,
    all_classes,
    classes,
    estimator,
    X,
    y,
    groups,
    scorers,
    verbose,
    fit_params,
    return_predict,
    method,
    no_scoring,
):
    if verbose:
        print("cv %d started\n" % fold_index)

    ### Clone the estimator ###
    cloned_estimator = sklearn.base.clone(estimator)

    ### split train test ###
    X_train, y_train = sklearn.model_selection._validation._safe_split(
        estimator, X, y, train)
    if groups is not None:
        groups_train, _ = sklearn.model_selection._validation._safe_split(
            estimator, groups, None, train)
    else:
        groups_train = None

    X_test, y_test = sklearn.model_selection._validation._safe_split(
        estimator, X, y, test, train)
    if groups is not None:
        groups_test, _ = sklearn.model_selection._validation._safe_split(
            estimator, groups, None, test, train)
    else:
        groups_test = None

    if hasattr(X_test, "index"):
        index_test = X_test.index
    else:
        index_test = test

    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)
    # Subset fit_params to the training indices where possible, e.g.
    # 'sample_weight=np.array(...)' should be sliced per fold, whereas a
    # scalar like 'epochs=10' is passed through unchanged.
    start_fit = time()

    ### Fit estimator ###
    if y_train is None:
        if groups_train is not None and function_has_named_argument(
                cloned_estimator.fit, "groups"):
            cloned_estimator.fit(X_train, groups=groups_train, **fit_params)
        else:
            cloned_estimator.fit(X_train, **fit_params)
    else:
        if groups_train is not None and function_has_named_argument(
                cloned_estimator.fit, "groups"):
            cloned_estimator.fit(X_train,
                                 y_train,
                                 groups=groups_train,
                                 **fit_params)
        else:
            cloned_estimator.fit(X_train, y_train, **fit_params)

    fit_time = time() - start_fit

    result_predict = None
    if return_predict:
        func = getattr(cloned_estimator, method)
        predictions = func(X_test)

        ## re-alignment with the full set of classes ##
        if method in ("predict_proba", "predict_log_proba",
                      "decision_function"):

            def _align_predict(predictions, classes,
                               cloned_estimator_classes_):

                float_min = np.finfo(predictions.dtype).min
                default_values = {
                    "decision_function": float_min,
                    "predict_log_proba": float_min,
                    "predict_proba": 0
                }

                predictions_for_all_classes = pd.DataFrame(
                    default_values[method], index=index_test, columns=classes)

                for j, c in enumerate(cloned_estimator_classes_):
                    predictions_for_all_classes[c] = predictions[:, j]

                return predictions_for_all_classes

            if multi_output_proba:
                predictions = [
                    _align_predict(p, c, cloned_c) for p, c, cloned_c in zip(
                        predictions, all_classes, cloned_estimator.classes_)
                ]
            else:
                predictions = _align_predict(predictions, classes,
                                             cloned_estimator.classes_)

        result_predict = (predictions, test)

    result = OrderedDict()

    ### Score test ###
    test_scores_dictionary = None
    if not no_scoring:
        start_score = time()
        test_scores_dictionary = _score_with_group(cloned_estimator,
                                                   X_test,
                                                   y_test,
                                                   groups_test,
                                                   scorer=scorers,
                                                   is_multimetric=True)
        # Here : scorers is a dictionary of scorers, hence is_multimetric = True
        score_time = time() - start_score

        ### Score train ###
        train_scores_dictionary = _score_with_group(cloned_estimator,
                                                    X_train,
                                                    y_train,
                                                    groups_train,
                                                    scorer=scorers,
                                                    is_multimetric=True)

        ### Put everything into a dictionary ###
        for k, v in test_scores_dictionary.items():
            result["test_%s" % k] = v

        for k, v in train_scores_dictionary.items():
            result["train_%s" % k] = v

    result["fit_time"] = fit_time

    if not no_scoring:
        result["score_time"] = score_time

    result["n_test_samples"] = \
        sklearn.model_selection._validation._num_samples(X_test)
    result["fold_nb"] = fold_index

    return result, result_predict, test_scores_dictionary
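A sketch of dispatching ``_compute_one_fold`` over the folds of a splitter with ``joblib``; ``estimator``, ``scorers`` (assumed to be a dict of scorer callables), ``X`` and ``y`` are placeholders:

import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5)
classes = np.unique(y)  # single-output classification assumed

fold_results = Parallel(n_jobs=-1)(
    delayed(_compute_one_fold)(
        fold_index=i, train=train, test=test,
        multi_output_proba=False, all_classes=None, classes=classes,
        estimator=estimator, X=X, y=y, groups=None,
        scorers=scorers, verbose=0, fit_params=None,
        return_predict=True, method="predict_proba", no_scoring=False)
    for i, (train, test) in enumerate(cv.split(X, y))
)
# Each entry is (result, (predictions, test_indices), test_scores_dictionary).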
Example #22
def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params,
                     method):
    """Fit estimator and predict values for a given dataset split.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit.

        .. versionchanged:: 0.20
            X is only required to be an object with finite length or shape now

    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None
        The target variable to try to predict in the case of
        supervised learning.

    train : array-like of shape (n_train_samples,)
        Indices of training samples.

    test : array-like of shape (n_test_samples,)
        Indices of test samples.

    verbose : int
        The verbosity level.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    method : str
        Invokes the passed method name of the passed estimator.

    Returns
    -------
    predictions : sequence
        Result of calling 'estimator.method'

    test : array-like
        This is the value of the test parameter.

    estimator : estimator object
        The estimator fitted on the training split.
    """
    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, _ = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    func = getattr(estimator, method)
    assert not np.any(np.isinf(X_test)), X_test
    assert not np.any(np.isnan(X_test)), X_test
    predictions = func(X_test)

    encode = method in [
        'decision_function', 'predict_proba', 'predict_log_proba'
    ] and y is not None

    if encode:
        if isinstance(predictions, list):
            predictions = [
                _enforce_prediction_order(estimator.classes_[i_label],
                                          predictions[i_label],
                                          n_classes=len(set(y[:, i_label])),
                                          method=method)
                for i_label in range(len(predictions))
            ]
        else:
            # A 2D y array should be a binary label indicator matrix
            n_classes = len(set(y)) if y.ndim == 1 else y.shape[1]
            predictions = _enforce_prediction_order(estimator.classes_,
                                                    predictions, n_classes,
                                                    method)
    return predictions, test, estimator
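A sketch of assembling out-of-fold predictions with this ``_fit_and_predict`` variant (note that it returns the fitted estimator as a third element); the data and estimator are placeholders:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

oof = np.empty(len(y), dtype=float)
for train, test in KFold(n_splits=5).split(X):
    predictions, test_idx, fitted = _fit_and_predict(
        clone(estimator), X, y, train, test,
        verbose=0, fit_params=None, method="predict")
    oof[test_idx] = predictions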
Example #23
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   return_n_test_samples=False,
                   return_times=False,
                   return_estimator=False,
                   error_score=np.nan):

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    train_scores = {}
    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    # Augment the training fold: append a copy of X_train with the first 24
    # feature columns moved to the end (X is assumed to be a DataFrame), and
    # duplicate y_train to match.
    X_train = pd.concat([
        X_train,
        pd.concat([X_train.iloc[:, 24:], X_train.iloc[:, :24]], axis=1)
    ])
    y_train = np.concatenate([y_train, y_train])
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" %
                (error_score, format_exception_only(type(e), e)[0]),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_scores = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer)
    if verbose > 2:
        if isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                msg += ", %s=" % scorer_name
                if return_train_score:
                    msg += "(train=%.3f," % train_scores[scorer_name]
                    msg += " test=%.3f)" % test_scores[scorer_name]
                else:
                    msg += "%.3f" % test_scores[scorer_name]
        else:
            msg += ", score="
            msg += ("%.3f" % test_scores if not return_train_score else
                    "(train=%.3f, test=%.3f)" % (train_scores, test_scores))

    if verbose > 1:
        total_time = score_time + fit_time
        print(_message_with_time('CV', msg, total_time))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    return ret
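Given the flags shown, the return list of this variant unpacks as below; a sketch with placeholder inputs (``X`` must be a pandas ``DataFrame`` with more than 24 columns because of the column-block augmentation above, and ``estimator`` and ``scorer`` are assumed to be defined as in the earlier sketches):

from sklearn.base import clone
from sklearn.model_selection import KFold

train, test = next(KFold(n_splits=5).split(X))
out = _fit_and_score(clone(estimator), X, y, scorer, train, test,
                     verbose=0, parameters=None, fit_params=None,
                     return_train_score=True, return_times=True,
                     return_estimator=True)
train_scores, test_scores, fit_time, score_time, fitted_estimator = out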
Example #24
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   return_train_score=False,
                   return_parameters=False,
                   return_n_test_samples=False,
                   return_times=True,
                   return_estimator=False,
                   error_score=np.nan):

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    train_scores = {}
    if parameters is not None:
        # clone after setting parameters in case any parameters
        # are estimators (like pipeline steps)
        # because pipeline doesn't clone steps in fit
        cloned_parameters = {}
        for k, v in parameters.items():
            cloned_parameters[k] = clone(v, safe=False)

        estimator = estimator.set_params(**cloned_parameters)

    start_time = time.perf_counter()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.perf_counter() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if isinstance(scorer, dict):
                test_scores = {name: error_score for name in scorer}
                if return_train_score:
                    train_scores = test_scores.copy()
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn(
                "Estimator fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%s" %
                (error_score, format_exception_only(type(e), e)[0]),
                FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.perf_counter() - start_time
        test_scores = _score(estimator, X_test, y_test, scorer)

        score_time = time.perf_counter() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer)
    if verbose > 2:
        if isinstance(test_scores, dict):
            for scorer_name in sorted(test_scores):
                msg += ", %s=" % scorer_name
                if return_train_score:
                    msg += "(train=%.3f," % train_scores[scorer_name]
                    msg += " test=%.3f)" % test_scores[scorer_name]
                else:
                    msg += "%.3f" % test_scores[scorer_name]
        else:
            msg += ", score="
            msg += ("%.3f" % test_scores if not return_train_score else
                    "(train=%.3f, test=%.3f)" % (train_scores, test_scores))

    total_time = score_time + fit_time
    if verbose > 1:
        print(_message_with_time('CV', msg, total_time))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    p = pickle.dumps(estimator)
    mem = sys.getsizeof(p)  # size of the pickled fitted estimator, in bytes

    # TODO: rework this without the special-casing below
    # Build a single-sample input for measuring inference time; the row is
    # reshaped to (1, n_features) because estimators expect a 2D array.
    if isinstance(X_test, np.ndarray):
        inf_example = X_test[0].reshape(1, -1)
    elif isinstance(X_test, pd.DataFrame):
        inf_example = X_test.iloc[0].to_numpy().reshape(1, -1)

    inference_time_start = time.perf_counter()
    s = estimator.predict(inf_example)
    inference_time_end = time.perf_counter()
    inference_time = inference_time_end - inference_time_start

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time, inference_time, mem, total_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    return ret
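With the default ``return_times=True``, this variant's timing block grows to five entries: fit time, score time, single-sample inference time, pickled-model size in bytes, and total time. A sketch of unpacking the result, with placeholder inputs as in the earlier sketches:

from sklearn.base import clone
from sklearn.model_selection import KFold

train, test = next(KFold(n_splits=5).split(X))
out = _fit_and_score(clone(estimator), X, y, scorer, train, test,
                     verbose=0, parameters=None, fit_params=None)
test_scores = out[0]
fit_time, score_time, inference_time, model_bytes, total_time = out[1:]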