Example #1
def split_cv(*arrays, y=None, groups=None, cv=3, random_state=None):
    '''Supervised splitting.

    arrays : 2d arrays
        arrays to be split, usually X
    y : 1d array
        class labels; if None, no stratification is applied
    groups
        group labels used to split by groups
    cv
        number of splits
    random_state
        forwarded to the cross-validator when it supports one

    Returns
    -------
    generator of lists containing the split arrays, shape = [m*n*k], one list per fold:
    [(0train, 0test), (1train, 1test), ...]

    m - index of the fold [0 : cv-1]
    n - index of the variable/array [0 : n_arrays-1]
    k - index of the train(0)/test(1) set [0:1]
    '''

    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    validation.check_consistent_length(*arrays, y, groups)
    arrays = list(arrays)

    if cv == 1:
        if y is not None:
            arrays.append(y)
        return [[(i, i) for i in arrays]]
    # get cross validator
    if y is not None:
        arrays.append(y)
        cv = check_cv(cv, y=y, classifier=True)
    else:
        cv = check_cv(cv, classifier=False)
    # set random state
    if hasattr(cv, 'random_state'):
        cv.random_state = random_state
    # reset_index pandas df or series
    arrays = _reset_index(*arrays)
    arrays = indexable(*arrays)
    # get indexing method
    train_test = ([
        (safe_indexing(i, train_index), safe_indexing(i, test_index))
        for i in arrays
    ] for train_index, test_index in cv.split(arrays[0], y, groups))

    return train_test
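A minimal usage sketch of the generator described in the docstring above, assuming split_cv and the helpers it relies on (validation, check_cv, indexable, safe_indexing, _reset_index) are importable from the original module and accept plain numpy arrays; the data is illustrative.

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)

# one list per fold; each list holds a (train, test) pair per array, with y appended last
for fold in split_cv(X, y=y, cv=3, random_state=0):
    (X_train, X_test), (y_train, y_test) = fold
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)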
Example #2
def permutations(estimator,
                 X,
                 y,
                 cv=None,
                 n_permutations=100,
                 random_state=0,
                 scoring=None):
    """
    This follows the sklearn API sklearn.inspection.permutation_test_score
    I have modified accordinlgy to accomodate filtering of features using correlation matrix
    before running cross-validation using the model
    """

    Xs, ys = indexable(X, y)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # corr = CorrMatrix()
    # corr.fit(X,y)
    # Xs, ys = corr.transform()
    score = _permutations(clone(estimator), Xs, ys, cv, scorer)
    permutation_scores = np.zeros(n_permutations)
    for i in range(n_permutations):
        # corr_p = CorrMatrix()
        # corr_p.fit(X, y)
        # Xp, yp = corr_p.transform()
        yp = _safe_indexing(y, random_state.permutation(len(y)))
        permutation_scores[i] = _permutations(clone(estimator), Xs, yp, cv,
                                              scorer)

    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)

    return score, permutation_scores, pvalue
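A hedged usage sketch for the function above; the estimator and data are illustrative, and the private _permutations helper it calls is assumed to be available in the same module.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(80, 5)
y = rng.randint(0, 2, 80)

score, perm_scores, pvalue = permutations(LogisticRegression(max_iter=1000), X, y, cv=5)
# With random labels the observed score should fall inside the permutation
# distribution, so a large p-value is expected here.
print(round(score, 3), round(float(perm_scores.mean()), 3), round(pvalue, 3))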
Example #3
def _wrapped_cross_val_score(sklearn_pipeline,
                             features,
                             target,
                             cv,
                             scoring_function,
                             sample_weight=None,
                             groups=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
         process. If it is an object then it is an object to be used as a
         cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps,
                                           sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [
                _fit_and_score(estimator=clone(sklearn_pipeline),
                               X=features,
                               y=target,
                               scorer=scorer,
                               train=train,
                               test=test,
                               verbose=0,
                               parameters=None,
                               fit_params=sample_weight_dict)
                for train, test in cv_iter
            ]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
Example #4
 def fit(self, X, y):
     cv = check_cv(self.cv, y, classifier=True)
     self.estimators_ = []
     self.scores = []
     self.score = 0
     for train, valid in cv.split(X, y):
         score1 = 0
         test = len(y[valid])
         print(X[train].shape)
         print(y[train].shape)
         clf = lgb.LGBMClassifier(**self.lgb_params).fit(
             X[train],
             y[train],
             eval_set=[(X[train], y[train])],
             early_stopping_rounds=15)
         for i in range(0, test):
             yt = clf.predict(X[valid][i, :].reshape(1, -1))[0]
             if yt == y[valid][i]:
                 score1 += 1
         score1 = score1 / test
         print(score1)
         self.scores.append(score1)
         self.estimators_.append(clf)
     self.score = sum(self.scores) / len(self.scores)
     return self
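The inner loop above scores the validation fold one sample at a time; a vectorized equivalent of that accuracy computation, as a sketch assuming numpy arrays and the fitted clf from the loop:

import numpy as np

score1 = float(np.mean(clf.predict(X[valid]) == y[valid]))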
Example #5
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of original sklearn's permutation test score function
    to evaluate p-value outside this function, so that the score can be
    reused from outside.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state),
            groups, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
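As the docstring notes, the p-value is evaluated outside this function so the reference score can be reused; a sketch of that step with illustrative data, assuming the sklearn internals used above (Parallel, _permutation_test_score, _shuffle) are imported as in the original module:

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

X = np.random.randn(60, 4)
y = np.random.randint(0, 2, 60)
clf = DummyClassifier(strategy='most_frequent')

reference_score = cross_val_score(clf, X, y, cv=5).mean()
permutation_scores = permutation_test_score(clf, X, y, cv=5, n_permutations=100)
# same formula as in [Ojala2010] / sklearn
pvalue = (np.sum(permutation_scores >= reference_score) + 1.0) / (len(permutation_scores) + 1.0)
print(pvalue)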
Example #6
    def test_fit_and_score_return_dict(self):

        # Scoring
        accuracy_scorer = make_scorer(accuracy_score, normalize='weighted')

        # Test estimator
        dumb = DummyClassifier(strategy='constant', constant=1)

        # Test custom scorer
        bagAccScorer = BagScorer(accuracy_scorer, sparse=True)

        # Rename for easier parameters
        X = self.train_bags
        y = self.train_labels
        scoring = {'bag-scorer': bagAccScorer}
        estimator = dumb
        groups = None
        cv = 3
        n_jobs = 3
        verbose = 0
        pre_dispatch = 6
        fit_params = None
        return_estimator = True
        error_score = 'raise'
        return_train_score = True
        parameters = None

        # Test _fit_and_score method
        X, y, groups = indexable(X, y, groups)
        cv = check_cv(cv, y, classifier=is_classifier(estimator))
        scorers = _check_multimetric_scoring(estimator, scoring=scoring)

        # Use one cross-validation split
        generator = cv.split(X, y, groups)
        # Get training and test split of training data
        train, test = next(generator)
        # Generate scores using BagScorer
        scores = _fit_and_score(clone(estimator),
                                X,
                                y,
                                scorers,
                                train,
                                test,
                                verbose,
                                parameters,
                                fit_params,
                                return_train_score=return_train_score,
                                return_times=True,
                                return_estimator=return_estimator,
                                return_n_test_samples=False,
                                error_score=error_score)

        # Returned dictionary contains keys
        self.assertIn('train_scores', scores.keys())
        self.assertIn('test_scores', scores.keys())
        self.assertIn('fit_time', scores.keys())
        self.assertIn('score_time', scores.keys())
        self.assertIn('estimator', scores.keys())

        return None
Example #7
def fit_and_save(estimator,
                 X,
                 y=None,
                 groups=None,
                 scoring=None,
                 cv=None,
                 n_jobs=1,
                 verbose=0,
                 fit_params=None,
                 pre_dispatch='2*n_jobs',
                 return_train_score=True,
                 parameters=dict(),
                 uuid='',
                 url='http://127.0.0.1:8000'):

    import json, requests, numpy
    from sklearn.model_selection._validation import cross_validate

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    _base_scores = [0. for _ in range(cv.get_n_splits(X, y, groups))]

    cv_score = {}
    cv_score.update(
        {'train_%s' % s: numpy.array(_base_scores)
         for s in scorers})
    cv_score.update(
        {'test_%s' % s: numpy.array(_base_scores)
         for s in scorers})
    cv_score.update({'fit_time': _base_scores, 'score_time': _base_scores})

    try:
        cv_score = cross_validate(estimator, X, y, groups, scorers, cv, n_jobs,
                                  verbose, fit_params, pre_dispatch,
                                  return_train_score)
        error = None
    except Exception as e:
        error = '{}: {}'.format(type(e).__name__, str(e))

    try:
        for k, v in cv_score.items():
            if isinstance(v, numpy.ndarray):
                cv_score[k] = v.tolist()
        response = requests.post('{url}/grids/{uuid}/results'.format(
            url=url, uuid=uuid),
                                 data={
                                     'gridsearch': uuid,
                                     'params': json.dumps(parameters),
                                     'errors': error,
                                     'cv_data': json.dumps(cv_score)
                                 })

    except requests.exceptions.ConnectionError as e:
        response = None
    if response is None:
        return
    return response
Example #8
def cross_val_predict(estimator, X, y=None, groups=None, cv='warn',
                      n_jobs=None, verbose=0, fit_params=None,
                      pre_dispatch='2*n_jobs', method='predict'):

    """
    Minor modifications and simplifications brought to the sklearn function in order to allow
    for application with a non-partition CV scheme.
    """

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))


    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
    predictions = np.concatenate(predictions)

    test_indices = np.concatenate([indices_i
                                   for _, indices_i in prediction_blocks])
    test_index = [y.index[_] for _ in test_indices]
    #print(predictions)

    if y.ndim == 1:
        return pd.Series(predictions, index = test_index)
    elif y.ndim>1:
        return pd.DataFrame(predictions, index = test_index)
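A hedged usage sketch for the modified function above: y is expected to be a pandas Series, since its index is reused for the out-of-fold predictions; the estimator and data are illustrative, and the sklearn internals it calls (_fit_and_predict, Parallel, delayed) are assumed to be imported as in the original module.

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge

X = pd.DataFrame(np.random.randn(30, 3), columns=['a', 'b', 'c'])
y = pd.Series(np.random.randn(30), name='target')

oof = cross_val_predict(Ridge(), X, y, cv=5)  # pandas Series aligned with y's index
print(oof.head())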
Example #9
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
Example #10
    def fit(self, X, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)

        self._run_search(X, y, cv)

        return self
Example #11
def learning_curve(estimator, X, mixed_y, groups=None,
                   train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None,
                   exploit_incremental_learning=False, n_jobs=1,
                   pre_dispatch="all", verbose=0, shuffle=False,
                   random_state=None):
    """Learning curve."""

    if exploit_incremental_learning and not hasattr(estimator, "partial_fit"):
        raise ValueError("An estimator must support the partial_fit interface "
                         "to exploit incremental learning")

    # TODO: wrapper patch, key hard coding?
    _y = mixed_y['classifier'] if isinstance(mixed_y, dict) else mixed_y
    X, y, groups = indexable(X, _y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    # Store it as list as we will be iterating over the list multiple times
    cv_iter = list(cv.split(X, y, groups))

    scorer = check_scoring(estimator, scoring=scoring)

    n_max_training_samples = len(cv_iter[0][0])
    # Because the lengths of folds can be significantly different, it is
    # not guaranteed that we use all of the available training data when we
    # use the first 'n_max_training_samples' samples.
    train_sizes_abs = _translate_train_sizes(train_sizes,
                                             n_max_training_samples)
    n_unique_ticks = train_sizes_abs.shape[0]
    if verbose > 0:
        print("[learning_curve] Training set sizes: " + str(train_sizes_abs))

    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                        verbose=verbose)

    if shuffle:
        rng = check_random_state(random_state)
        cv_iter = ((rng.permutation(train), test) for train, test in cv_iter)

    if exploit_incremental_learning:
        classes = np.unique(y) if is_classifier(estimator) else None
        out = parallel(delayed(_incremental_fit_estimator)(
            clone(estimator), X, mixed_y, classes, train, test, train_sizes_abs,
            scorer, verbose) for train, test in cv_iter)
    else:
        train_test_proportions = []
        for train, test in cv_iter:
            for n_train_samples in train_sizes_abs:
                train_test_proportions.append((train[:n_train_samples], test))

        out = parallel(delayed(_fit_and_score)(
            clone(estimator), X, mixed_y, scorer, train, test,
            verbose, parameters=None, fit_params=None, return_train_score=True)
            for train, test in train_test_proportions)
        out = np.array(out)
        n_cv_folds = out.shape[0] // n_unique_ticks
        out = out.reshape(n_cv_folds, n_unique_ticks, 2)

    out = np.asarray(out).transpose((2, 1, 0))

    return train_sizes_abs, out[0], out[1]
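The wrapper above accepts either a plain target or a dict holding the real target under the 'classifier' key (see the TODO note). A plain-target usage sketch with illustrative data, assuming the sklearn internals it calls (_translate_train_sizes, _fit_and_score, Parallel) are available as in the original module:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.randn(120, 4)
y = np.random.randint(0, 2, 120)

train_sizes_abs, train_scores, test_scores = learning_curve(
    LogisticRegression(max_iter=1000), X, y, cv=3,
    train_sizes=np.linspace(0.2, 1.0, 4))
print(train_sizes_abs, test_scores.mean(axis=1))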
Example #12
 def _get_cv(self, y_tr):
     
     cv = check_cv(self.cv, y_tr, classifier=is_classifier(self.estimator))
     return cv
Example #13
    def fit(self, X, y, groups=None):
        """Actual fitting,  performing the search over parameters."""
        results = dict()

        best_index = None
        best_parameters = None

        for bracket_idx in range(self.num_brackets - 1, -1, -1):
            successive_halving_steps = bracket_idx + 1
            # TODO: num_arms should be different

            estimator = self.estimator
            cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
            self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

            X, y, groups = indexable(X, y, groups)
            n_splits = cv.get_n_splits(X, y, groups)

            base_estimator = clone(self.estimator)

            arms_pulled = 0
            if 'mean_test_score' in results:
                arms_pulled = len(results['mean_test_score'])

            res = self._successive_halving(X, y, groups, cv, self.eta,
                                           successive_halving_steps - 1,
                                           self.num_brackets - 1)
            bracket_results, bracket_best_index, bracket_best_parameters = res
            for key, values in bracket_results.items():
                if key not in results:
                    results[key] = values
                else:
                    results[key] = np.append(results[key], values)

            if best_index is None:
                best_index = bracket_best_index + arms_pulled
                best_parameters = bracket_best_parameters
            elif bracket_results['mean_test_score'][
                    bracket_best_index] > results['mean_test_score'][
                        best_index]:
                best_index = bracket_best_index + arms_pulled
                best_parameters = bracket_best_parameters

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #14
def regress(exp: Experiment,
            field,
            estimator,
            cv=RepeatedSortedStratifiedKFold(3, 1),
            params=None):
    '''Evaluate regression during cross validation.

    Parameters
    ----------
    field : str
        column name in the sample metadata, which contains the variable we want to predict.
    estimator : estimator object implementing `fit` and `predict`
        scikit-learn estimator. e.g. :class:`sklearn.ensemble.RandomForestRegressor`
    cv : int, cross-validation generator or an iterable
        similar to the `cv` parameter in :class:`sklearn.model_selection.GridSearchCV`
    params : dict of string to sequence, or sequence of such
        For example, the output of
        :class:`sklearn.model_selection.ParameterGrid` or
        :class:`sklearn.model_selection.ParameterSampler`. By default,
        it uses whatever default parameters of the `estimator` set in
        `scikit-learn`

    Yields
    ------
    pandas.DataFrame
        The result of prediction per sample for a given parameter set. It contains the
        following columns:

        - Y_TRUE: the true value for the samples
        - SAMPLE: sample IDs
        - CV: which split of the cross validation
        - Y_PRED: the predicted value for the samples
    '''
    X = exp.data
    y = exp.sample_metadata[field]
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    if params is None:
        # use sklearn default param values for the given estimator
        params = [{}]

    for param in params:
        logger.debug('run regression with parameters: %r' % param)
        dfs = []
        for i, (train, test) in enumerate(cv.split(X, y)):
            # deep copy the model by clone to avoid the impact from last iteration of fit.
            model = clone(estimator)
            model = model.set_params(**param)
            model.fit(X[train], y[train])
            pred = model.predict(X[test])
            df = pd.DataFrame({
                'Y_PRED': pred,
                'Y_TRUE': y[test].values,
                'SAMPLE': y[test].index.values,
                'CV': i
            })
            dfs.append(df)
        yield pd.concat(dfs, axis=0).reset_index(drop=True)
Example #15
File: search.py  Project: NicolasHug/dabl
    def _check_input_parameters(self, X, y, groups):

        if (self.budget_on != 'n_samples'
                and self.budget_on not in self.estimator.get_params()):
            raise ValueError(
                'Cannot budget on parameter {} which is not supported '
                'by estimator {}'.format(self.budget_on,
                                         self.estimator.__class__.__name__))

        if isinstance(self.max_budget, str) and self.max_budget != 'auto':
            raise ValueError(
                "max_budget must be either 'auto' or a positive number")
        if self.max_budget != 'auto' and self.max_budget <= 0:
            raise ValueError(
                "max_budget must be either 'auto' or a positive number")

        if isinstance(self.r_min, str) and self.r_min != 'auto':
            raise ValueError(
                "r_min must be either 'auto' or a positive number no greater "
                "than max_budget.")
        if self.r_min != 'auto' and self.r_min <= 0:
            raise ValueError(
                "r_min must be either 'auto' or a positive number no greater "
                "than max_budget.")

        if self.force_exhaust_budget and self.r_min != 'auto':
            raise ValueError(
                'r_min must be set to auto if force_exhaust_budget is True.')

        self.r_min_ = self.r_min
        if self.r_min_ == 'auto':
            if self.budget_on == 'n_samples':
                cv = check_cv(self.cv,
                              y,
                              classifier=is_classifier(self.estimator))
                n_splits = cv.get_n_splits(X, y, groups)

                # please see https://gph.is/1KjihQe for a justification
                magic_factor = 2
                self.r_min_ = n_splits * magic_factor
                if is_classifier(self.estimator):
                    n_classes = np.unique(y).shape[0]
                    self.r_min_ *= n_classes
            else:
                self.r_min_ = 1

        self.max_budget_ = self.max_budget
        if self.max_budget_ == 'auto':
            if self.budget_on == 'n_samples':
                self.max_budget_ = X.shape[0]
            else:
                self.max_budget_ = 20  # FIXME  # n_candidates * r_min??

        if self.r_min_ > self.max_budget_:
            raise ValueError(
                'r_min_={} is greater than max_budget_={}.'.format(
                    self.r_min_, self.max_budget_))
Example #16
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        """
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        score_function = partial(
            cross_val_score, X=X, y=y, groups=groups, scoring=self.scoring,
            cv=cv, n_jobs=self.n_jobs, verbose=self.verbose,
            fit_params=fit_params)
        self.f = partial(
            _fit_score, mdl=self.estimator, param_names=self.param_names,
            score_function=score_function)

        self.objective = SingleObjective(
            self.f, self.batch_size, self.objective_name)
        self._init_design_chooser()

        self.run_optimization(max_iter=self.max_iter, verbosity=self.verbosity)

        self.best_index_ = self.Y.argmin()
        self.best_params_ = dict(zip(self.param_names,
                                     10 ** self.X[self.best_index_]))
        self.best_score_ = self.Y[self.Y.argmin()]

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        if self.refit:
            self.best_estimator_ = clone(self.estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        return self
Example #17
 def fit(self, X, y):
     y_labels = self._get_labels(y, self.n_classes)
     cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
     self.estimators_ = []
     
     for train, _ in cv.split(X, y_labels):
         self.estimators_.append(
             clone(self.estimator).fit(X[train], y_labels[train])
         )
     return self
Example #18
 def transform(self, X, y=None):
     cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
     
     X_prob = np.zeros((X.shape[0], self.n_classes))
     X_pred = np.zeros(X.shape[0])
     
     for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
         X_prob[test] = estimator.predict_proba(X[test])
         X_pred[test] = estimator.predict(X[test])
     return np.hstack([X_prob, np.array([X_pred]).T])
Example #19
 def fit(self, X, y):
     y_labels = self._get_labels(y)
     cv = check_cv(self.cv, y_labels, classifier=is_classifier(self.estimator))
     self.estimators_ = []
     
     for train, _ in cv.split(X, y_labels):
         self.estimators_.append(
             clone(self.estimator).fit(X[train], y_labels[train])
         )
     return self
Example #20
    def transform(self, X, y=None):
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        X_prob = np.zeros((X.shape[0], self.n_classes))
        X_pred = np.zeros(X.shape[0])

        for estimator, (_, test) in zip(self.estimators_, cv.split(X)):
            X_prob[test] = estimator.predict_proba(X[test])
            X_pred[test] = estimator.predict(X[test])
        return np.hstack([X_prob, np.array([X_pred]).T])
Example #21
    def fit(self, X, y, **fit_params):
        cv = check_cv(self.cv, y, classifier=False)
        self.estimators_ = []

        for train, valid in cv.split(X, y):
            self.estimators_.append(
                xgb.XGBRegressor(**self.xgb_params).fit(X[train],
                                                        y[train],
                                                        eval_set=[(X[valid],
                                                                   y[valid])],
                                                        **self.fit_params))
        return self
Example #22
 def fit(self, X, y):
     print("Random forest: starting fit")
     cv = check_cv(self.cv, y, classifier=False)
     self.estimators_ = []
     self.scores = []
     for train, valid in cv.split(X, y):
         model = RandomForestRegressor(**self.rf_params).fit(
             X.iloc[train], y.iloc[train])
         self.estimators_.append(model)
         score = rmsple(y.iloc[valid], model.predict(X.iloc[valid]))
         self.scores.append(score)
     return self
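rmsple is not defined in the snippet; a plausible stand-in (an RMSLE-style metric), stated as an assumption rather than the author's actual helper:

import numpy as np

def rmsple(y_true, y_pred):
    # assumed root-mean-squared-log-error definition; the original helper may differ
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))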
Example #23
File: gp_deap.py  Project: stenpiren/tpot
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None, groups=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
         process. If it is an object then it is an object to be used as a
         cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                    X=features,
                                    y=target,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=0,
                                    parameters=None,
                                    fit_params=sample_weight_dict)
                                for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
Example #24
def cross_validate(estimator, X, mixed_y=None, groups=None, scoring=None, cv=None,
                   n_jobs=1, verbose=0, fit_params=None,
                   pre_dispatch='2*n_jobs', return_train_score="warn"):
    """Evaluate metric(s) by cross-validation and also record fit/score times."""

    # TODO: wrapper patch, key hard coding?
    _y = mixed_y['classifier'] if isinstance(mixed_y, dict) else mixed_y

    X, y, groups = indexable(X, _y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, mixed_y, scorers, train, test, verbose, None,
            fit_params, return_train_score=return_train_score,
            return_times=True)
        for train, test in cv.split(X, y, groups))

    if return_train_score:
        train_scores, test_scores, fit_times, score_times = zip(*scores)
        train_scores = _aggregate_score_dicts(train_scores)
    else:
        test_scores, fit_times, score_times = zip(*scores)
    test_scores = _aggregate_score_dicts(test_scores)

    # TODO: replace by a dict in 0.21
    ret = DeprecationDict() if return_train_score == 'warn' else {}
    ret['fit_time'] = np.array(fit_times)
    ret['score_time'] = np.array(score_times)

    for name in scorers:
        ret['test_%s' % name] = np.array(test_scores[name])
        if return_train_score:
            key = 'train_%s' % name
            ret[key] = np.array(train_scores[name])
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training scores, '
                    'please set return_train_score=True').format(key)
                # warn on key access
                ret.add_warning(key, message, FutureWarning)

    return ret
Example #25
    def create_features(self):
        self.n_class = 5
        self.cv = 5
        estimator = self.get_rfc()
        estimators = []
        y_labels = self._get_labels(y)
        cv = check_cv(self.cv, y_labels, classifier=is_classifier(estimator))

        for tr_idx, _ in cv.split(train, y_labels):
            estimators.append(
                clone(estimator).fit(train.loc[tr_idx], y_labels[tr_idx]))
        train_prob = np.zeros([train.shape[0], self.n_class])
        train_pred = np.zeros(train.shape[0])

        test_prob = np.zeros([test.shape[0], self.n_class])
        test_pred = np.zeros(test.shape[0])

        cv = check_cv(self.cv, classifier=is_classifier(estimator))
        for estimator, (_, te_idx) in zip(estimators, cv.split(train)):
            train_prob[te_idx] = estimator.predict_proba(train.loc[te_idx])
            train_pred[te_idx] = estimator.predict(train.loc[te_idx])

        for estimator, (_, te_idx) in zip(estimators, cv.split(test)):
            test_prob[te_idx] = estimator.predict_proba(test.loc[te_idx])
            test_pred[te_idx] = estimator.predict(test.loc[te_idx])

        tmp_train = pd.DataFrame(train_prob)
        tmp_test = pd.DataFrame(test_prob)
        tmp_train["class_pred"] = np.array([train_pred]).T
        tmp_test["class_pred"] = np.array([test_pred]).T

        columns = ["{}_prob".format(i)
                   for i in range(self.n_class)] + ["class_pred"]
        tmp_train.columns = columns
        tmp_test.columns = columns

        self.train = tmp_train
        self.test = tmp_test
Example #26
    def fit(self, X, y):
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(0)
        y_labels = self._get_labels(y)
        cv = check_cv(self.cv,
                      y_labels,
                      classifier=is_classifier(self.estimator))
        self.estimators_ = []

        for train, _ in cv.split(X, y_labels):
            X = np.array(X)
            self.estimators_.append(
                clone(self.estimator).fit(X[train], y_labels[train]))
        return self
Example #27
    def fit(self, X, y, **fit_params):
        cv = check_cv(self.cv, y, classifier=False)
        self.estimators_ = []
        
        for train, valid in cv.split(X, y):
            self.estimators_.append(
                xgb.XGBRegressor(**self.xgb_params).fit(
                    X[train], y[train],
                    eval_set=[(X[valid], y[valid])],
                    **self.fit_params
                )
            )

        return self
Example #28
def cross_val_score(estimator,
                    X,
                    y=None,
                    groups=None,
                    scoring=None,
                    cv=None,
                    n_jobs=1,
                    verbose=0,
                    fit_params=None,
                    pre_dispatch='2*n_jobs',
                    datasets=None):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    """
    if datasets is None:
        datasets = read_all_datasets()
    """
    scores = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorer,
                                train,
                                test,
                                verbose,
                                None,
                                fit_params,
                                to_evaluate=datasets)
        for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [
            np.array(cv.groups)[test].tolist()[0] for _, test in splits
        ]
    return np.squeeze(np.array(scores)), group_order  #scores
Example #29
 def fit(self, X, y, **fit_xgb_params):
     cv = check_cv(self.cv, y, classifier=False)
     print(X.shape, " ", y.shape)
     self.estimators_ = []
     self.scores = []
     for train, valid in cv.split(X, y):
         model = xgb.XGBRegressor(**self.xgb_params).fit(
             X.iloc[train],
             y.iloc[train],
             eval_set=[(X.iloc[valid], y.iloc[valid])],
             **self.fit_xgb_params)
         self.estimators_.append(model)
         score = rmsple(y.iloc[valid], model.predict(X.iloc[valid]))
         self.scores.append(score)
     return self
Example #30
def cross_val_transform(target_encoder,
                        X,
                        y=None,
                        cv=5,
                        classifier=False,
                        n_jobs=None):
    cv = check_cv(cv, y, classifier=classifier)
    splits = list(cv.split(X, y))
    transform_outputs = Parallel(n_jobs=n_jobs)(
        delayed(_fit_and_transform)(clone(target_encoder), X, y, train_idx,
                                    test_idx)
        for train_idx, test_idx in splits)

    output = np.zeros_like(X)
    for (_, test_idx), transform_output in zip(splits, transform_outputs):
        output[test_idx] = transform_output
    return output
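The _fit_and_transform helper used above is not shown; a plausible sketch, assuming target_encoder follows the usual fit/transform convention and that X and y support positional indexing:

def _fit_and_transform(target_encoder, X, y, train_idx, test_idx):
    # fit the encoder on the training fold, then encode the held-out fold
    target_encoder.fit(X[train_idx], y[train_idx])
    return target_encoder.transform(X[test_idx])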
Example #31
def cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                      verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                      method='predict', pickle_predictions=False, **pickler_kwargs):
    """Please see sklearn for documenation
    This has only been modified so binned regressors can return probabilites
    and predictions can be cached during computation
    """
    X, y, groups = indexable(X, y, groups)

    pickler = CachingPickler(**pickler_kwargs) if pickle_predictions else None

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba'] and is_classifier(estimator):
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method, pickler)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    if pickle_predictions:
        predictions = [pickler.unpickle_data(pred_block_i) for pred_block_i, _ in prediction_blocks]
    else:
        predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]

    test_indices = np.concatenate([indices_i for _, indices_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices]
Example #32
def cross_val_score(estimator,
                    X,
                    y=None,
                    groups=None,
                    scoring=None,
                    cv=None,
                    fit_params=None,
                    verbose=0):
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    scores = []
    for train, test in cv.split(X, y, groups):
        score = delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, \
                                 verbose, None, fit_params)
        scores.append(score)
    result = delayed(concat_cv_scores)(scores)
    return result
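The variant above builds a lazy graph: each fold is wrapped in delayed and the final aggregation is deferred to a concat_cv_scores helper that is not shown. A hedged usage sketch, assuming delayed comes from dask.delayed and the helper simply stacks the per-fold results:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.randn(60, 4)
y = np.random.randint(0, 2, 60)

lazy_result = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=3)
scores = lazy_result.compute()  # triggers the deferred per-fold fit-and-score calls
print(scores)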
Example #33
 def _plot(cls,
           estimator,
           X,
           y,
           train_sizes=None,
           cv=None,
           n_jobs=1,
           ax=None,
           cmap='tab10'):
     cv = check_cv(cv)
     plotter = cls()._create(estimator,
                             X,
                             y,
                             train_sizes=train_sizes,
                             cv=cv,
                             n_jobs=n_jobs)
     plotter.plot(ax=ax, cmap=cmap)
     return plotter
Example #34
def cross_val_predict_proba(estimator, X, y, groups = None, cv = None, 
    n_jobs = 1, verbose = 0, fit_params = None, pre_dispatch = '2*n_jobs'):
    '''
    Gets class probability predictions for test examples 
    over cross-validation runs.

    Adapted from mne.decoding.base.cross_val_multiscore(). See that func's
    documentation for details on inputs.
    '''
    import time
    import numbers
    from mne.parallel import parallel_func
    from mne.fixes import is_classifier
    from sklearn.base import clone
    from sklearn.utils import indexable
    from sklearn.model_selection._split import check_cv

    # check arguments
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier = is_classifier(estimator))
    cv_iter = list(cv.split(X, y, groups))

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    # Note: this parallelization is implemented using MNE Parallel
    parallel, p_func, n_jobs = parallel_func(_predict_proba, n_jobs,
                                             pre_dispatch = pre_dispatch)
    preds = parallel(p_func(clone(estimator), X, y, train, test,
                             0, None, fit_params)
                      for train, test in cv_iter)

    # flatten over parallel output
    y_hat = np.concatenate([p[0] for p in preds], axis = 0)
    is_y_true = True
    try:
        y_true = np.concatenate([p[1] for p in preds], axis = 0)
    except Exception:  # learner was unsupervised
        is_y_true = False

    # return results
    if is_y_true:
        return y_hat, y_true
    else:
        return y_hat
Example #35
    def _run_search(self, evaluate_candidates, X, y):
        rng = check_random_state(self.random_state)

        candidate_params = list(self._generate_candidate_params())

        n_iterations = int(ceil(log2(len(candidate_params))))
        print(n_iterations)
        n_samples_total = X.shape[0]
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        n_classes = len(np.unique(y)) if is_classifier(self.estimator) else 1
        min_n_samples = cv.get_n_splits(X, y) * n_classes * 2
        # max_iter = int(ceil(n_samples_total / (min_n_samples * n_candidates)))
        # n_iterations = min(n_iterations, max_iter)

        for iter_i in range(n_iterations):
            n_candidates = len(candidate_params)

            # randomly sample training samples
            n_samples_iter = floor(n_samples_total / (n_candidates * n_iterations))
            if is_classifier(self.estimator):
                n_samples_iter = max(n_samples_iter, min_n_samples)
            print("n_samples_iter: {}".format(n_samples_iter))
            indices = rng.choice(n_samples_total, n_samples_iter,
                                 replace=False)
            X_iter, y_iter = X[indices], y[indices]

            more_results= {'iter': [iter_i] * n_candidates,
                           'n_samples': [n_samples_iter] * n_candidates}
            out = evaluate_candidates(candidate_params, X_iter, y_iter,
                                      more_results=more_results)

            # Select the best half of the candidates for the next iteration
            # We need to filter out candidates from the previous iterations
            n_candidates_to_keep = ceil(n_candidates / 2)
            best_candidates_indices = np.argsort(out['mean_test_score'])[::-1]
            best_candidates_indices = [i for i in best_candidates_indices
                                       if out['iter'][i] == iter_i]
            best_candidates_indices = \
                best_candidates_indices[:n_candidates_to_keep]
            candidate_params = [out['params'][i]
                                for i in best_candidates_indices]

        assert len(candidate_params) == n_candidates_to_keep == 1
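A small worked illustration of the schedule used above: the number of iterations is ceil(log2(#candidates)), each round keeps the best half of the candidates, and each round draws floor(n_samples_total / (n_candidates * n_iterations)) samples (ignoring the classifier-specific minimum clamp). The numbers are assumptions for illustration, not taken from the snippet.

from math import ceil, floor, log2

n_candidates, n_samples_total = 16, 12000
n_iterations = int(ceil(log2(n_candidates)))  # 4 rounds for 16 candidates
for iter_i in range(n_iterations):
    n_samples_iter = floor(n_samples_total / (n_candidates * n_iterations))
    print(iter_i, n_candidates, n_samples_iter)
    n_candidates = ceil(n_candidates / 2)  # keep the best half for the next round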
Example #36
 def fit(self,X,y):
     cv = check_cv(self.cv,y,classifier = True)
     self.estimators_ = []
     self.scores = []
     self.score = 0
     for train,valid in cv.split(X,y):
         score1 = 0
         test = len(y[valid])
         clf = SGDClassifier(**self.sgd_params).fit(X[train],y[train])
         for i in range(0,test):
             yt = clf.predict(X[valid][i, :].reshape(1, -1))[0]
             if yt == y[valid][i]:
                 score1 += 1
         score1 = score1 / test
         print(score1)
         self.scores.append(score1)
         self.estimators_.append(clf)
     self.score = sum(self.scores) / len(self.scores)
     return self
Example #37
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
         process. If it is an object then it is an object to be used as a
         cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                                    for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
Example #38
    def fit(self, X, y, groups=None, sample_weight=None):
        """ Fit ensemble classifers and the meta-classifier.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : numpy array, shape = [n_samples]
            Target values.

        groups : numpy array/None, shape = [n_samples]
            The group that each sample belongs to. This is used by specific
            folding strategies such as GroupKFold()

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights passed as sample_weights to each regressor
            in the regressors list as well as the meta_regressor.
            Raises error if some regressor does not support
            sample_weight in the fit() method.

        Returns
        -------
        self : object

        """
        if self.use_clones:
            self.clfs_ = clone(self.classifiers)
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
            self.meta_clf_ = self.meta_classifier
        if self.verbose > 0:
            print("Fitting %d classifiers..." % (len(self.classifiers)))

        final_cv = check_cv(self.cv, y, classifier=self.stratify)
        if isinstance(self.cv, int):
            # Override shuffle parameter in case of self generated
            # cross-validation strategy
            final_cv.shuffle = self.shuffle
            final_cv.random_state = self.random_state

        # Input validation.
        X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'])

        if sample_weight is None:
            fit_params = None
        else:
            fit_params = dict(sample_weight=sample_weight)

        meta_features = None

        for n, model in enumerate(self.clfs_):

            if self.verbose > 0:
                i = self.clfs_.index(model) + 1
                print("Fitting classifier%d: %s (%d/%d)" %
                      (i, _name_estimators((model,))[0][0],
                       i, len(self.clfs_)))

            if self.verbose > 2:
                if hasattr(model, 'verbose'):
                    model.set_params(verbose=self.verbose - 2)

            if self.verbose > 1:
                print(_name_estimators((model,))[0][1])

            prediction = cross_val_predict(
                    model, X, y, groups=groups, cv=final_cv,
                    n_jobs=self.n_jobs, fit_params=fit_params,
                    verbose=self.verbose, pre_dispatch=self.pre_dispatch,
                    method='predict_proba' if self.use_probas else 'predict')

            if not self.use_probas:
                prediction = prediction[:, np.newaxis]
            elif self.drop_last_proba:
                prediction = prediction[:, :-1]

            if meta_features is None:
                meta_features = prediction
            else:
                meta_features = np.column_stack((meta_features, prediction))

        if self.store_train_meta_features:
            self.train_meta_features_ = meta_features

        # Fit the base models correctly this time using ALL the training set
        for model in self.clfs_:
            if sample_weight is None:
                model.fit(X, y)
            else:
                model.fit(X, y, sample_weight=sample_weight)

        # Fit the secondary model
        if self.use_features_in_secondary:
            meta_features = self._stack_first_level_features(
                X,
                meta_features
            )

        if sample_weight is None:
            self.meta_clf_.fit(meta_features, y)
        else:
            self.meta_clf_.fit(meta_features, y,
                               sample_weight=sample_weight)

        return self
Example #39
    def fit(self, X, y, groups=None):
        """ Fit ensemble classifers and the meta-classifier.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : numpy array, shape = [n_samples]
            Target values.

        groups : numpy array/None, shape = [n_samples]
            The group that each sample belongs to. This is used by specific
            folding strategies such as GroupKFold()

        Returns
        -------
        self : object

        """
        if self.use_clones:
            self.clfs_ = [clone(clf) for clf in self.classifiers]
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
            self.meta_clf_ = self.meta_classifier
        if self.verbose > 0:
            print("Fitting %d classifiers..." % (len(self.classifiers)))

        final_cv = check_cv(self.cv, y, classifier=self.stratify)
        if isinstance(self.cv, int):
            # Override shuffle parameter in case of self generated
            # cross-validation strategy
            final_cv.shuffle = self.shuffle
        skf = list(final_cv.split(X, y, groups))

        all_model_predictions = np.array([]).reshape(len(y), 0)
        for model in self.clfs_:

            if self.verbose > 0:
                i = self.clfs_.index(model) + 1
                print("Fitting classifier%d: %s (%d/%d)" %
                      (i, _name_estimators((model,))[0][0],
                       i, len(self.clfs_)))

            if self.verbose > 2:
                if hasattr(model, 'verbose'):
                    model.set_params(verbose=self.verbose - 2)

            if self.verbose > 1:
                print(_name_estimators((model,))[0][1])

            if not self.use_probas:
                single_model_prediction = np.array([]).reshape(0, 1)
            else:
                single_model_prediction = np.array([]).reshape(0, len(set(y)))

            for num, (train_index, test_index) in enumerate(skf):

                if self.verbose > 0:
                    print("Training and fitting fold %d of %d..." %
                          ((num + 1), final_cv.get_n_splits()))

                try:
                    model.fit(X[train_index], y[train_index])
                except TypeError as e:
                    raise TypeError(str(e) + '\nPlease check that X and y'
                                    ' are NumPy arrays. If X and y are lists'
                                    ' of lists,\ntry passing them as'
                                    ' numpy.array(X)'
                                    ' and numpy.array(y).')
                except KeyError as e:
                    raise KeyError(str(e) + '\nPlease check that X and y'
                                   ' are NumPy arrays. If X and y are pandas'
                                   ' DataFrames,\ntry passing them as'
                                   ' X.values'
                                   ' and y.values.')

                if not self.use_probas:
                    prediction = model.predict(X[test_index])
                    prediction = prediction.reshape(prediction.shape[0], 1)
                else:
                    prediction = model.predict_proba(X[test_index])
                single_model_prediction = np.vstack(
                    [single_model_prediction.astype(prediction.dtype),
                     prediction])

            all_model_predictions = np.hstack(
                [all_model_predictions.astype(single_model_prediction.dtype),
                 single_model_prediction])

        if self.store_train_meta_features:
            # Store the meta features in the order of the
            # original X,y arrays
            reordered_indices = np.array([]).astype(y.dtype)
            for train_index, test_index in skf:
                reordered_indices = np.concatenate((reordered_indices,
                                                    test_index))
            self.train_meta_features_ = all_model_predictions[np.argsort(
                reordered_indices)]

        # We have to reorder the labels into the same order in which the
        # predictions were generated during CV (the folds effectively
        # shuffled them). We do the same with the features, which are only
        # needed if use_features_in_secondary is True.
        reordered_labels = np.array([]).astype(y.dtype)
        reordered_features = np.array([]).reshape((0, X.shape[1]))\
            .astype(X.dtype)
        for train_index, test_index in skf:
            reordered_labels = np.concatenate((reordered_labels,
                                               y[test_index]))
            reordered_features = np.concatenate((reordered_features,
                                                 X[test_index]))

        # Fit the base models correctly this time using ALL the training set
        for model in self.clfs_:
            model.fit(X, y)

        # Fit the secondary model
        if not self.use_features_in_secondary:
            self.meta_clf_.fit(all_model_predictions, reordered_labels)
        else:
            self.meta_clf_.fit(np.hstack((reordered_features,
                                          all_model_predictions)),
                               reordered_labels)

        return self
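
# --- Illustrative sketch (not part of the original example) ---
# The argsort trick used above: out-of-fold predictions are produced fold by
# fold, so concatenating the test indices and applying np.argsort restores the
# original row order. Tiny self-contained demonstration (toy data only):
import numpy as np
from sklearn.model_selection import KFold

y_demo = np.arange(10)
folds = list(KFold(n_splits=5, shuffle=True, random_state=0).split(y_demo))

stacked_preds = np.concatenate([y_demo[test] for _, test in folds])  # fold order
stacked_index = np.concatenate([test for _, test in folds])
restored = stacked_preds[np.argsort(stacked_index)]                  # original order
assert np.array_equal(restored, y_demo)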

    def fit(self, X, y, groups=None):
        """ Fit ensemble regressors and the meta-regressor.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : numpy array, shape = [n_samples]
            Target values.

        groups : numpy array/None, shape = [n_samples]
            The group that each sample belongs to. This is used by specific
            folding strategies such as GroupKFold()

        Returns
        -------
        self : object

        """
        if self.refit:
            self.regr_ = [clone(clf) for clf in self.regressors]
            self.meta_regr_ = clone(self.meta_regressor)
        else:
            self.regr_ = self.regressors
            self.meta_regr_ = self.meta_regressor

        kfold = check_cv(self.cv, y)
        if isinstance(self.cv, int):
            # Override shuffle parameter in case of self generated
            # cross-validation strategy
            kfold.shuffle = self.shuffle

        meta_features = np.zeros((X.shape[0], len(self.regressors)))

        #
        # The outer loop iterates over the base-regressors. Each regressor
        # is trained cv times and makes predictions, after which we train
        # the meta-regressor on their combined results.
        #
        for i, regr in enumerate(self.regressors):
            #
            # In the inner loop, the regressor is cloned and fitted on the
            # training part of each fold, and the held-out part is used for
            # prediction; after all cv folds, every data point has an
            # out-of-fold prediction.
            #
            # The advantage of this approach is that the meta-regressor is
            # trained on predictions for data points the base model never saw
            # during fitting, which makes the stack less susceptible to
            # overfitting. (A minimal standalone sketch follows this method.)
            #
            for train_idx, holdout_idx in kfold.split(X, y, groups):
                instance = clone(regr)
                instance.fit(X[train_idx], y[train_idx])
                y_pred = instance.predict(X[holdout_idx])
                meta_features[holdout_idx, i] = y_pred

        # save meta-features for training data
        if self.store_train_meta_features:
            self.train_meta_features_ = meta_features

        # Train meta-model on the out-of-fold predictions
        if self.use_features_in_secondary:
            self.meta_regr_.fit(np.hstack((X, meta_features)), y)
        else:
            self.meta_regr_.fit(meta_features, y)

        # Retrain base models on all data
        for regr in self.regr_:
            regr.fit(X, y)

        return self
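
# --- Illustrative sketch (not part of the original example) ---
# Minimal, self-contained version of the out-of-fold stacking loop above,
# assuming a toy regression dataset and base/meta regressors chosen purely for
# illustration.
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import KFold

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=0)
base_regressors = [Ridge(alpha=1.0), Lasso(alpha=0.1)]
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

meta_features = np.zeros((X_demo.shape[0], len(base_regressors)))
for i, regr in enumerate(base_regressors):
    for train_idx, holdout_idx in kfold.split(X_demo, y_demo):
        instance = clone(regr)                      # fresh copy per fold
        instance.fit(X_demo[train_idx], y_demo[train_idx])
        meta_features[holdout_idx, i] = instance.predict(X_demo[holdout_idx])

meta_regr = LinearRegression().fit(meta_features, y_demo)   # second-level model
for regr in base_regressors:                                 # refit on all data
    regr.fit(X_demo, y_demo)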
Example #41
    def fit(self):
        LOG.info('Start fitting ...')

        gs_cv_params = {'n_jobs': self.n_jobs, 'cv': _cv_build(self.cv_inner),
                        'verbose': 0}

        zscore_cv_auc = []
        zscore_cv_acc = []
        split_id = 0
        for dozs in [False, True]:
            LOG.info('Generate %sz-scored sample ...', '' if dozs else 'non ')
            X, y, groups = self._generate_sample(zscored=dozs)

            # The inner CV loop is a grid search on clf_params
            LOG.info('Creating ModelAndGridSearchCV')
            inner_cv = ModelAndGridSearchCV(self.param, **gs_cv_params)

            # Some sklearn's validations
            scoring = check_scoring(inner_cv, scoring=self._scorer)
            cv_outer = check_cv(_cv_build(self.cv_outer), y,
                                classifier=is_classifier(inner_cv))

            # Outer CV loop
            outer_cv_scores = []
            outer_cv_acc = []
            LOG.info('Starting nested cross-validation ...')
            for train, test in list(cv_outer.split(X, y, groups)):
                # Find the groups in the train set, in case inner CV is LOSO.
                fit_params = None
                if self.cv_inner.get('type') == 'loso':
                    train_groups = [groups[i] for i in train]
                    fit_params = {'groups': train_groups}

                result = nested_fit_and_score(
                    clone(inner_cv), X, y, scoring, train, test, fit_params=fit_params, verbose=1)

                # Test group has no positive cases
                if result is None:
                    continue

                score, clf = result
                test_group = list(set(groups[i] for i in test))[0]
                self._models.append({
                    # 'clf_type': clf_str,
                    'zscored': int(dozs),
                    'outer_split_id': split_id,
                    'left-out-sites': self.sites[test_group],
                    'best_model': clf.best_model_,
                    'best_params': clf.best_params_,
                    'best_score': clf.best_score_,
                    'best_index': clf.best_index_,
                    'cv_results': clf.cv_results_,
                    'cv_scores': score['test']['score'],
                    'cv_accuracy': score['test']['accuracy'],
                    'cv_params': clf.cv_results_['params'],
                    'cv_auc_means': clf.cv_results_['mean_test_score'],
                    'cv_splits': {'split%03d' % i: clf.cv_results_['split%d_test_score' % i]
                                  for i in list(range(clf.n_splits_))}
                })

                # Store the outer loop scores
                if score['test']['score'] is not None:
                    outer_cv_scores.append(score['test']['score'])
                outer_cv_acc.append(score['test']['accuracy'])
                split_id += 1

                # LOG.info(
                #     '[%s-%szs] Outer CV: roc_auc=%f, accuracy=%f, '
                #     'Inner CV: best roc_auc=%f, params=%s. ',
                #     clf.best_model_[0], 'n' if not dozs else '',
                #     score['test']['score'] if score['test']['score'] is not None else -1.0,
                #     score['test']['accuracy'],
                #     clf.best_score_, clf.best_model_[1])

            LOG.info('Outer CV loop finished, %s=%f (+/-%f), accuracy=%f (+/-%f)',
                     self._scorer,
                     np.mean(outer_cv_scores), 2 * np.std(outer_cv_scores),
                     np.mean(outer_cv_acc), 2 * np.std(outer_cv_acc))

            zscore_cv_auc.append(outer_cv_scores)
            zscore_cv_acc.append(outer_cv_acc)

        # Select best performing model
        best_inner_loops = [model['best_score'] for model in self._models]
        best_idx = np.argmax(best_inner_loops)
        self._best_model = self._models[best_idx]
        LOG.info('Inner CV [%d models compared] - best model %s-%szs, score=%f, params=%s',
                 len(best_inner_loops) * len(self._models[0]['cv_params']),
                 self._best_model['best_model'][0],
                 'n' if not self._best_model['zscored'] else '',
                 self._best_model['best_score'], self._best_model['best_params'])

        # Write out evaluation result
        best_zs = 1 if self._best_model['zscored'] else 0
        LOG.info('CV - estimated performance: %s=%f (+/-%f), accuracy=%f (+/-%f)',
                 self._scorer,
                 np.mean(zscore_cv_auc[best_zs]), 2 * np.std(zscore_cv_auc[best_zs]),
                 np.mean(zscore_cv_acc[best_zs]), 2 * np.std(zscore_cv_acc[best_zs]))
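
# --- Illustrative sketch (not part of the original example) ---
# The fit() above implements nested cross-validation by hand (an inner grid
# search selects hyper-parameters, an outer loop estimates the performance of
# that whole procedure). A compact sklearn-only equivalent of the same idea;
# the estimator and parameter grid are assumptions made purely for
# illustration:
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

inner_search = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=inner_cv, scoring='roc_auc')
outer_scores = cross_val_score(inner_search, X_demo, y_demo, cv=outer_cv, scoring='roc_auc')
print('nested CV roc_auc: %.3f (+/- %.3f)' % (outer_scores.mean(), 2 * outer_scores.std()))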
Example #42
    def fit(self, X, y=None, labels=None):
        #return self._fit(
        #    X, y, labels,
        #    parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit
        #)

        # FIXME code duplication from BaseSearchCV._fit
        estimator = self.estimator
        cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y, labels = indexable(X, y, labels)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                  'of samples (%i) than data (X: %i samples)'
                                  % (len(y), n_samples))

        n_splits = cv.get_n_splits(X, y, labels)

        if self.verbose > 0:
            # the BO loop evaluates n_initial_points + n_iter parameter settings
            n_candidates = self.n_initial_points + self.n_iter
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch
        # FIXME how to handle pre_dispatch


        # FIXME recursively getting new parameters to evaluate

#        parameter_iterable = ...  # the magic
#
#        # The evaluation (Parallel) stuff
#        out = Parallel(
#            n_jobs=self.n_jobs, verbose=self.verbose,
#            pre_dispatch=pre_dispatch
#        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
#                                  train, test, self.verbose, parameters,
#                                  self.fit_params, return_parameters=True,
#                                  error_score=self.error_score)
#            for parameters in parameter_iterable
#            for train, test in cv.split(X, y, labels))
#

        # n_fits on each (train, test)
        def cross_validation(raw_parameters):
            parameters = dict(zip(
                self.param_grid.keys(), raw_parameters
            ))  # TODO more robust way of doing this
            print(parameters)

            return Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                      train, test, self.verbose, parameters,
                                      self.fit_params, return_parameters=True,
                                      error_score=self.error_score)
               for train, test in cv.split(X, y, labels))

        x = cartesian_product(*self.param_grid.values())

        # FIXME implement as non-recursive
        def bo_(x_obs, y_obs, n_iter):
            if n_iter > 0:
                kernel = kernels.Matern() + kernels.WhiteKernel()
                gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16)
                gp.fit(x_obs, 1-y_obs)

                a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs)

                argmax_f_x_ = x[np.argmax(a(x))]

                # heavy evaluation
                f_argmax_f_x_ = cross_validation(argmax_f_x_)

                y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T

                return f_argmax_f_x_ + bo_(
                    x_obs=np.vstack((x_obs, argmax_f_x_)),
                    y_obs=np.vstack((y_obs, y_ob)),
                    n_iter=n_iter-1,
                )

            else:
                return []


        # FIXME use a more informative initial design (cf. numerical
        # probabilistic integration methods); Sobol initialization?

        sampled_x_ind = np.random.choice(
            x.shape[0],
            size=self.n_initial_points,
            replace=False,
        )
        print(sampled_x_ind)

        x_obs = x[sampled_x_ind]
        f_x_obs = list(map(cross_validation, x_obs))

        y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T

        out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter)

        n_fits = len(out)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_splits):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_splits]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_splits)
            scores.append((score, parameters))

            grid_scores.append(_search._CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))

        self.grid_scores_ = grid_scores

        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
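
# --- Illustrative sketch (not part of the original example) ---
# The a_EI acquisition helper used above is defined elsewhere in the original
# code. A common expected-improvement formulation for minimising a target
# (here 1 - CV score) can be sketched as follows; this is an assumption about
# what such a helper typically does, not the original implementation.
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor, kernels

def expected_improvement(gp, x_candidates, y_best, xi=0.01):
    """EI for minimisation: expected amount by which y_best can be improved."""
    mu, sigma = gp.predict(x_candidates, return_std=True)
    sigma = np.maximum(sigma, 1e-12)       # avoid division by zero
    improvement = y_best - mu - xi
    z = improvement / sigma
    return improvement * norm.cdf(z) + sigma * norm.pdf(z)

# tiny usage example on a 1-D toy objective
x_obs = np.array([[0.1], [0.4], [0.9]])
y_obs = np.array([0.8, 0.3, 0.6])          # pretend these are 1 - CV score
gp = GaussianProcessRegressor(kernel=kernels.Matern() + kernels.WhiteKernel())
gp.fit(x_obs, y_obs)
x_grid = np.linspace(0, 1, 101).reshape(-1, 1)
next_x = x_grid[np.argmax(expected_improvement(gp, x_grid, y_obs.min()))]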
Example #43
    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        X, y, groups = indexable(X, y, groups)

        cv = check_cv(self.cv, y, classifier=True)
        n_splits = cv.get_n_splits(X, y, groups)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            LOG.info("Fitting %d folds for each of %d candidates, totalling"
                     " %d fits", n_splits, n_candidates, n_candidates * n_splits)

        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_model_fit_and_score)(
            estimator, X, y, self.scoring, train, test, self.verbose, parameters,
            fit_params=self.fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True,
            return_times=True, return_parameters=True,
            error_score=self.error_score)
          for estimator, parameters in parameter_iterable
          for train, test in cv_iter)

        # if return_train_score is True, "out" also contains the train scores
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Compute the (weighted) mean and std for the test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index][1]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            _, param_values = params
            for name, value in param_values.items():
                # An all-masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index.
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits
        self.best_model_ = candidate_params[best_index]

        if self.refit:
            # build best estimator and fit
            best_estimator = _clf_build(self.best_model_[0])
            best_estimator.set_params(**best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
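
# --- Illustrative sketch (not part of the original example) ---
# Minimal demonstration of the aggregation performed by _store above: per-split
# scores are averaged (optionally weighted by test-set size), a weighted std is
# computed manually, and candidates are ranked by mean test score. Toy numbers
# only.
import numpy as np
from scipy.stats import rankdata

scores = np.array([[0.80, 0.82, 0.78],     # candidate 0, three splits
                   [0.85, 0.79, 0.81]])    # candidate 1
weights = np.array([50, 40, 45])           # test-set sizes per split

means = np.average(scores, axis=1, weights=weights)
stds = np.sqrt(np.average((scores - means[:, np.newaxis]) ** 2,
                          axis=1, weights=weights))
ranks = rankdata(-means, method='min').astype(np.int32)
print(means, stds, ranks)                  # rank 1 = best weighted mean score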