Example #1
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        X, y = indexable(X, y)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        base_estimator = clone(self.estimator)

        best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                               self.scorer_, self.fit_params, self.iid)
        best = best.compute()

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if isinstance(base_estimator, Pipeline):
            base_estimator = base_estimator.to_sklearn().compute()

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = base_estimator.set_params(**best.parameters)
            if y is not None:
                self.best_estimator_ = best_estimator.fit(
                    X, y, **self.fit_params)
            else:
                self.best_estimator_ = best_estimator.fit(X, **self.fit_params)
        return self
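The example above is a dask-flavored variant of scikit-learn's parameter search: `best_parameters` builds the whole sweep lazily and `.compute()` materializes the winning setting before the optional refit on the full data. For orientation, a minimal eager sketch of the same select-by-mean-CV-score-then-refit pattern, written against public scikit-learn APIs only (the `simple_grid_search` helper is illustrative, not part of the library or of this project), could look like this:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import ParameterGrid, cross_val_score

def simple_grid_search(estimator, param_grid, X, y, cv=5, scoring=None):
    """Pick the parameter setting with the best mean CV score, then refit on all data."""
    best_score, best_params = -np.inf, None
    for params in ParameterGrid(param_grid):
        scores = cross_val_score(clone(estimator).set_params(**params),
                                 X, y, cv=cv, scoring=scoring)
        if scores.mean() > best_score:
            best_score, best_params = scores.mean(), params
    # refit the best configuration on the entire dataset, mirroring self.refit above
    best_estimator = clone(estimator).set_params(**best_params).fit(X, y)
    return best_estimator, best_params, best_score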
Example #2
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        X, y = indexable(X, y)
        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        base_estimator = clone(self.estimator)

        best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                               self.scorer_, self.fit_params, self.iid)
        best = best.compute()

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score


        if isinstance(base_estimator, Pipeline):
            base_estimator = base_estimator.to_sklearn().compute()

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = base_estimator.set_params(**best.parameters)
            if y is not None:
                self.best_estimator_ = best_estimator.fit(X, y, **self.fit_params)
            else:
                self.best_estimator_ = best_estimator.fit(X, **self.fit_params)
        return self
Example #3
    def fit(self, X, y):
        X, y = check_X_y(X,
                         y,
                         force_all_finite=False,
                         multi_output=self.multi_output)
        _check_param_grid(self.param_grid)

        cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if comm_rank == 0:
            self._fit_master(X, y, cv)
        else:
            self._fit_slave()

        return self
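This example (like #4 below) splits the grid search across MPI ranks: rank 0 acts as the master and the remaining ranks run `_fit_slave`, whose bodies are not shown here. A rough sketch of that rank-based dispatch with mpi4py, scattering parameter settings from the master and gathering mean CV scores back (the `mpi_grid_search` helper and its chunking scheme are assumptions, not this project's code), might look like:

from mpi4py import MPI
from sklearn.base import clone
from sklearn.model_selection import ParameterGrid, cross_val_score

def mpi_grid_search(estimator, param_grid, X, y, cv=5):
    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()

    if rank == 0:
        # master: split the parameter settings into one chunk per rank
        settings = list(ParameterGrid(param_grid))
        chunks = [settings[i::size] for i in range(size)]
    else:
        chunks = None

    # every rank receives its share of parameter settings and scores them locally
    my_settings = comm.scatter(chunks, root=0)
    my_results = [(params, cross_val_score(clone(estimator).set_params(**params),
                                           X, y, cv=cv).mean())
                  for params in my_settings]

    # master collects all (params, score) pairs and keeps the best one
    all_results = comm.gather(my_results, root=0)
    if rank == 0:
        flat = [r for part in all_results for r in part]
        return max(flat, key=lambda item: item[1])
    return None

Run with e.g. `mpiexec -n 4 python script.py`; every rank needs access to `X` and `y`, just as every rank calls `fit(X, y)` in the examples above.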
Example #4
    def fit(self, X, y):
        if master:
            LOG.info("comm_size:" + str(comm_size))
        X, y = check_X_y(X, y, force_all_finite=False, multi_output=self.multi_output, accept_sparse='csr')
        _check_param_grid(self.param_grid)

        cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        if master:
            LOG.info("cv length:" + str(len(cv)))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if master:
            self._fit_master(X, y, cv)
        else:
            self._fit_slave()
        return self
Example #5
def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list, dynamic_X, y=None, scoring=None, cv=None,
                verbose=0, fit_params=None):

    print "dynamic cross val mit %s" % esa_feature_list + unigram_feature_list
    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X= tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(cv, X, y, classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)
    scores = []

    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X from scratch on every CV step
        for i in range(0, len(fv)):  # each i corresponds to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec
            for feature in esa_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the ESA feature
            for feature in unigram_feature_list:
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])  # update the i-th feature dict with the unigram feature

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(cross_validation.clone(estimator), X, y, scorer,
                        train, test, verbose, None, fit_params))

        cross_val_step += 1


    return np.array(scores)[:, 0]
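`dynamic_cross_val_score` rebuilds the feature dictionaries before every fold so that fold-specific ESA and unigram features from `dynamic_X` are merged in, then re-vectorizes and scores with `_fit_and_score`, a private helper of older scikit-learn releases. A rough sketch of the same per-fold rebuild against the current public `model_selection` API (with a caller-supplied `rebuild_features` callback standing in for the merge logic above; both names are assumptions) could look like this:

import copy
import numpy as np
from sklearn.base import clone, is_classifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import check_scoring
from sklearn.model_selection import KFold, StratifiedKFold

def dynamic_cv_score(estimator, fv, dynamic_X, y, rebuild_features,
                     n_splits=5, scoring=None):
    """Score `estimator` with CV, rebuilding the feature dicts for every fold."""
    splitter = (StratifiedKFold if is_classifier(estimator) else KFold)(n_splits=n_splits)
    scorer = check_scoring(estimator, scoring=scoring)
    vec = DictVectorizer()
    scores = []
    for step, (train, test) in enumerate(splitter.split(np.zeros(len(y)), y)):
        # merge the fold-specific dynamic features into a fresh copy of the dicts
        fv_step = rebuild_features(copy.deepcopy(fv), dynamic_X, step)
        X = vec.fit_transform(fv_step).toarray()
        est = clone(estimator).fit(X[train], y[train])
        scores.append(scorer(est, X[test], y[test]))
    return np.array(scores)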
Example #6
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y = indexable(X, y)

        cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

        base_estimator = clone(self.estimator)
        out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_, train,
                              test, self.verbose, parameters, self.fit_params,
                              return_parameters=True,
                              error_score=self.error_score)
               for parameters in parameter_iterable
               for train, test in cv]
        self._dask_value = value(out)

        out, = compute(value(out))
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
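The aggregation loop in `_fit` above collapses the flat list of per-fold results into one score tuple per parameter setting, weighting each fold by its test-set size when `iid` is set. A small self-contained sketch of that weighting (the `aggregate_fold_scores` helper and its three-field tuples are illustrative, not the scikit-learn internals) is:

import numpy as np
from collections import namedtuple

CVScore = namedtuple("CVScore", ["parameters", "mean_validation_score", "cv_validation_scores"])

def aggregate_fold_scores(out, n_folds, iid=True):
    """Collapse per-fold (score, n_test, parameters) tuples into per-setting means."""
    grid_scores = []
    for start in range(0, len(out), n_folds):
        fold_block = out[start:start + n_folds]
        fold_scores = np.array([score for score, _, _ in fold_block])
        n_test = np.array([n for _, n, _ in fold_block], dtype=float)
        parameters = fold_block[0][2]
        if iid:
            # weight each fold's score by the number of test samples it saw
            mean = float(np.sum(fold_scores * n_test) / n_test.sum())
        else:
            mean = float(fold_scores.mean())
        grid_scores.append(CVScore(parameters, mean, fold_scores))
    best = max(grid_scores, key=lambda t: t.mean_validation_score)
    return best, grid_scores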
    def fit(self, X, y, fit_params=None, predict_params=None, X_test=None, y_test=None):
        """Do nested cross-validation.

        If ``X_test`` and ``y_test`` are not provided, nested cross-validation using
        ``X`` and ``y`` is performed, i.e., data is first split into *K* folds, where
        *K-1* folds are used for training and hyper-parameter selection and the
        remaining fold for testing. The training portion is again split into *T* folds
        to perform a grid-search over hyper-parameters. The parameters that achieved the
        best average performance across the *T* inner cross-validation folds are selected.
        Using these parameters, a model is trained on the entire training data and applied
        to the *K*-th testing fold.

        If ``X_test`` and ``y_test`` are provided, a regular cross-validation is performed on
        ``X`` and ``y`` to determine hyper-parameters as for the inner cross-validation above.
        Using the best performing parameters, a model is trained on all of ``X`` and ``y`` and
        applied to ``X_test`` and ``y_test`` for testing.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Feature matrix.

        y : structured array, shape = [n_samples]
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        fit_params : dict
            Additional arguments passed to the fit method.

        predict_params : dict
            Additional arguments passed to the predict method.

        X_test : array-like, shape = [n_test_samples, n_features]
            Hold-out data to perform testing on.

        y_test : array-like or sequence, shape = [n_test_samples]
            Target values of hold-out test data.

        Returns
        -------
        self
        """
        if y.dtype.names is None:
            X, y = check_X_y(X, y)
        else:
            X, event, time = check_arrays_survival(X, y, force_all_finite=False)
            y = numpy.fromiter(zip(event, time), dtype=[('event', numpy.bool), ('time', numpy.float64)])

        if X_test is not None:
            X_test, event_test, time_test = check_arrays_survival(X_test, y_test, force_all_finite=False)
            y_test = numpy.fromiter(zip(event_test, time_test), dtype=[('event', numpy.bool), ('time', numpy.float64)])

        cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        self._dview, self._lview = self._init_cluster()
        if X_test is None:
            self._fit(X, y, cv, fit_params, predict_params)
        else:
            self._fit_holdout(X, y, fit_params, predict_params, X_test, y_test)

        del self._dview
        del self._lview

        return self
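The docstring of `fit` above describes standard nested cross-validation: an inner loop selects hyper-parameters, and an outer loop (or the supplied hold-out set) estimates generalization performance of the refit model. With current scikit-learn the same idea can be expressed directly; the sketch below is a generic illustration that leaves out the survival-specific scoring and the IPython-parallel cluster setup used by this class:

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

def nested_cv(estimator, param_grid, X, y, inner_splits=5, outer_splits=5, scoring=None):
    """Outer CV estimates the performance of the inner grid search's best model."""
    inner = GridSearchCV(estimator, param_grid,
                         cv=KFold(n_splits=inner_splits), scoring=scoring)
    # each outer training fold runs a full inner grid search; the refit best
    # model is then scored on the corresponding outer test fold
    return cross_val_score(inner, X, y, cv=KFold(n_splits=outer_splits), scoring=scoring)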