def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    X, y = indexable(X, y)
    cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

    base_estimator = clone(self.estimator)

    best = best_parameters(base_estimator, cv, X, y, parameter_iterable,
                           self.scorer_, self.fit_params, self.iid)
    best = best.compute()

    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if isinstance(base_estimator, Pipeline):
        base_estimator = base_estimator.to_sklearn().compute()

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = base_estimator.set_params(**best.parameters)
        if y is not None:
            self.best_estimator_ = best_estimator.fit(X, y, **self.fit_params)
        else:
            self.best_estimator_ = best_estimator.fit(X, **self.fit_params)

    return self
def fit(self, X, y):
    X, y = check_X_y(X, y, force_all_finite=False,
                     multi_output=self.multi_output)
    _check_param_grid(self.param_grid)

    cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if comm_rank == 0:
        self._fit_master(X, y, cv)
    else:
        self._fit_slave()

    return self
def fit(self, X, y):
    if master:
        LOG.info("comm_size:" + str(comm_size))

    X, y = check_X_y(X, y, force_all_finite=False,
                     multi_output=self.multi_output, accept_sparse='csr')
    _check_param_grid(self.param_grid)

    cv = check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
    if master:
        LOG.info("cv length:" + str(len(cv)))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if master:
        self._fit_master(X, y, cv)
    else:
        self._fit_slave()

    return self
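The two fit methods above dispatch on MPI rank: rank 0 drives the parameter search while the remaining ranks run a worker loop. A minimal sketch of the module-level globals such code typically relies on, using mpi4py; the exact variable names here are assumptions, not taken from the snippets:

import logging

from mpi4py import MPI

# Rank/size of the default communicator; rank 0 acts as the master process.
comm = MPI.COMM_WORLD
comm_rank = comm.Get_rank()
comm_size = comm.Get_size()
master = comm_rank == 0

LOG = logging.getLogger(__name__)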
import copy

import numpy as np
from sklearn import cross_validation
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def dynamic_cross_val_score(estimator, fv, esa_feature_list, unigram_feature_list,
                            dynamic_X, y=None, scoring=None, cv=None, verbose=0,
                            fit_params=None):
    print("dynamic cross val with %s" % (esa_feature_list + unigram_feature_list))

    vec = DictVectorizer()
    tfidf = TfidfTransformer()

    X = vec.fit_transform(fv).toarray()
    # X = tfidf.fit_transform(X).toarray()

    X, y = cross_validation.indexable(X, y)

    cv = cross_validation.check_cv(cv, X, y,
                                   classifier=cross_validation.is_classifier(estimator))
    scorer = cross_validation.check_scoring(estimator, scoring=scoring)

    scores = []
    cross_val_step = 0
    for train, test in cv:

        fv_copy = copy.deepcopy(fv)

        # rebuild X in every step
        for i in range(0, len(fv)):  # each i refers to one feature dict
            feature_dict = fv_copy[i]
            dynamic_vec = dynamic_X[cross_val_step]  # points to esa_vec

            for feature in esa_feature_list:
                # update the i-th feature dict with the ESA feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

            for feature in unigram_feature_list:
                # update the i-th feature dict with the unigram feature
                feature_dict.update(dynamic_vec[find_index_for_dynamic_feature(feature)][i])

        X = vec.fit_transform(fv_copy).toarray()
        # X = tfidf.fit_transform(X).toarray()

        scores.append(cross_validation._fit_and_score(cross_validation.clone(estimator), X, y,
                                                      scorer, train, test, verbose, None,
                                                      fit_params))

        cross_val_step += 1

    return np.array(scores)[:, 0]
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters."""
    cv = self.cv
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    X, y = indexable(X, y)
    cv = check_cv(cv, X, y, classifier=is_classifier(self.estimator))

    base_estimator = clone(self.estimator)

    out = [_fit_and_score(clone(base_estimator), X, y, self.scorer_,
                          train, test, self.verbose, parameters,
                          self.fit_params, return_parameters=True,
                          error_score=self.error_score)
           for parameters in parameter_iterable
           for train, test in cv]

    self._dask_value = value(out)
    out, = compute(value(out))

    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
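The loop above averages per-fold scores either uniformly or, when iid is set, weighted by the number of test samples in each fold. A minimal standalone sketch of that aggregation step; the fold scores and sizes below are made-up values, not taken from the source:

# Hypothetical per-fold results. With iid=True each fold's score is weighted by
# its test-fold size before averaging; otherwise a plain mean over folds is taken.
fold_scores = [0.80, 0.90, 0.70]
fold_sizes = [100, 60, 40]

iid_score = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / float(sum(fold_sizes))
plain_score = sum(fold_scores) / float(len(fold_scores))

print(iid_score)    # 0.81 -- pulled towards the larger first fold
print(plain_score)  # 0.80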
def fit(self, X, y, fit_params=None, predict_params=None, X_test=None, y_test=None):
    """Do nested cross-validation.

    If ``X_test`` and ``y_test`` are not provided, nested cross-validation using ``X`` and ``y``
    is performed, i.e., data is first split into *K* folds, where *K-1* folds are used for
    training and hyper-parameter selection and the remaining fold for testing. The training
    portion is again split into *T* folds to perform a grid-search over hyper-parameters.
    The parameters that achieved the best average performance across the *T* inner
    cross-validation folds are selected. Using these parameters, a model is trained on the
    entire training data and applied to the *K*-th testing fold.

    If ``X_test`` and ``y_test`` are provided, a regular cross-validation is performed on
    ``X`` and ``y`` to determine hyper-parameters as for the inner cross-validation above.
    Using the best performing parameters, a model is trained on all of ``X`` and ``y`` and
    applied to ``X_test`` and ``y_test`` for testing.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Feature matrix.

    y : structured array, shape = [n_samples]
        A structured array containing the binary event indicator as first field,
        and time of event or time of censoring as second field.

    fit_params : dict
        Additional arguments passed to the fit method.

    predict_params : dict
        Additional arguments passed to the predict method.

    X_test : array-like, shape = [n_test_samples, n_features]
        Hold-out data to perform testing on.

    y_test : array-like or sequence, shape = [n_test_samples]
        Target values of hold-out test data.

    Returns
    -------
    self
    """
    if y.dtype.names is None:
        X, y = check_X_y(X, y)
    else:
        X, event, time = check_arrays_survival(X, y, force_all_finite=False)
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', numpy.bool), ('time', numpy.float64)])

    if X_test is not None:
        X_test, event_test, time_test = check_arrays_survival(X_test, y_test,
                                                              force_all_finite=False)
        y_test = numpy.fromiter(zip(event_test, time_test),
                                dtype=[('event', numpy.bool), ('time', numpy.float64)])

    cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    self._dview, self._lview = self._init_cluster()
    if X_test is None:
        self._fit(X, y, cv, fit_params, predict_params)
    else:
        self._fit_holdout(X, y, fit_params, predict_params, X_test, y_test)

    del self._dview
    del self._lview

    return self
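The docstring above describes nested cross-validation over survival data with a structured ``(event, time)`` target. A minimal usage sketch under assumed names; the estimator class, its constructor arguments, and the data arrays below are hypothetical and not taken from the source:

import numpy

# Build a structured survival target of the shape the method expects (hypothetical data).
event = numpy.array([True, False, True, True])
time = numpy.array([12.0, 24.0, 6.0, 18.0])
y = numpy.empty(event.shape[0], dtype=[('event', numpy.bool_), ('time', numpy.float64)])
y['event'] = event
y['time'] = time

X = numpy.random.rand(4, 3)  # hypothetical feature matrix

# `nested_cv` stands for an instance of the class this method belongs to, e.g.
# nested_cv = NestedGridSearchCV(estimator, param_grid, scoring, cv=5)  # assumed constructor
# nested_cv.fit(X, y)                                   # nested CV on X, y
# nested_cv.fit(X, y, X_test=X_hold, y_test=y_hold)     # grid-search on X, y; test on hold-out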