def fit_grid_point(base_estimator, parameters, X, y, sample_weight,
                   train, test, verbose, **fit_params):
    """Clone ``base_estimator``, apply ``parameters`` and fit it on one split.

    Parameters
    ----------
    base_estimator : estimator object
        Template estimator. It is cloned, so the instance passed in is not
        the one that gets fitted.
    parameters : dict
        Parameter setting applied to the clone through ``set_params``.
    X, y, sample_weight
        Data, targets and optional per-sample weights; they are handed to
        ``_safe_split`` together with the split indices.
    train, test
        Index sets selecting the training and test samples of this split.
    verbose : int
        Progress lines are printed when greater than 1.
    **fit_params
        Extra keyword arguments forwarded to ``estimator.fit``.

    Returns
    -------
    estimator, parameters, train, test
        The fitted clone followed by the inputs identifying the grid point.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and a parenthesised single-argument ``print`` give the
        # same output on Python 2 and Python 3, unlike ``iteritems()`` and
        # the print statement.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)
    # NOTE(review): the test split is not used in this function; the call is
    # kept so any checks performed by _safe_split still run.
    _safe_split(estimator, X, y, sample_weight, test, train)

    if sample_weight is not None:
        # Copy so the caller's keyword dict is never modified.
        fit_params = fit_params.copy()
        fit_params['sample_weight'] = sample_weight_train

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))

    return estimator, parameters, train, test
def fit_grid_point(base_estimator, parameters, X, y, sample_weight,
                   train, test, verbose, **fit_params):
    """Run fit on one set of parameters.

    A clone of ``base_estimator`` receives ``parameters`` via ``set_params``
    and is fitted on the training part of the split.

    Parameters
    ----------
    base_estimator : estimator object
        Estimator to clone; the passed instance itself is not fitted.
    parameters : dict
        Parameters set on the clone.
    X, y, sample_weight
        Data, targets and optional sample weights passed to ``_safe_split``.
    train, test
        Indices of the training and test samples.
    verbose : int
        Values above 1 enable start/end progress messages.
    **fit_params
        Additional keyword arguments for ``estimator.fit``.

    Returns
    -------
    tuple
        ``(estimator, parameters, train, test)`` where ``estimator`` is the
        fitted clone.
    """
    report = verbose > 1
    if report:
        start_time = time.time()
        # Portable across Python 2 and 3: ``items()`` instead of
        # ``iteritems()``, and ``print`` called with one parenthesised value.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)
    # NOTE(review): result unused here; call retained for whatever validation
    # _safe_split performs on the test indices.
    _safe_split(estimator, X, y, sample_weight, test, train)

    if sample_weight is not None:
        # Work on a copy so the caller's kwargs stay untouched.
        fit_params = dict(fit_params)
        fit_params['sample_weight'] = sample_weight_train

    fit_args = (X_train,) if y_train is None else (X_train, y_train)
    estimator.fit(*fit_args, **fit_params)

    if report:
        elapsed = logger.short_format_time(time.time() - start_time)
        end_msg = "%s -%s" % (msg, elapsed)
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))

    return estimator, parameters, train, test
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    """Fit ``estimator`` on the train split and score it on both splits.

    Returns the tuple ``(test_score, n_samples_test, train_score,
    n_samples_train, scoring_time)``, where ``scoring_time`` covers fitting
    plus both scoring calls. A ``RuntimeError`` is raised when either index
    set is empty.
    """
    chatty = verbose > 1
    if chatty:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    train_is_empty = num_samples(train) == 0
    if train_is_empty or num_samples(test) == 0:
        raise RuntimeError(
            'Cross validation error in fit_estimator. The total data set '
            'contains %d elements, which were split into a training set '
            'of %d elements and a test set of %d elements. Unfortunately, '
            'you can\'t have a %s set with 0 elements.' % (
                num_samples(X), num_samples(train), num_samples(test),
                'training' if train_is_empty else 'test'))

    # adjust length of sample weights
    n_samples = num_samples(X)

    def _restrict_to_train(value):
        # Only values with one entry per sample are indexed by ``train``.
        if hasattr(value, '__len__') and len(value) == n_samples:
            return np.asarray(value)[train]
        return value

    given_params = fit_params if fit_params is not None else {}
    fit_params = {key: _restrict_to_train(value)
                  for key, value in given_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    nested = is_msmbuilder_estimator(estimator)
    n_samples_test = num_samples(X_test, is_nested=nested)
    n_samples_train = num_samples(X_train, is_nested=nested)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if chatty:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params):
    """Fit estimator and compute decision values for a given dataset split.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'decision_function'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level. Not used by this function.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    Returns
    -------
    preds : sequence
        Result of calling 'estimator.decision_function' on the test split
        (not 'estimator.predict').

    test : array-like
        This is the value of the test parameter
    """
    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, _ = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    preds = estimator.decision_function(X_test)
    return preds, test
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    """Fit ``estimator`` on the train split and score it on both splits.

    Returns ``(test_score, n_samples_test, train_score, n_samples_train,
    scoring_time)``; ``scoring_time`` spans the fit and both scoring calls.
    """
    log_progress = verbose > 1
    if log_progress:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # adjust length of sample weights
    n_samples = _num_samples(X)
    given_params = fit_params if fit_params is not None else {}
    fit_params = {}
    for key, value in given_params.items():
        # Values with one entry per sample are restricted to the train rows.
        if hasattr(value, '__len__') and len(value) == n_samples:
            value = np.asarray(value)[train]
        fit_params[key] = value

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    uses_mixtape_api = _is_mixtape_estimator(estimator)
    n_samples_test = _num_samples(X_test, mixtape_api=uses_mixtape_api)
    n_samples_train = _num_samples(X_train, mixtape_api=uses_mixtape_api)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if log_progress:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
def test_safe_split_with_precomputed_kernel():
    """_safe_split on a precomputed kernel must agree with splitting raw data.

    The linear kernel of the raw train/test rows is compared against the
    sub-kernel that ``_safe_split`` extracts from the full Gram matrix.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    raw_clf = SVC()
    kernel_clf = SVC(kernel="precomputed")

    splitter = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0)
    train_idx, test_idx = list(splitter)[0]

    # Training part: kernel between training rows and themselves.
    X_train, _ = cval._safe_split(raw_clf, X, y, train_idx)
    K_train, _ = cval._safe_split(kernel_clf, K, y, train_idx)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))

    # Test part: kernel between test rows and training rows.
    X_test, _ = cval._safe_split(raw_clf, X, y, test_idx, train_idx)
    K_test, _ = cval._safe_split(kernel_clf, K, y, test_idx, train_idx)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Score every boosting stage of ``estimator`` on the test split.

    No fitting happens here: ``estimator.staged_predict`` is called directly,
    so the estimator is expected to be fitted already.

    Parameters
    ----------
    estimator : estimator object providing ``staged_predict`` and
        ``n_estimators``.
    parameters : dict
        Parameter setting of this grid point; copied once per stage with
        ``n_estimators`` overridden.
    min_n_estimators : int
        Stages with fewer estimators than this are skipped.
    X, y, sample_weight
        Data, targets and optional sample weights passed to ``_safe_split``.
    score_func : callable
        Called as ``score_func(y_test, y_pred, **kwargs)``; ``sample_weight``
        is passed only when ``sample_weight`` is not None.
    train, test
        Indices of the training and test samples.
    verbose : int
        Values above 1 enable start/end progress messages.

    Returns
    -------
    all_scores : list
        One score per stage from ``min_n_estimators`` to
        ``estimator.n_estimators``.
    all_clf_params : list of dict
        The parameter dict matching each score.
    n_test_samples : list
        Test-set size, repeated once per score.
    """
    if verbose > 1:
        start_time = time.time()
        # ``items()`` and parenthesised ``print`` work on Python 2 and 3.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early: repeat the last score for the stages
    # that were never reached
    n_expected = estimator.n_estimators - min_n_estimators + 1
    n_missing = n_expected - len(all_scores)
    if n_missing > 0:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Continue counting from the last real stage. Using the list index
        # here would give wrong ``n_estimators`` when min_n_estimators > 1.
        last_n_estimators = last_clf_params['n_estimators']
        for offset in range(1, n_missing + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = last_n_estimators + offset
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))

    return all_scores, all_clf_params, n_test_samples
def test_kernel_precomputed(self):
    """Fit on a precomputed RBF kernel and check the concordance index.

    The first 50 samples form the test set, the remainder the training set.
    """
    from sklearn.metrics.pairwise import pairwise_kernels
    from sklearn.cross_validation import _safe_split

    n_samples = self.x.shape[0]
    test_idx = numpy.arange(50)
    train_idx = numpy.arange(50, n_samples)

    model = MinlipSurvivalAnalysis(kernel="precomputed", solver="cvxpy")
    gram = pairwise_kernels(self.x, metric="rbf")

    X_fit, y_fit = _safe_split(model, gram, self.y, train_idx)
    X_test, y_test = _safe_split(model, gram, self.y, test_idx, train_idx)

    model.fit(X_fit, y_fit)
    predicted = model.predict(X_test)

    actual = concordance_index_censored(
        y_test['cens'], y_test['time'], predicted)
    expected = numpy.array([0.508748, 378, 365, 0, 0])

    assert_array_almost_equal(expected, actual)
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Compute one test score per boosting stage of ``estimator``.

    The estimator is not fitted here; ``staged_predict`` is called on it
    directly, so it is expected to have been fitted beforehand.

    Parameters
    ----------
    estimator : estimator object with ``staged_predict`` and ``n_estimators``.
    parameters : dict
        Grid-point parameters; a copy with ``n_estimators`` overridden is
        produced for every scored stage.
    min_n_estimators : int
        Stages below this number of estimators are not scored.
    X, y, sample_weight
        Data, targets and optional sample weights given to ``_safe_split``.
    score_func : callable
        Invoked as ``score_func(y_test, y_pred, **kwargs)``, with a
        ``sample_weight`` keyword only when ``sample_weight`` is not None.
    train, test
        Indices of the training and test samples.
    verbose : int
        Progress messages are printed when greater than 1.

    Returns
    -------
    all_scores, all_clf_params, n_test_samples : lists of equal length
        Scores, the parameter dict for each score, and the test-set size
        repeated per score.
    """
    report = verbose > 1
    if report:
        start_time = time.time()
        # Python 2/3 portable replacements for ``iteritems()`` and the
        # print statement.
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    def _record(score, base_params, n_estimators):
        # Append one (score, params, n_test) triple to the result lists.
        clf_para = copy(base_params)
        clf_para['n_estimators'] = n_estimators
        all_scores.append(score)
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        _record(score_func(y_test, y_pred, **test_score_params),
                parameters, i + 1)

    # boosting may have stopped early
    n_expected = estimator.n_estimators - min_n_estimators + 1
    n_recorded = len(all_scores)
    if n_recorded < n_expected:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Padded entries continue from the last real stage; the list index
        # is not the stage number when min_n_estimators > 1.
        next_n_estimators = last_clf_params['n_estimators'] + 1
        for _ in range(n_recorded, n_expected):
            _record(last_score, last_clf_params, next_n_estimators)
            next_n_estimators += 1

    if report:
        elapsed = logger.short_format_time(time.time() - start_time)
        end_msg = "%s -%s" % (msg, elapsed)
        print("[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg))

    return all_scores, all_clf_params, n_test_samples
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on the train split and score it with its own ``score``.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'score'
    X, y
        Data and targets, split through ``_safe_split``.
    train, test
        Indices of the training and test samples.
    parameters : dict or None
        Parameters set on the estimator before fitting.
    fit_params : dict or None
        Keyword arguments for ``estimator.fit``; None means no extra
        arguments.
    return_parameters : bool
        Append ``parameters`` to the returned list.
    scorer, verbose, return_train_score, error_score
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    list
        ``[test_score, n_test_samples, scoring_time]`` plus ``parameters``
        when ``return_parameters`` is true. ``scoring_time`` covers the
        split, the fit and the scoring.
    """
    # ``**None`` would raise TypeError, so treat None as "no extra arguments".
    fit_params = fit_params if fit_params is not None else {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = estimator.score(X_test, y_test)

    scoring_time = time.time() - start_time

    ret = [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, error_score='raise',
                            extraOut="auto"):
    """Fit and score one split, returning extra per-fold information.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
    X, y
        Data and targets, split through ``_safe_split``.
    scorer : callable
        Passed to ``_score`` together with the estimator and a split.
    train, test : arrays
        Indices of the training and test samples; their ``shape[0]`` is
        reported in the extras.
    verbose : int
        Above 1 prints progress lines, above 2 also the test score.
    parameters : dict or None
        Parameters set on the estimator before fitting.
    fit_params : dict or None
        Keyword arguments for ``estimator.fit``; each value goes through
        ``_index_param_value`` with the train indices.
    return_train_score, return_parameters : bool
        Include the train score / the parameters in the result.
    error_score : 'raise' or number
        With 'raise' a fit error propagates; with a number the score(s) are
        set to it and a ``FitFailedWarning`` is issued.
    extraOut : None, "auto" or container of str
        Selects the extras: "estimator", "predictions", "importances".
        "auto" yields predictions and, when available, importances.

    Returns
    -------
    list
        ``[train_score]`` (optional), ``test_score``, ``n_test_samples``,
        ``scoring_time``, ``parameters`` (optional) and finally a dict of
        extras. Predictions are stored under the key "probabilities" as a
        mapping from test index to the ``predict_proba`` row. They are
        omitted when the fit failed.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_failed = False
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_failed = True
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        want_predictions = extraOut == "auto" or "predictions" in extraOut
        # Skip predictions after a tolerated fit failure: the estimator is
        # not fitted, and predicting would defeat a numeric ``error_score``.
        if want_predictions and not fit_failed:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            extraRVs["probabilities"] = dict(zip(test, probabilities))
        want_importances = extraOut == "auto" or "importances" in extraOut
        if want_importances and hasattr(estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    return ret
def fit(self, X, y):
    """Fit the RFA model and automatically tune the number of selected
    features.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the total number of features.

    y : array-like, shape = [n_samples]
        Target values (integers for classification, real numbers for
        regression).

    Returns
    -------
    self
        With ``support_``, ``n_features_``, ``ranking_``, ``estimator_`` and
        ``grid_scores_`` set.
    """
    X, y = check_X_y(X, y, "csr")
    if self.estimator_params is not None:
        warnings.warn("The parameter 'estimator_params' is deprecated as of version 0.16 "
                      "and will be removed in 0.18. The parameter is no longer "
                      "necessary because the value is set via the estimator initialisation "
                      "or set_params function.", DeprecationWarning)

    # Initialization
    rfa = RFA(estimator=self.estimator, n_features_to_select=1,
              step=self.step, estimator_params=self.estimator_params,
              verbose=self.verbose - 1)

    cv = check_cv(self.cv, X, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]
    scores = np.zeros(n_features)
    n_features_to_select_by_rank = np.zeros(n_features)

    # Cross-validation
    for train, test in cv:
        X_train, y_train = _safe_split(self.estimator, X, y, train)
        X_test, y_test = _safe_split(self.estimator, X, y, test, train)

        # Compute a full ranking of the features
        # ranking_ contains the same set of values for all CV folds,
        # but perhaps reordered
        ranking_ = rfa.fit(X_train, y_train).ranking_
        max_rank = np.max(ranking_)

        # Score each subset of features
        for k in range(0, max_rank):
            indices = np.where(ranking_ <= k + 1)[0]
            estimator = clone(self.estimator)
            estimator.fit(X_train[:, indices], y_train)
            score = _score(estimator, X_test[:, indices], y_test, scorer)

            if self.verbose > 0:
                print("Finished fold with %d / %d feature ranks, score=%f"
                      % (k + 1, max_rank, score))
            scores[k] += score
            # n_features_to_select_by_rank[k] is being overwritten
            # multiple times, but by the same value
            n_features_to_select_by_rank[k] = indices.size

    # Select the best upper bound for feature rank. It's OK to use the
    # last ranking_, as np.max(ranking_) is the same over all CV folds.
    scores = scores[:max_rank]
    k = np.argmax(scores)

    # Re-execute an elimination with best_k over the whole set.
    # The counts live in a float array; pass a plain int as the feature count.
    rfa = RFA(estimator=self.estimator,
              n_features_to_select=int(n_features_to_select_by_rank[k]),
              step=self.step, estimator_params=self.estimator_params)

    rfa.fit(X, y)

    # Set final attributes
    self.support_ = rfa.support_
    self.n_features_ = rfa.n_features_
    self.ranking_ = rfa.ranking_
    self.estimator_ = clone(self.estimator)
    if self.estimator_params:
        self.estimator_.set_params(**self.estimator_params)
    self.estimator_.fit(self.transform(X), y)

    # The summed fold scores are normalized by the number of folds, len(cv).
    self.grid_scores_ = scores / len(cv)
    return self
def fit_and_score_n_support(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, return_n_support=True,
                            error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    return_n_support : boolean, optional, default: True
        Append ``estimator.n_support_`` to the result.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is
        `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    n_support : optional
        ``estimator.n_support_`` after fitting, returned only if
        `return_n_support` is `True`; ``None`` when the fit failed and
        `error_score` is numeric.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_failed = False
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_failed = True
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    if return_n_support:
        # After a tolerated fit failure there is no fresh ``n_support_``;
        # reading it would raise and defeat a numeric ``error_score``.
        ret.append(None if fit_failed else estimator.n_support_)
    return ret
def _fit_and_score(estimator, X, y, scorer, train, test, cv, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape = (n_train_samples,)
        Indices of training samples.

    test : array-like, shape = (n_test_samples,)
        Indices of test samples.

    cv : object or None
        When not None, forwarded to ``estimator.fit`` as the ``cv`` keyword.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is
        `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    log_progress = verbose > 1
    if log_progress:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    n_samples = _num_samples(X)
    given_params = fit_params if fit_params is not None else {}
    fit_params = {}
    for key, value in given_params.items():
        # Values with one entry per sample are restricted to the train rows.
        if hasattr(value, '__len__') and len(value) == n_samples:
            value = np.asarray(value)[train]
        fit_params[key] = value

    if cv is not None:
        fit_params["cv"] = cv

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    test_score = _score(estimator, X_test, y_test, scorer)
    if return_train_score:
        train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if log_progress:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = []
    if return_train_score:
        ret.append(train_score)
    ret += [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
def grid_search_early_stopping(estimator, param_grid, verbose, scoring, cv,
                               X, y, early_stopping_rounds, eval_set_size,
                               n_jobs=1, iid=True, refit=True,
                               pre_dispatch='2*n_jobs', error_score='raise'):
    '''Grid search with early stopping on each fold.

    This is from scikit-learn package.

    Every parameter combination of ``param_grid`` is fitted on every fold of
    ``cv`` through ``_fit_and_score``. The fold's test split is passed to
    ``fit`` as ``eval_set`` together with ``early_stopping_rounds`` and an
    ``eval_metric`` derived from ``scoring``.

    Parameters
    ----------
    estimator : estimator object
        Cloned for every fit.
    param_grid : dict or list of dicts
        Passed to ``ParameterGrid``.
    verbose : int
        Above 0 prints a summary and enables per-fit logging; above 1 also
        passes ``verbose=True`` to ``fit``.
    scoring
        Passed to ``check_scoring`` and ``get_xgboost_eval_metric``.
    cv
        Cross-validation specification passed to ``check_cv``.
    X, y
        Data and targets.
    early_stopping_rounds, eval_set_size
        Early-stopping settings; ``eval_set_size`` is only used by the refit.
    n_jobs, pre_dispatch
        Passed to ``Parallel``.
    iid : bool
        If true, fold scores are weighted by the fold's test-set size.
    refit : bool
        If true, refit the best parameters on the whole data with
        ``fit_estimator_early_stopping``.
    error_score
        Passed to ``_fit_and_score``.

    Returns
    -------
    best_estimator
        The refitted estimator, or ``None`` when ``refit`` is false.
    best_parameters : dict
    grid_scores : list of ``_CVScoreTuple``

    Raises
    ------
    ValueError
        If ``y`` has a different length than ``X``, or ``refit`` is true and
        ``y`` is None.
    '''
    parameter_iterable = ParameterGrid(param_grid)
    scorer_ = check_scoring(estimator, scoring=scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(estimator)
    fit_verbosity = 2 if verbose > 0 else 0

    out = Parallel(
        n_jobs=n_jobs, verbose=fit_verbosity,
        pre_dispatch=pre_dispatch)(delayed(_fit_and_score)(
            clone(base_estimator), X, y, scorer_, train, test,
            fit_verbosity, parameters,
            {
                "early_stopping_rounds": early_stopping_rounds,
                "eval_metric": get_xgboost_eval_metric(scoring),
                "eval_set": [_safe_split(estimator, X, y, test, train)],
                "verbose": verbose > 1,
            },
            return_parameters=True, error_score=error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Out is a flat list with one entry per (candidate, fold):
    # test score, number of test samples, scoring time, parameters.
    n_fits = len(out)
    n_folds = len(cv)

    grid_scores = []
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]

    # Defined up front so ``refit=False`` returns None instead of failing
    # with an unbound local at the return statement.
    best_estimator = None
    if refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator, _, _ = fit_estimator_early_stopping(
                best_estimator, X, y, scoring, early_stopping_rounds,
                eval_set_size, verbose)
        else:
            raise ValueError('y is required.')

    return best_estimator, best.parameters, grid_scores
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, scoring, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.
    This overrides the behavior of _fit_and_score method in
    cross_validation.py.
    Note that a new argument, scoring, has been added to the function.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    scoring: string
        The name of the scoring function used in cross_val_score.
        Default is accuracy.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is
        `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    x_train, y_train = _safe_split(estimator, X, y, train)
    x_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            b = estimator.fit(x_train, **fit_params)
        else:
            b = estimator.fit(x_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, x_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, x_train, y_train, scorer)

        # Addition to original scikit code:
        # Create FitEvents for each estimator fit.
        # Done only on this success path: ``b`` (the value returned by
        # ``fit``) is unbound after a tolerated fit failure.
        fit_event = FitEvent(b, estimator, x_train)
        ModelDbSyncer.Syncer.instance.add_to_buffer(fit_event)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(x_test), scoring_time])

    # Addition to original scikit code:
    # Create MetricEvents for each estimator.
    metric_event = MetricEvent(x_test, estimator, "", "", scoring, test_score)
    ModelDbSyncer.Syncer.instance.add_to_buffer(metric_event)

    if return_parameters:
        ret.append(parameters)
    return ret
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, error_score='raise',
                            extraOut="auto"):
    """Fit and score one split and attach extra per-fold outputs.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
    X, y
        Data and targets, split through ``_safe_split``.
    scorer : callable
        Handed to ``_score`` with the estimator and a split.
    train, test : arrays
        Indices of the training and test samples; ``shape[0]`` of each is
        reported under "counts".
    verbose : int
        Above 1 prints progress lines; above 2 adds the test-set size and
        the test score.
    parameters : dict or None
        Parameters set on the estimator before fitting.
    fit_params : dict or None
        Keyword arguments for ``estimator.fit``, each passed through
        ``_index_param_value`` with the train indices.
    return_train_score, return_parameters : bool
        Include the train score / the parameters in the result.
    error_score : 'raise' or number
        'raise' propagates fit errors; a number replaces the score(s) and
        triggers a ``FitFailedWarning``.
    extraOut : None, "auto" or container of str
        Which extras to collect: "estimator", "predictions", "importances".
        "auto" collects predictions and, when present, importances.

    Returns
    -------
    list
        Optional ``train_score``, then ``test_score``, ``n_test_samples``,
        ``scoring_time``, optional ``parameters``, and last a dict of extras.
        Predictions are stored under "probabilities" as a mapping of test
        index to ``predict_proba`` row and are left out when the fit failed.
    """
    log_progress = verbose > 1
    if log_progress:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    fit_succeeded = True
    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            fit_succeeded = False
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += " , n=" + str(X_test.shape[0]) + ", score=%f" % test_score
    if log_progress:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        auto = extraOut == "auto"
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        # Predictions need a fitted estimator; after a tolerated fit failure
        # they are skipped so a numeric ``error_score`` is honoured.
        if fit_succeeded and (auto or "predictions" in extraOut):
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            extraRVs["probabilities"] = dict(zip(test, probabilities))
        if (auto or "importances" in extraOut) and hasattr(
                estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    return ret