def _fit_and_score_grid(estimator, X, y, scorer, train, test, grid,
                        fit_params, error_score='raise'):
    '''Doc String'''
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            scores = [error_score] * len(grid)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        origParams = estimator.get_params()
        scores = [
            _score(estimator.set_params(**params), X_test, y_test, scorer)
            for params in grid
        ]
        estimator.set_params(**origParams)

    return scores
def _gen_train_val(self, X, y, cv_split_method):
    X, y, groups = indexable(X, y, None)
    Xs_tr, ys_tr, Xs_cv, ys_cv = [], [], [], []
    if isinstance(cv_split_method, BaseCrossValidator):
        for tr, cv in cv_split_method.split(X, y, groups):
            X_tr, y_tr = _safe_split(self, X, y, tr)
            X_cv, y_cv = _safe_split(self, X, y, cv, tr)
            Xs_tr.append(X_tr)
            Xs_cv.append(X_cv)
            ys_tr.append(y_tr)
            ys_cv.append(y_cv)
    elif cv_split_method.__name__ == 'train_test_split':
        # Unpack directly into the names that are appended below; the original
        # unpacked into X, X_val, y, y_val and then appended undefined variables.
        X_tr, X_cv, y_tr, y_cv = train_test_split(
            X, y, random_state=self._random_state,
            test_size=self.validation_fraction)
        Xs_tr.append(X_tr)
        Xs_cv.append(X_cv)
        ys_tr.append(y_tr)
        ys_cv.append(y_cv)
    else:
        raise ValueError("Split method should be a "
                         "sklearn.model_selection splitter class...")
    return Xs_tr, ys_tr, Xs_cv, ys_cv
def cross_val_score(estimator, X, y=None, scoring=accuracy_score, cv=5):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv'
    and compute the given score on each of the splits.

    :param estimator: A valid sklearn_wrapper estimator
    :param X, y: Valid data and target values that work with the estimator
    :param scoring: a scorer object from sklearn.metrics
        (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics)
        Default value is accuracy_score.
    :param cv: an integer or an object that has a split function as a generator
        yielding (train, test) splits as arrays of indices.
        Integer value is used as the number of folds in
        sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators
        can be used here.

    :return: cv_results: a list of scores corresponding to each cross validation fold
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    cv_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        trained_estimator = estimator.fit(X_train, y_train)
        predicted_values = trained_estimator.predict(X_test)
        cv_results.append(scoring(y_test, predicted_values))

    return cv_results
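# A minimal usage sketch for the cross_val_score helper above, assuming a
# scikit-learn compatible estimator; the dataset and estimator below are
# illustrative, not part of the original code.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
fold_scores = cross_val_score(LogisticRegression(max_iter=1000), X, y,
                              scoring=accuracy_score, cv=5)
# fold_scores is a plain list with one accuracy value per StratifiedKFold split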
def _train(self):
    """Trains one iteration of the model; called when ``tune.run`` is invoked.

    Different routines are run depending on whether the ``early_stopping``
    attribute is True or not.

    - If ``self.early_stopping`` is True, each fold is fit with
      ``partial_fit``, which stops training the model if the validation
      score is not improving for a particular fold.
    - Otherwise, run the full cross-validation procedure.

    In both cases, the average test accuracy over all folds is returned in a
    dictionary under the "average_test_score" key.
    """
    if self.early_stopping:
        for i, (train, test) in enumerate(self.cv.split(self.X, self.y)):
            X_train, y_train = _safe_split(self.estimator, self.X, self.y,
                                           train)
            X_test, y_test = _safe_split(self.estimator, self.X, self.y, test,
                                         train_indices=train)
            self.estimator[i].partial_fit(X_train, y_train, np.unique(self.y))
            if self.return_train_score:
                self.fold_train_scores[i] = self.scoring(
                    self.estimator[i], X_train, y_train)
            self.fold_scores[i] = self.scoring(self.estimator[i], X_test,
                                               y_test)

        self.mean_scores = sum(self.fold_scores) / len(self.fold_scores)

        if self.return_train_score:
            self.mean_train_scores = sum(self.fold_train_scores) / len(
                self.fold_train_scores)
            return {
                "average_test_score": self.mean_scores,
                "average_train_score": self.mean_train_scores
            }
        return {"average_test_score": self.mean_scores}
    else:
        scores = cross_validate(
            self.estimator,
            self.X,
            self.y,
            cv=self.cv,
            fit_params=self.fit_params,
            groups=self.groups,
            scoring=self.scoring)
        self.test_accuracy = sum(scores["test_score"]) / len(
            scores["test_score"])
        if self.return_train_score:
            self.train_accuracy = sum(scores["train_score"]) / len(
                scores["train_score"])
            return {
                "average_test_score": self.test_accuracy,
                "average_train_score": self.train_accuracy
            }
        return {"average_test_score": self.test_accuracy}
def _permutation_test_score(estimator, X, y, groups, cv, scorer):
    """Auxiliary function for permutation_test_score"""
    avg_score = []
    for train, test in cv.split(X, y, groups):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        estimator.fit(X_train, y_train)
        avg_score.append(scorer(estimator, X_test, y_test))
    return np.mean(avg_score)
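# A hedged sketch (not from the original source) of the surrounding
# permutation-test loop this helper is typically driven by: score the true
# labels once, then score shuffled labels repeatedly and derive a p-value.
# The estimator, data, and permutation count below are illustrative.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_random_state

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000)
cv = StratifiedKFold(5)
scorer = check_scoring(clf, scoring="accuracy")
rng = check_random_state(0)

true_score = _permutation_test_score(clf, X, y, None, cv, scorer)
perm_scores = np.array([
    _permutation_test_score(clf, X, rng.permutation(y), None, cv, scorer)
    for _ in range(100)
])
# p-value with the usual +1 correction, as in sklearn's permutation_test_score
pvalue = (np.sum(perm_scores >= true_score) + 1.0) / (len(perm_scores) + 1)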
def _baf_single_fit(train, test, baf, estimator, X, y, scorer, random_state):
    """"""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    baf_i = clone(baf)
    baf_i.random_state = random_state
    baf_i._fit(X_train, y_train)
    return (baf_i.support_,
            _score(baf_i.estimator_, baf_i.transform(X_test), y_test, scorer),
            baf_i.score_)
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """
    Return the score for a fit across one fold.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfe._fit(
        X_train, y_train,
        lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer)).scores_
def _patch_split(estimator, X, y, indices, train_indices=None):
    # Like _safe_split, but also supports a dict of target arrays: each value
    # is split separately and returned under its original key.
    if isinstance(y, dict):
        mixed_y = {}
        for key, _y in y.items():
            X_subset, y_subset = _safe_split(estimator, X, _y, indices,
                                             train_indices=train_indices)
            mixed_y[key] = y_subset
        return X_subset, mixed_y
    else:
        return _safe_split(estimator, X, y, indices,
                           train_indices=train_indices)
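# A hedged illustration of the dict-target branch of _patch_split above; the
# estimator, shapes, and key names are assumptions for the example only.
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.random.RandomState(0).rand(10, 3)
y = {"main": np.arange(10.0), "aux": np.arange(10.0) * 2.0}
X_sub, y_sub = _patch_split(LinearRegression(), X, y, indices=np.arange(6))
# y_sub keeps the same keys ("main", "aux"), each reduced to the first six targets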
def _permutations(estimator, X, y, cv, scorer):
    """Auxiliary function for permutations"""
    avg_score = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        # X_train, X_test = impute_data(X_train, X_test)
        estimator.fit(X_train, y_train)
        avg_score.append(scorer(estimator, X_test, y_test))
    return np.mean(avg_score)
def _partial_fit_and_score(
        self,
        estimator,  # type: BaseEstimator
        train,  # type: List[int]
        test,  # type: List[int]
        partial_fit_params,  # type: Dict[str, Any]
):
    # type: (...) -> List[Number]

    X_train, y_train = _safe_split(estimator, self.X, self.y, train)
    X_test, y_test = _safe_split(estimator, self.X, self.y, test,
                                 train_indices=train)

    start_time = time()

    try:
        estimator.partial_fit(X_train, y_train, **partial_fit_params)
    except Exception as e:
        if self.error_score == "raise":
            raise e
        elif isinstance(self.error_score, Number):
            fit_time = time() - start_time
            test_score = self.error_score
            score_time = 0.0
            if self.return_train_score:
                train_score = self.error_score
        else:
            raise ValueError("error_score must be 'raise' or numeric.")
    else:
        fit_time = time() - start_time
        test_score = self.scoring(estimator, X_test, y_test)
        score_time = time() - fit_time - start_time
        if self.return_train_score:
            train_score = self.scoring(estimator, X_train, y_train)

    # Required for type checking but is never expected to fail.
    assert isinstance(fit_time, Number)
    assert isinstance(score_time, Number)

    ret = [test_score, fit_time, score_time]

    if self.return_train_score:
        ret.insert(0, train_score)

    return ret
def cross_val_score_track_trials(estimator, X, y=None, scoring=accuracy_score, cv=5):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv'
    and compute the given score on each of the splits.

    Parameters
    ----------
    estimator: A valid sklearn_wrapper estimator
    X, y: Valid data and target values that work with the estimator
    scoring: string or a scorer object created using
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
        A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created
        from one of sklearn.metrics
        (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
        A completely custom scorer object can be created from a python function
        following the example at
        https://scikit-learn.org/stable/modules/model_evaluation.html
        The metric has to return a scalar value.
    cv: an integer or an object that has a split function as a generator yielding
        (train, test) splits as arrays of indices.
        Integer value is used as the number of folds in
        sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators
        can be used here.

    Returns
    -------
    A tuple of (mean score, mean log loss, mean execution time) across the
    cross validation folds.
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    scorer = check_scoring(estimator, scoring=scoring)
    cv_results: List[float] = []
    log_loss_results = []
    time_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        start = time.time()
        trained = estimator.fit(X_train, y_train)
        score_value = scorer(trained, X_test, y_test)
        execution_time = time.time() - start
        # not all estimators have predict_proba
        try:
            y_pred_proba = trained.predict_proba(X_test)
            logloss = log_loss(y_true=y_test, y_pred=y_pred_proba)
            log_loss_results.append(logloss)
        except BaseException:
            logger.debug("Warning, log loss cannot be computed")
        cv_results.append(score_value)
        time_results.append(execution_time)

    # Average the per-fold times; the original averaged only the last fold's time.
    return (np.array(cv_results).mean(),
            np.array(log_loss_results).mean(),
            np.array(time_results).mean())
def _safe_split_multi(estimator, X, y, train, test):
    X_train, y_train, X_test, y_test = [], [], [], []
    for x_, y_, tr_, ts_ in zip(X, y, train, test):
        out = [_safe_split(estimator, x__, y_, tr_) for x__ in x_]
        X_tr, y_tr = zip(*out)
        X_train.append(np.array(X_tr))
        y_train.append(y_tr[0])  # they are all equal

        out = [_safe_split(estimator, x__, y_, ts_, tr_) for x__ in x_]
        X_ts, y_ts = zip(*out)
        X_test.append(np.array(X_ts))
        y_test.append(y_ts[0])  # they are all equal
    return X_train, y_train, X_test, y_test
def _partial_fit_and_score(
        self,
        estimator,  # type: BaseEstimator
        train,  # type: List[int]
        test,  # type: List[int]
        partial_fit_params  # type: Dict[str, Any]
):
    # type: (...) -> List[float]

    X_train, y_train = _safe_split(estimator, self.X, self.y, train)
    X_test, y_test = _safe_split(estimator, self.X, self.y, test,
                                 train_indices=train)

    start_time = time()

    try:
        estimator.partial_fit(X_train, y_train, **partial_fit_params)
    except Exception as e:
        if self.error_score == 'raise':
            raise e
        elif isinstance(self.error_score, Number):
            fit_time = time() - start_time
            test_score = self.error_score
            score_time = 0.0
            if self.return_train_score:
                train_score = self.error_score
        else:
            raise ValueError('error_score must be \'raise\' or numeric.')
    else:
        fit_time = time() - start_time
        test_score = self.scoring(estimator, X_test, y_test)
        score_time = time() - fit_time - start_time
        if self.return_train_score:
            train_score = self.scoring(estimator, X_train, y_train)

    ret = [test_score, fit_time, score_time]

    if self.return_train_score:
        ret.insert(0, train_score)

    return ret
def _rfe_single_fit(rfe, estimator, x_data, y_data, train, test, scorer,
                    **fit_kwargs):
    """Return the score for a fit across one fold."""
    (x_train, y_train) = _safe_split(estimator, x_data, y_data, train)
    (x_test, y_test) = _safe_split(estimator, x_data, y_data, test, train)
    (fit_kwargs_train, _) = _split_fit_kwargs(fit_kwargs, train, test)

    def step_score(estimator, features):
        """Score for a single step in the recursive feature elimination."""
        return _score(estimator, x_test[:, features], y_test, scorer)

    return rfe._fit(x_train, y_train, step_score=step_score,
                    **fit_kwargs_train).scores_
def transform(self, X, y=None):
    if self.time_based:
        x_mask = _create_mask(X, self.data_range)
        return X[x_mask]
    else:
        X_train, _ = _safe_split(None, X, None, self.data_range)
        return X_train
def fit_transform(self, X, y=None):
    # TODO make checks if the data_range is set correctly
    if self.time_based:
        x_mask = _create_mask(X, self.data_range)
        if y is not None:
            assert len(X) == len(y)
            y_mask = _create_mask(y, self.data_range)
            return X[x_mask], y[y_mask]
        else:
            return X[x_mask]
    else:
        if y is not None:
            X, y = _safe_split(None, X, y, self.data_range)
            return X, y
        else:
            X, y = _safe_split(None, X, y, self.data_range)
            return X
def _partial_fit_and_score(self, estimator, train, test, partial_fit_params):
    # type: (...) -> List[float]

    X_train, y_train = _safe_split(estimator, self.X, self.y, train)
    X_test, y_test = _safe_split(estimator, self.X, self.y, test,
                                 train_indices=train)

    start_time = perf_counter()

    try:
        estimator.partial_fit(X_train, y_train, **partial_fit_params)
    except Exception as e:
        if self.error_score == 'raise':
            raise e
        elif isinstance(self.error_score, Number):
            fit_time = perf_counter() - start_time
            test_score = self.error_score
            score_time = 0.0
            if self.return_train_score:
                train_score = self.error_score
        else:
            raise ValueError("error_score must be 'raise' or numeric.")
    else:
        fit_time = perf_counter() - start_time
        test_score = self.scoring(estimator, X_test, y_test)
        score_time = perf_counter() - fit_time - start_time
        if self.return_train_score:
            train_score = self.scoring(estimator, X_train, y_train)

    ret = [test_score, fit_time, score_time]

    if self.return_train_score:
        ret.insert(0, train_score)

    return ret
def test_kernel_precomputed(gbsg2):
    x, y = gbsg2
    from sklearn.metrics.pairwise import pairwise_kernels
    from sklearn.utils.metaestimators import _safe_split

    m = MinlipSurvivalAnalysis(kernel="precomputed", solver="ecos")
    K = pairwise_kernels(x, metric="rbf", gamma=1. / 32)

    train_idx = numpy.arange(50, x.shape[0])
    test_idx = numpy.arange(50)
    X_fit, y_fit = _safe_split(m, K, y, train_idx)
    X_test, y_test = _safe_split(m, K, y, test_idx, train_idx)

    m.fit(X_fit, y_fit)

    p = m.predict(X_test)
    assert_cindex_almost_equal(y_test['cens'], y_test['time'], p,
                               (0.626514131897712, 457, 269, 17, 0))
def cross_val_score_track_trials(estimator, X, y=None, scoring=accuracy_score, cv=5):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv'
    and compute the given score on each of the splits.

    :param estimator: A valid sklearn_wrapper estimator
    :param X, y: Valid data and target values that work with the estimator
    :param scoring: a scorer object from sklearn.metrics
        (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics)
        Default value is accuracy_score.
    :param cv: an integer or an object that has a split function as a generator
        yielding (train, test) splits as arrays of indices.
        Integer value is used as the number of folds in
        sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators
        can be used here.

    :return: a tuple of (mean score, mean log loss, mean execution time) across
        the cross validation folds
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    cv_results = []
    log_loss_results = []
    time_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, y_test = _safe_split(estimator, X, y, test, train)
        start = time.time()
        trained_estimator = estimator.fit(X_train, y_train)
        predicted_values = trained_estimator.predict(X_test)
        execution_time = time.time() - start
        # not all estimators have predict_proba
        try:
            y_pred_proba = trained_estimator.predict_proba(X_test)
            logloss = log_loss(y_true=y_test, y_pred=y_pred_proba)
            log_loss_results.append(logloss)
        except BaseException:
            logger.debug("Warning, log loss cannot be computed")
        cv_results.append(scoring(y_test, predicted_values))
        time_results.append(execution_time)

    # Average the per-fold times; the original averaged only the last fold's time.
    return (np.array(cv_results).mean(),
            np.array(log_loss_results).mean(),
            np.array(time_results).mean())
def _incremental_fit_estimator(estimator, X, y, classes, train, test,
                               train_sizes, scorer, verbose):
    """Train estimator on training subsets incrementally and compute scores."""
    train_scores, test_scores = [], []
    partitions = zip(train_sizes, np.split(train, train_sizes)[:-1])
    for n_train_samples, partial_train in partitions:
        train_subset = train[:n_train_samples]
        X_train, y_train = _safe_split(estimator, X, y, train_subset)
        X_partial_train, y_partial_train = _safe_split(estimator, X, y,
                                                       partial_train)
        X_test, y_test = _safe_split(estimator, X, y, test, train_subset)
        if y_partial_train is None:
            estimator.partial_fit(X_partial_train, classes=classes)
        else:
            estimator.partial_fit(X_partial_train, y_partial_train,
                                  classes=classes)
        train_scores.append(_score(estimator, X_train, y_train, scorer))
        test_scores.append(_score(estimator, X_test, y_test, scorer))
    return np.array((train_scores, test_scores)).T
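# A small worked illustration (not from the original source) of how the
# partitions above carve the training indices for incremental fitting.
import numpy as np

train = np.arange(100, 130)           # 30 training indices
train_sizes = np.array([10, 20, 30])  # cumulative subset sizes to evaluate
partitions = list(zip(train_sizes, np.split(train, train_sizes)[:-1]))
# -> [(10, train[0:10]), (20, train[10:20]), (30, train[20:30])]
# Each step partial_fits only the new chunk, then scores on train[:n] and on test.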
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)
def test_kernel_precomputed(self):
    from sklearn.metrics.pairwise import pairwise_kernels
    from sklearn.utils.metaestimators import _safe_split

    m = MinlipSurvivalAnalysis(kernel="precomputed", solver="cvxpy")
    K = pairwise_kernels(self.x, metric="rbf")

    train_idx = numpy.arange(50, self.x.shape[0])
    test_idx = numpy.arange(50)
    X_fit, y_fit = _safe_split(m, K, self.y, train_idx)
    X_test, y_test = _safe_split(m, K, self.y, test_idx, train_idx)

    m.fit(X_fit, y_fit)

    p = m.predict(X_test)
    v = concordance_index_censored(y_test['cens'], y_test['time'], p)
    expected = numpy.array([0.508748, 378, 365, 0, 0])
    assert_array_almost_equal(expected, v)
def _fit_and_score(est, x, y, scorer, train_index, test_index, parameters,
                   fit_params, predict_params):
    """Train survival model on given data and return its score on test data"""
    X_train, y_train = _safe_split(est, x, y, train_index)
    train_params = fit_params.copy()

    # Training
    est.set_params(**parameters)
    est.fit(X_train, y_train, **train_params)

    # Testing
    test_predict_params = predict_params.copy()
    X_test, y_test = _safe_split(est, x, y, test_index, train_index)

    score = scorer(est, X_test, y_test, **test_predict_params)
    if not isinstance(score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s) instead."
                         % (str(score), type(score)))

    return score
def _fit_ovo_binary(estimator, X, y, i, j):
    """Fit a single binary estimator (one-vs-one)."""
    cond = np.logical_or(y == i, y == j)
    y = y[cond]
    # np.int has been removed from recent NumPy releases; use the builtin int.
    y_binary = np.empty(y.shape, int)
    y_binary[y == i] = 0
    y_binary[y == j] = 1
    indcond = np.arange(X.shape[0])[cond.reshape(-1,)]
    return _fit_binary(estimator,
                       _safe_split(estimator, X, None, indices=indcond)[0],
                       y_binary, classes=[i, j]), indcond
def _predict_proba(estimator, X, y, train, test, verbose, parameters,
                   fit_params):
    '''
    Fits an estimator to the training set and outputs probability predictions
    (and true labels, if applicable) for the test set.

    Adapted from mne.decoding.base._fit_and_score()
    '''
    from mne.fixes import _check_fit_params
    from sklearn.utils.metaestimators import _safe_split

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = _check_fit_params(X, fit_params, train)

    if parameters is not None:
        estimator.set_params(**parameters)

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    y_hat = estimator.predict_proba(X_test)

    if y_train is None:
        return y_hat
    else:
        return y_hat, y_test
def _fit_and_predict(estimator, X, y, train, test, class_ratio, verbose,
                     fit_params, method):
    from sklearn.utils.metaestimators import _safe_split
    from sklearn.model_selection._validation import _index_param_value
    from imblearn.under_sampling import RandomUnderSampler

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    rus = RandomUnderSampler(ratio=class_ratio, return_indices=True,
                             random_state=42)
    if len(X.shape) < 2:
        X0 = X[0][train]
        y_train = y[train]
        idxs = rus.fit_sample(X0, y_train)
        X_train = np.empty(shape=(len(idxs), X.shape[0], X0.shape[1]))
        X_test = np.empty(shape=(len(test), X.shape[0], X0.shape[1]))
        for i in range(X.shape[0]):
            X_train[:, i, :] = X[i][train][idxs].toarray()
            X_test[:, i, :] = X[i][test].toarray()
        y_train = to_categorical(y_train[idxs])
    else:
        X_train, y_train = _safe_split(estimator, X, y, train)
        X_test, _ = _safe_split(estimator, X, y, test, train)
        idxs = rus.fit_sample(X_train, y_train)
        X_train = X_train[idxs]
        y_train = y_train[idxs]

    clf = estimator.fit(X_train, y_train, **fit_params)
    func = getattr(estimator, method)
    logger.info('-- predict_proba()')
    predictions = func(X_test)
    return predictions, test, pd.DataFrame(clf.cv_results_)
def test_kernel_precomputed(gbsg2):
    x, y = gbsg2
    from sklearn.metrics.pairwise import pairwise_kernels
    from sklearn.utils.metaestimators import _safe_split

    m = MinlipSurvivalAnalysis(kernel="precomputed", solver="osqp",
                               max_iter=25000)
    xt = scale(x)
    K = pairwise_kernels(xt, metric="rbf", gamma=0.1)

    train_idx = numpy.arange(200, x.shape[0])
    test_idx = numpy.arange(200)
    X_fit, y_fit = _safe_split(m, K, y, train_idx)
    X_test, y_test = _safe_split(m, K, y, test_idx, train_idx)

    m.fit(X_fit, y_fit)

    p = m.predict(X_test)
    assert_cindex_almost_equal(y_test['cens'], y_test['time'], p,
                               (0.6518928901200369, 8472, 4524, 0, 3))
def _fit_and_predict(estimator, X, y, train, test, predict_proba):
    """
    Fit the estimator with the train samples and make predictions with the test data.

    Args:
        estimator (object): sklearn object
        X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
        y (pd.Series): Observed outcome of size (num_subjects,).
        train: Indices of the training fold.
        test: Indices of the test fold.
        predict_proba (bool): If True, the treatment model is a classifier and
            'predict_proba' is used; if False, 'predict' is used.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, _ = _safe_split(estimator, X, y, test, train)
    estimator.fit(X_train, y_train)
    if predict_proba:
        pred = estimator.predict_proba(X_test)[:, 1]
    else:
        pred = estimator.predict(X_test)
    return pd.Series(pred, index=X_test.index), estimator
def cross_val_train_predict(estimator, x, y, predict_method: str = "predict",
                            cv: int = 5):
    """
    Return fit estimators and predictions of each (Stratified) fold.
    """
    from sklearn.base import clone, is_classifier
    from sklearn.model_selection._split import check_cv
    from sklearn.utils.metaestimators import _safe_split
    import numpy as np

    splitter = check_cv(cv, y, classifier=is_classifier(estimator))

    estimators = []
    predictions = None
    for train, test in splitter.split(x, y):
        x_train, y_train = _safe_split(estimator, x, y, train)
        x_test, _ = _safe_split(estimator, x, y, test, train)
        fold_estimator = clone(estimator)
        fold_predict = getattr(fold_estimator, predict_method)

        fold_estimator.fit(x_train, y_train)
        estimators.append(fold_estimator)

        fold_prediction = fold_predict(x_test)
        if predictions is None:
            if fold_prediction.ndim == 2:
                predictions = np.empty(shape=(len(y), fold_prediction.shape[1]))
            else:
                predictions = np.empty(shape=(len(y),))
        predictions[test] = fold_prediction

    return predictions, estimators
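# A hedged usage sketch for cross_val_train_predict above; the dataset and
# estimator are illustrative assumptions, not part of the original code.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
proba, fold_models = cross_val_train_predict(
    LogisticRegression(max_iter=1000), X, y,
    predict_method="predict_proba", cv=5)
# proba has shape (len(y), n_classes); fold_models holds the five fitted clones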
def split_with_schemas(estimator, all_X, all_y, indices, train_indices=None):
    subset_X, subset_y = _safe_split(
        estimator, all_X, all_y, indices, train_indices)
    if hasattr(all_X, 'json_schema'):
        n_rows = subset_X.shape[0]
        schema = {
            'type': 'array', 'minItems': n_rows, 'maxItems': n_rows,
            'items': all_X.json_schema['items']}
        lale.datasets.data_schemas.add_schema(subset_X, schema)
    if hasattr(all_y, 'json_schema'):
        n_rows = subset_y.shape[0]
        schema = {
            'type': 'array', 'minItems': n_rows, 'maxItems': n_rows,
            'items': all_y.json_schema['items']}
        lale.datasets.data_schemas.add_schema(subset_y, schema)
    return subset_X, subset_y
def _worker(estimator_, i, X, y, train, test):
    """Implement the worker resubmission in case of errors."""
    # custom_name = "{}_p_{}_i_{}".format(
    #     ("permutation" if is_permutation_test else "regular"), RANK, i)
    # tmp_name_base = 'tmp_' + custom_name
    worker_logger = logging.getLogger('worker')
    experiment_resubmissions = 0
    experiment_completed = False

    worker_logger.info("{}{} executing job {}".format(NAME, RANK, i))
    while not experiment_completed and \
            experiment_resubmissions <= MAX_RESUBMISSIONS:
        try:
            if experiment_resubmissions > 0:
                worker_logger.warning(
                    "{}{} resubmitting experiment {}".format(NAME, RANK, i))
                # tmp_name = tmp_name_base + '_submission_{}'.format(
                #     experiment_resubmissions + 1)
                # run_experiment(data, labels, None, config,
                #                is_permutation_test, experiments_folder_path,
                #                tmp_name)

            # TODO necessary?
            estimator = clone(estimator_.estimator)

            # need to get the deepest estimator to use _safe_split
            estimator__ = clone(estimator)
            while hasattr(estimator__, 'estimator'):
                estimator__ = clone(estimator__.estimator)

            X_train, y_train = _safe_split(estimator__, X, y, train)
            X_test, y_test = _safe_split(estimator__, X, y, test, train)
            if estimator_.shuffle_y:
                random_state = check_random_state(estimator_.random_state)
                y_train = _shuffle(y_train, estimator_.groups, random_state)

            worker_logger.info(
                "{}{} fitting experiment {} - starting".format(NAME, RANK, i))
            estimator.fit(X_train, y_train)
            worker_logger.info(
                "{}{} fitting experiment {} - completed".format(NAME, RANK, i))

            worker_logger.debug(
                "{}{} scoring experiment {} - starting".format(NAME, RANK, i))
            yts_pred = estimator.predict(X_test)
            ytr_pred = estimator.predict(X_train)
            lr_score = estimator_.scorer_(estimator, X_train, y_train)
            ts_score = estimator_.scorer_(estimator, X_test, y_test)
            worker_logger.debug(
                "{}{} scoring experiment {} - complete".format(NAME, RANK, i))

            if hasattr(estimator, 'cv_results_'):
                # In case the estimator is a CV object
                cv_results = estimator.cv_results_
            else:
                cv_results = None

            cv_results_ = {
                'split_i': i,
                'learn_score': lr_score,
                'test_score': ts_score,
                'cv_results_': cv_results,
                'ytr_pred': ytr_pred,
                'yts_pred': yts_pred,
                'test_index': test,
                'train_index': train,
                'estimator': estimator
            }
            experiment_completed = True

            # ### Dump partial results
            if estimator_.experiments_folder is not None:
                worker_logger.debug(
                    "{}{} saving results for experiment {}".format(
                        NAME, RANK, i))
                pkl_name = (
                    'permutation' if estimator_.shuffle_y else 'regular') + \
                    '_%d.pkl' % i
                pkl.dump(cv_results_, gzip.open(os.path.join(
                    estimator_.experiments_folder, pkl_name), 'wb'))

        except Exception as error:  # StandardError exists only in Python 2
            # If something out of the ordinary happens, resubmit the job
            experiment_resubmissions += 1
            warnings.warn(
                "[{}_{}] failed experiment {}, resubmission #{}\n"
                "Exception raised: {}".format(
                    NAME, RANK, i, experiment_resubmissions, error))

    if not experiment_completed:
        warnings.warn(
            "[{}_{}] failed to complete experiment {}, "
            "max resubmissions limit reached".format(NAME, RANK, i))
        return {}
    else:
        if not IS_MPI_JOB and estimator_.verbose:
            worker_logger.info("[{}{}]: {} job {} completed".format(
                NAME, RANK,
                ('permutation' if estimator_.shuffle_y else 'regular'), i))
        return cv_results_
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)
    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        msg += ", score=".join(('%f' % ts for ts in test_score))
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret