def _grid_search_params_iter(self, train_X, train_y):
    if callable(self.inner_cv):
        inner_cv = self.inner_cv(train_X, train_y)
    else:
        inner_cv = _check_cv(self.inner_cv, train_X, train_y,
                             classifier=is_classifier(self.estimator))

    param_iter = ParameterGrid(self.param_grid)
    LOG.info("Performing grid search over %d configurations", len(param_iter))

    for fold_id, (train_index, test_index) in enumerate(inner_cv):
        for parameters in param_iter:
            # Fold ids are 1-based; each work item is one (fold, config) pair.
            yield fold_id + 1, train_index, test_index, parameters
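# A minimal, hedged sketch of what the generator above enumerates, using the
# public sklearn API; KFold and the toy grid here are stand-ins for
# self.inner_cv and self.param_grid.
import numpy as np
from sklearn.model_selection import KFold, ParameterGrid

X = np.arange(20).reshape(10, 2)
param_grid = {"alpha": [0.1, 1.0], "fit_intercept": [True, False]}

work_items = [
    (fold_id + 1, train_idx, test_idx, params)
    for fold_id, (train_idx, test_idx) in enumerate(KFold(n_splits=3).split(X))
    for params in ParameterGrid(param_grid)
]
print(len(work_items))  # 3 folds x 4 configurations = 12 (fold, config) tasks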
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval._check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval._check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval._check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval._check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval._check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold))
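# The same dispatch exercised with the modern public API, as a sanity check:
# check_cv moved to sklearn.model_selection and no longer takes X (sketch,
# assuming scikit-learn >= 0.18).
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, check_cv

y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
assert isinstance(check_cv(3, y_binary, classifier=True), StratifiedKFold)
assert isinstance(check_cv(3, y_binary, classifier=False), KFold)

# Multilabel indicator targets fall back to plain KFold even for classifiers.
y_indicator = np.array([[0, 1], [1, 0], [1, 1], [0, 0], [1, 0]])
assert isinstance(check_cv(3, y_indicator, classifier=True), KFold)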
def _grid_search(self, train_X, train_y):
    if callable(self.inner_cv):
        inner_cv = self.inner_cv(train_X, train_y)
    else:
        inner_cv = _check_cv(self.inner_cv, train_X, train_y,
                             classifier=is_classifier(self.estimator))

    master = MPIGridSearchCVMaster(self.param_grid, inner_cv,
                                   self.estimator, self.scorer_,
                                   self.fit_params)
    return master.run(train_X, train_y)
def fit(self, X, y):
    X, y = check_X_y(X, y, force_all_finite=False,
                     multi_output=self.multi_output)
    _check_param_grid(self.param_grid)

    cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    # Rank 0 coordinates the grid search; all other MPI ranks enter the
    # worker loop and wait for work items.
    if comm_rank == 0:
        self._fit_master(X, y, cv)
    else:
        self._fit_slave()

    return self
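# Hedged sketch of the rank-0 master / worker split that _grid_search and fit
# above rely on, using only mpi4py primitives known to exist (COMM_WORLD,
# Get_rank, scatter, gather); MPIGridSearchCVMaster's internals are not
# reproduced here. Run with e.g.: mpiexec -n 4 python mpi_sketch.py
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# The master builds one chunk of work items per rank; workers pass None.
chunks = [list(range(i, 12, size)) for i in range(size)] if rank == 0 else None
my_items = comm.scatter(chunks, root=0)

results = [(rank, item) for item in my_items]  # stand-in for fit + score
all_results = comm.gather(results, root=0)
if rank == 0:
    print(sorted(sum(all_results, [])))  # master aggregates all work items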
def fit(self, X, y=None, **fit_params):
    """Fit ensemble of models.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data.

    y : array-like, optional
        Target data if base estimators are supervised.

    Returns
    -------
    self
    """
    self._check_params()
    cv = _check_cv(self.cv, X, y)
    self._fit(X, y, cv, **fit_params)
    return self
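# Hedged sketch of what a _fit like the one above typically does with the
# resolved cv: fit one clone of a base estimator per training split. The
# ensemble's actual _fit internals are an assumption; check_cv and clone are
# the public sklearn counterparts of the helpers used here.
import numpy as np
from sklearn.base import clone
from sklearn.linear_model import Ridge
from sklearn.model_selection import check_cv

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5])

cv = check_cv(3)  # an int cv resolves to KFold(n_splits=3)
models = [clone(Ridge()).fit(X[train], y[train]) for train, _ in cv.split(X, y)]
print(len(models))  # three fold-wise base models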
def _cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1, verbose=0,
                       fit_params=None, pre_dispatch='2*n_jobs'):
    """Generate cross-validated estimates for each input data point.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example, a list or an array at
        least 2d.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    cv : cross-validation generator or int, optional, default: None
        A cross-validation generator to use. If int, determines the
        number of folds in StratifiedKFold if y is binary or multiclass
        and estimator is a classifier, or the number of folds in KFold
        otherwise. If None, it is equivalent to cv=3. This generator
        must include all elements in the test set exactly once.
        Otherwise, a ValueError is raised.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately created
              and spawned. Use this for lightweight and fast-running
              jobs, to avoid delays due to on-demand spawning of the
              jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    preds : ndarray
        This is the result of calling 'predict'.
    """
    X, y = indexable(X, y)
    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))

    # A fresh pylibfm.FM instance is fitted on every fold so the folds stay
    # independent; `estimator` is only consulted to pick the CV strategy.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    preds_blocks = parallel(delayed(_fit_and_predict)(pylibfm.FM(), X, y,
                                                      train, test, verbose,
                                                      fit_params)
                            for train, test in cv)

    p = np.concatenate([p for p, _ in preds_blocks])
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not _check_is_partition(locs, X.shape[0]):
        raise ValueError('cross_val_predict only works for partitions')

    # Re-order the fold-wise predictions back into the original sample order
    # (locs is a permutation of 0..n_samples-1).
    preds = p.copy()
    preds[locs] = p
    return preds
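# For comparison, a runnable call to the stock scikit-learn routine that the
# modified _cross_val_predict above is based on; the public version clones
# `estimator` per fold instead of hard-coding pylibfm.FM(), and enforces the
# same "test folds must partition the data" requirement.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.RandomState(0)
X = rng.randn(30, 4)
y = (X[:, 0] > 0).astype(int)

preds = cross_val_predict(LogisticRegression(), X, y, cv=3)
print(preds.shape)  # one out-of-fold prediction per sample: (30,)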
def fit(self, X, y, fit_params=None, predict_params=None, X_test=None, y_test=None):
    """Do nested cross-validation.

    If ``X_test`` and ``y_test`` are not provided, nested
    cross-validation using ``X`` and ``y`` is performed, i.e., data is
    first split into *K* folds, where *K-1* folds are used for training
    and hyper-parameter selection and the remaining fold for testing.
    The training portion is again split into *T* folds to perform a
    grid-search over hyper-parameters. The parameters that achieved the
    best average performance across the *T* inner cross-validation
    folds are selected. Using these parameters, a model is trained on
    the entire training data and applied to the *K*-th testing fold.

    If ``X_test`` and ``y_test`` are provided, a regular
    cross-validation is performed on ``X`` and ``y`` to determine
    hyper-parameters as for the inner cross-validation above. Using the
    best performing parameters, a model is trained on all of ``X`` and
    ``y`` and applied to ``X_test`` and ``y_test`` for testing.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Feature matrix.

    y : structured array, shape = [n_samples]
        A structured array containing the binary event indicator as
        first field, and time of event or time of censoring as second
        field.

    fit_params : dict
        Additional arguments passed to the fit method.

    predict_params : dict
        Additional arguments passed to the predict method.

    X_test : array-like, shape = [n_test_samples, n_features]
        Hold-out data to perform testing on.

    y_test : array-like or sequence, shape = [n_test_samples]
        Target values of hold-out test data.

    Returns
    -------
    self
    """
    if y.dtype.names is None:
        X, y = check_X_y(X, y)
    else:
        X, event, time = check_arrays_survival(X, y, force_all_finite=False)
        y = numpy.fromiter(zip(event, time),
                           dtype=[('event', numpy.bool_),
                                  ('time', numpy.float64)])

    if X_test is not None:
        X_test, event_test, time_test = check_arrays_survival(
            X_test, y_test, force_all_finite=False)
        y_test = numpy.fromiter(zip(event_test, time_test),
                                dtype=[('event', numpy.bool_),
                                       ('time', numpy.float64)])

    cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    self._dview, self._lview = self._init_cluster()
    if X_test is None:
        self._fit(X, y, cv, fit_params, predict_params)
    else:
        self._fit_holdout(X, y, fit_params, predict_params, X_test, y_test)

    del self._dview
    del self._lview

    return self
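# Hedged sketch of the structured target array this fit expects: a boolean
# event indicator plus a float time, mirroring the numpy.fromiter calls above
# (np.array over a list of tuples is used here as a version-safe equivalent).
import numpy as np

event = np.array([True, False, True, True, False])
time = np.array([12.0, 30.5, 7.2, 18.9, 40.0])
y = np.array(list(zip(event, time)),
             dtype=[('event', np.bool_), ('time', np.float64)])
print(y['event'].sum(), y['time'].max())  # 3 observed events, last time 40.0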
def permutation_test_score(estimator, X, y, data_train=None, cv=None,
                           n_permutations=100, n_jobs=1, labels=None,
                           random_state=0, verbose=0, scoring=None):
    """Evaluate the significance of a cross-validated score with permutations.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like
        The target variable to try to predict in the case of
        supervised learning.

    data_train : np.array, optional
        Data to train on, if the data used for training is different
        from X.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or a scorer
        callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 3).
        Specific cross-validation objects can be passed, see the
        sklearn.cross_validation module for the list of possible
        objects.

    n_permutations : integer, optional
        Number of times to permute ``y``.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    labels : array-like of shape [n_samples], optional
        Labels constrain the permutation among groups of samples with
        the same label.

    random_state : RandomState or an int seed (0 by default)
        A random number generator instance to define the state of the
        random permutations generator.

    verbose : integer, optional
        The verbosity level.

    Returns
    -------
    score : float
        The true score without permuting targets.

    permutation_scores : array, shape = [n_permutations]
        The scores obtained for each permutation.

    pvalue : float
        The returned value equals the p-value if `score_func` returns
        bigger numbers for better scores (e.g., accuracy_score). If
        `score_func` is rather a loss function (i.e. when lower is
        better, such as with `mean_squared_error`), then this is
        actually the complement of the p-value: 1 - p-value.

    Notes
    -----
    This function implements Test 1 in:
        Ojala and Garriga. Permutation Tests for Studying Classifier
        Performance. The Journal of Machine Learning Research (2010)
        vol. 11
    """
    X, y = indexable(X, y)
    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    if data_train is None:
        # Default sklearn behaviour. We clone the estimator to make sure
        # that all the folds are independent, and that it is pickle-able.
        score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
        permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_permutation_test_score)(
                clone(estimator), X, _shuffle(y, labels, random_state),
                cv, scorer)
            for _ in range(n_permutations))
    else:
        # Modification for 2pn: first get the real score by training on
        # data_train (nii_optional, the actor) and testing on X
        # (nii_func, the observer).
        score = []
        for train, test in cv:
            estimator.fit(data_train[train], y[train])
            score.append(scorer(estimator, X[test], y[test]))
        score = np.mean(score)

        # Then get the permutation scores.
        permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_permutation_test_score)(
                clone(estimator), X, _shuffle(y, labels, random_state),
                cv, scorer, data_train)
            for _ in range(n_permutations))

    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
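# Runnable call to the stock scikit-learn routine this function extends (the
# data_train branch is the local addition); in current releases it lives in
# sklearn.model_selection and takes `groups` instead of `labels`.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import permutation_test_score

rng = np.random.RandomState(0)
X = rng.randn(40, 5)
y = (X[:, 0] > 0).astype(int)

score, perm_scores, pvalue = permutation_test_score(
    LogisticRegression(), X, y, cv=3, n_permutations=100, random_state=0)
print(round(score, 3), round(pvalue, 3))  # informative X -> small p-value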