def test_old_style_cv():
    """Exercise tokenizing and split counting for wrapped old-style CV.

    Two ``_CVIterableWrapper`` objects are built from boolean masks that
    differ in a single position of the second mask; their tokens must be
    stable for the same wrapper and distinct across the two wrappers.
    ``compute_n_splits`` must then agree with the wrapper's own
    ``get_n_splits`` for both the numpy and the dask inputs.
    """
    def _mask(pattern):
        # Each mask is the 4-element pattern repeated five times.
        return np.array(pattern * 5)

    cv1 = _CVIterableWrapper([
        _mask([True, False, True, False]),
        _mask([False, True, False, True]),
    ])
    cv2 = _CVIterableWrapper([
        _mask([True, False, True, False]),
        _mask([False, True, True, True]),
    ])

    # Same wrapper -> same token; different masks -> different token.
    assert tokenize(cv1) == tokenize(cv1)
    assert tokenize(cv1) != tokenize(cv2)

    sol = cv1.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv1, np_X, np_y, np_groups) == sol

    # NOTE(review): assert_dask_compute(False) presumably checks that no
    # dask computation is triggered inside the block - confirm against
    # its definition.
    with assert_dask_compute(False):
        assert compute_n_splits(cv1, da_X, da_y, da_groups) == sol
def test_cv_iterable_wrapper():
    """Check ``_CVIterableWrapper`` and ``check_cv`` on iterable inputs.

    Covers a legacy (iterable-style) stratified splitter wrapped directly,
    and one-shot ``KFold.split`` generators wrapped through ``check_cv``.
    """
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    # Record, rather than emit, any warning raised while importing the
    # legacy cross-validation module.
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    legacy_cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(legacy_cv)

    # split() on the wrapper must reproduce the legacy iterator's folds.
    legacy_folds = list(legacy_cv)
    wrapped_folds = list(wrapped_old_skf.split())
    np.testing.assert_equal(legacy_folds, wrapped_folds)

    # get_n_splits() must agree with the legacy object's length.
    assert_equal(len(legacy_cv), wrapped_old_skf.get_n_splits())

    # The wrapped iterable is turned into a list and stored, so split()
    # can be called any number of times with consistent results.
    kf_iter_wrapped = check_cv(KFold(n_splits=5).split(X, y))
    assert_array_equal(list(kf_iter_wrapped.split(X, y)),
                       list(kf_iter_wrapped.split(X, y)))

    # The same holds for a shuffled generator: repeated calls on the
    # wrapper must agree with each other ...
    kf_randomized_iter_wrapped = check_cv(
        KFold(n_splits=5, shuffle=True).split(X, y))
    assert_array_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                       list(kf_randomized_iter_wrapped.split(X, y)))

    # ... while the shuffled folds must differ from the unshuffled ones.
    plain_folds = np.array(list(kf_iter_wrapped.split(X, y)))
    shuffled_folds = np.array(list(kf_randomized_iter_wrapped.split(X, y)))
    assert_true(np.any(plain_folds != shuffled_folds))
def test_cv_iterable_wrapper():
    """Verify iterable-based cross-validators behave once wrapped.

    First a legacy stratified splitter is wrapped in
    ``_CVIterableWrapper``; then plain and shuffled ``KFold`` split
    generators are passed through ``check_cv``.
    """
    def _folds(wrapper):
        # Materialise every (train, test) pair the wrapper yields.
        return list(wrapper.split(X, y))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    # Warnings raised by the legacy import are captured, not displayed.
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    old_splitter = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(old_splitter)

    # The wrapper's split() yields the same folds as iterating the
    # legacy object.
    np.testing.assert_equal(list(old_splitter),
                            list(wrapped_old_skf.split()))

    # The wrapper's get_n_splits() equals the legacy object's len().
    assert_equal(len(old_splitter), wrapped_old_skf.get_n_splits())

    # Unshuffled generator: the stored splits are replayed identically
    # on every call.
    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    assert_array_equal(_folds(kf_iter_wrapped), _folds(kf_iter_wrapped))

    # Shuffled generator: successive calls on the wrapper still agree,
    # because the splits were stored when the generator was wrapped.
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    assert_array_equal(_folds(kf_randomized_iter_wrapped),
                       _folds(kf_randomized_iter_wrapped))

    # Shuffling must change at least one fold compared with the
    # unshuffled splits.
    differs = (np.array(_folds(kf_iter_wrapped))
               != np.array(_folds(kf_randomized_iter_wrapped)))
    assert_true(np.any(differs))
def check_cv2(cv=3, y=None, classifier=False, random_state=None):
    """Input checker utility for building a cross-validator.

    NOTE: this is the same as ``sklearn.model_selection._split.check_cv``
    but with an added ``random_state`` parameter, so that nested CV splits
    are reproducible.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    random_state : None, int or RandomState
        Only used for integer/None ``cv``. When not None, the folds are
        shuffled and this value seeds the shuffling. When None, the folds
        are built without shuffling.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the
        train/test splits via the ``split`` method.

    Raises
    ------
    ValueError
        If ``cv`` is a string, or is neither an integer, an object with a
        ``split`` attribute, nor an iterable.
    """
    if cv is None:
        cv = 3

    if isinstance(cv, numbers.Integral):
        stratified = (classifier and y is not None
                      and type_of_target(y) in ('binary', 'multiclass'))
        splitter_cls = StratifiedKFold if stratified else KFold
        # A seed is only meaningful for shuffled folds: scikit-learn
        # ignores (older releases) or rejects with ValueError (newer
        # releases) a random_state passed together with shuffle=False.
        shuffle = random_state is not None
        return splitter_cls(cv, shuffle=shuffle, random_state=random_state)

    # Strings expose a ``split`` method too, so they are excluded
    # explicitly before being mistaken for a cross-validator.
    if isinstance(cv, str) or not hasattr(cv, 'split'):
        if isinstance(cv, str) or not isinstance(cv, Iterable):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification
def _check_cv(cv=3, y=None, classifier=False, **kwargs):
    """Input checker utility for building a shuffle-split cross-validator.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the ``n_splits`` keyword argument if it is given
          and non-zero, otherwise 10 splits,
        - integer, to specify the number of splits.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedShuffleSplit` is used. In
        all other cases, :class:`ShuffleSplit` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified shuffle splitting will be used.

    kwargs : dict
        Other parameters for StratifiedShuffleSplit or ShuffleSplit.
        They are only used for integer/None ``cv``. An ``n_splits`` entry
        is only honoured when ``cv`` is None; an explicit integer ``cv``
        takes precedence over it.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the
        train/test splits via the ``split`` method.

    Raises
    ------
    ValueError
        If ``cv`` is a string, or is neither an integer, an object with a
        ``split`` attribute, nor an iterable.
    """
    # Always take n_splits out of kwargs: the split count is passed
    # positionally below, so leaving it in would hand the splitter two
    # values for the same argument.
    n_splits = kwargs.pop('n_splits', 0)
    if cv is None:
        cv = n_splits or 10

    if isinstance(cv, numbers.Integral):
        stratified = (classifier and y is not None
                      and type_of_target(y) in ('binary', 'multiclass'))
        splitter_cls = StratifiedShuffleSplit if stratified else ShuffleSplit
        return splitter_cls(cv, **kwargs)

    # Strings expose a ``split`` method too, so they are excluded
    # explicitly before being mistaken for a cross-validator.
    if isinstance(cv, str) or not hasattr(cv, 'split'):
        if isinstance(cv, str) or not isinstance(cv, Iterable):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification
def test_cv_iterable_wrapper():
    """Check that ``_CVIterableWrapper`` mirrors a legacy splitter.

    The wrapper must yield the same folds as iterating the legacy
    stratified splitter and report the same number of splits.
    """
    labels = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    # Warnings raised by the legacy import are captured, not displayed.
    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    old_style_cv = OldSKF(labels, n_folds=3)
    wrapper = _CVIterableWrapper(old_style_cv)

    # Folds produced through the wrapper equal the legacy folds.
    expected_folds = list(old_style_cv)
    actual_folds = list(wrapper.split())
    np.testing.assert_equal(expected_folds, actual_folds)

    # The reported split count equals the legacy object's length.
    assert_equal(len(old_style_cv), wrapper.get_n_splits())
def fit(self, scores, y_true):
    """Train calibration

    Parameters
    ----------
    scores : (n_samples, ) array-like
        Uncalibrated scores.
    y_true : (n_samples, ) array-like
        True labels (dtype=bool).

    Returns
    -------
    self
        The fitted instance; the calibrator is stored in
        ``self.calibration_``.

    Raises
    ------
    ValueError
        If ``self.equal_priors`` is set and ``y_true`` lacks positive or
        negative samples, so no balanced calibration set can be drawn.
    """
    # Inputs are documented as array-like, but the element-wise comparison
    # and the reshape below need real arrays; a plain list would silently
    # compare as a whole object.
    scores = np.asarray(scores)
    y_true = np.asarray(y_true)

    # to force equal priors, randomly select (and average over)
    # up to fifty balanced (i.e. #true == #false) calibration sets.
    if self.equal_priors:
        counter = Counter(y_true)
        positive, negative = counter[True], counter[False]

        # Ties fall into the second branch: True is then the "minority".
        if positive > negative:
            majority, minority = True, False
            n_majority, n_minority = positive, negative
        else:
            majority, minority = False, True
            n_majority, n_minority = negative, positive

        if n_minority == 0:
            # Fail with an explicit message instead of the bare
            # ZeroDivisionError the split-count computation would raise.
            raise ValueError(
                "equal_priors=True requires at least one positive and "
                "one negative sample in y_true.")

        n_splits = min(50, n_majority // n_minority + 1)

        minority_index = np.where(y_true == minority)[0]
        majority_index = np.where(y_true == majority)[0]

        # Each split has an empty training part; its test part is all
        # minority samples plus an equally sized random draw (without
        # replacement) from the majority samples.
        cv = _CVIterableWrapper([
            ([], np.hstack([
                np.random.choice(majority_index,
                                 size=n_minority,
                                 replace=False),
                minority_index]))
            for _ in range(n_splits)
        ])

    # to estimate priors from the data itself, use the whole set
    else:
        cv = 'prefit'

    # NOTE(review): scikit-learn 1.2 renamed ``base_estimator`` to
    # ``estimator`` - confirm against the pinned scikit-learn version.
    self.calibration_ = CalibratedClassifierCV(
        base_estimator=_Passthrough(), method=self.method, cv=cv)
    self.calibration_.fit(scores.reshape(-1, 1), y_true)

    return self