def test_grid_search_correct_score_results(): # test that correct scores are used n_splits = 3 clf = LinearSVC(random_state=0) X, y = make_blobs(random_state=0, centers=2) Cs = [.1, 1, 10] for score in ['f1', 'roc_auc']: grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits) results = grid_search.fit(X, y).cv_results_ # Test scorer names result_keys = list(results.keys()) expected_keys = (("mean_test_score", "rank_test_score") + tuple("split%d_test_score" % cv_i for cv_i in range(n_splits))) assert_true(all(in1d(expected_keys, result_keys))) cv = StratifiedKFold(n_splits=n_splits) n_splits = grid_search.n_splits_ for candidate_i, C in enumerate(Cs): clf.set_params(C=C) cv_scores = np.array( list(grid_search.cv_results_['split%d_test_score' % s][candidate_i] for s in range(n_splits))) for i, (train, test) in enumerate(cv.split(X, y)): clf.fit(X[train], y[train]) if score == "f1": correct_score = f1_score(y[test], clf.predict(X[test])) elif score == "roc_auc": dec = clf.decision_function(X[test]) correct_score = roc_auc_score(y[test], dec) assert_almost_equal(correct_score, cv_scores[i])
def test_grid_search_correct_score_results(): # test that correct scores are used n_splits = 3 clf = LinearSVC(random_state=0) X, y = make_blobs(random_state=0, centers=2) Cs = [.1, 1, 10] for score in ['f1', 'roc_auc']: # XXX: It seems there's some global shared state in LinearSVC - fitting # multiple `SVC` instances in parallel using threads sometimes results # in wrong results. This only happens with threads, not processes/sync. # For now, we'll fit using the sync scheduler. grid_search = dcv.GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits, scheduler='sync') cv_results = grid_search.fit(X, y).cv_results_ # Test scorer names result_keys = list(cv_results.keys()) expected_keys = (("mean_test_score", "rank_test_score") + tuple("split%d_test_score" % cv_i for cv_i in range(n_splits))) assert all(in1d(expected_keys, result_keys)) cv = StratifiedKFold(n_splits=n_splits) n_splits = grid_search.n_splits_ for candidate_i, C in enumerate(Cs): clf.set_params(C=C) cv_scores = np.array( list(grid_search.cv_results_['split%d_test_score' % s][candidate_i] for s in range(n_splits))) for i, (train, test) in enumerate(cv.split(X, y)): clf.fit(X[train], y[train]) if score == "f1": correct_score = f1_score(y[test], clf.predict(X[test])) elif score == "roc_auc": dec = clf.decision_function(X[test]) correct_score = roc_auc_score(y[test], dec) assert_almost_equal(correct_score, cv_scores[i])
def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None): X, y = check_X_y(X, y) # If the ratio of data variance between dimensions is too small, it # will cause numerical errors. To address this, we artificially # boost the variance by epsilon, a small fraction of the standard # deviation of the largest dimension. epsilon = 1e-9 * np.var(X, axis=0).max() if _refit: self.classes_ = None if _check_partial_fit_first_call(self, classes): # This is the first call to partial_fit: # initialize various cumulative counters n_features = X.shape[1] n_classes = len(self.classes_) self.theta_ = np.zeros((n_classes, n_features)) self.sigma_ = np.zeros((n_classes, n_features)) self.class_prior_ = np.zeros(n_classes) self.class_count_ = np.zeros(n_classes) else: if X.shape[1] != self.theta_.shape[1]: msg = "Number of features %d does not match previous data %d." raise ValueError(msg % (X.shape[1], self.theta_.shape[1])) # Put epsilon back in each time self.sigma_[:, :] -= epsilon classes = self.classes_ unique_y = np.unique(y) unique_y_in_classes = in1d(unique_y, classes) if not np.all(unique_y_in_classes): raise ValueError("The target label(s) %s in y do not exist in the " "initial classes %s" % (y[~unique_y_in_classes], classes)) for y_i in unique_y: i = classes.searchsorted(y_i) X_i = X[y == y_i, :] if sample_weight is not None: sw_i = sample_weight[y == y_i] N_i = sw_i.sum() else: sw_i = None N_i = X_i.shape[0] new_theta, new_sigma = self._update_mean_variance( self.class_count_[i], self.theta_[i, :], self.sigma_[i, :], X_i, sw_i) self.theta_[i, :] = new_theta self.sigma_[i, :] = new_sigma self.class_count_[i] += N_i self.sigma_[:, :] += epsilon self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_) #print self.class_prior_[:] return self
def _partial_fit(self, X, y, classes=None, _refit=True, sample_weight=None): """Actual implementation of Gaussian Bayes fitting. Parameters ---------- X : array-like, shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target values. classes : array-like, shape (n_classes,) List of all the classes that can possibly appear in the y vector. _refit: bool, must be True sample_weight : not supported Returns ------- self : object Returns self. """ X, y = check_X_y(X, y) # If the ratio of data variance between dimensions is too small, it # will cause numerical errors. To address this, we artificially # boost the variance by epsilon, a small fraction of the standard # deviation of the largest dimension. epsilon = 1e-9 * np.var(X, axis=0).max() if classes is None or not _refit: raise NotImplementedError('Partial fit is not supported.') self.classes_ = unique_labels(classes) n_features = X.shape[1] n_classes = len(self.classes_) self.theta_ = np.zeros((n_classes, n_features)) self.sigma_ = np.zeros((n_classes, n_features, n_features)) self.class_count_ = np.zeros(n_classes, dtype=np.float64) # Initialise the class prior n_classes = len(self.classes_) # Take into account the priors if self.priors is not None: priors = np.asarray(self.priors) # Check that the provide prior match the number of classes if len(priors) != n_classes: raise ValueError('Number of priors must match number of' ' classes.') # Check that the sum is 1 if priors.sum() != 1.0: raise ValueError('The sum of the priors should be 1.') # Check that the prior are non-negative if (priors < 0).any(): raise ValueError('Priors must be non-negative.') self.class_prior_ = priors else: # Initialize the priors to zeros for each class self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64) classes = self.classes_ unique_y = np.unique(y) unique_y_in_classes = in1d(unique_y, classes) if not np.all(unique_y_in_classes): raise ValueError("The target label(s) %s in y do not exist in the " "initial classes %s" % (unique_y[~unique_y_in_classes], classes)) for y_i in unique_y: i = classes.searchsorted(y_i) X_i = X[y == y_i, :] if sample_weight is not None: sw_i = sample_weight[y == y_i] N_i = sw_i.sum() else: sw_i = None N_i = X_i.shape[0] theta, sigma = self._update_mean_covariance(X_i) self.theta_[i, :] = theta self.sigma_[i, :, :] = sigma self.class_count_[i] += N_i self.sigma_[:, :, :] += epsilon # Update if only no priors is provided if self.priors is None: # Empirical prior, with sample_weight taken into account self.class_prior_ = self.class_count_ / self.class_count_.sum() return self