def test_bayesian_mixture_predict_predict_proba(): # this is the same test as test_gaussian_mixture_predict_predict_proba() rng = np.random.RandomState(0) rand_data = RandomData(rng) for prior_type in PRIOR_TYPE: for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] Y = rand_data.Y bgmm = BayesianGaussianMixture( n_components=rand_data.n_components, random_state=rng, weight_concentration_prior_type=prior_type, covariance_type=covar_type) # Check a warning message arrive if we don't do fit assert_raise_message(NotFittedError, "This BayesianGaussianMixture instance" " is not fitted yet. Call 'fit' with " "appropriate arguments before using " "this method.", bgmm.predict, X) bgmm.fit(X) Y_pred = bgmm.predict(X) Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1) assert_array_equal(Y_pred, Y_pred_proba) assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
def test_monotonic_likelihood(): # We check that each step of the each step of variational inference without # regularization improve monotonically the training set of the bound rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components = rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] bgmm = BayesianGaussianMixture( n_components=2 * n_components, covariance_type=covar_type, warm_start=True, max_iter=1, random_state=rng, tol=1e-4, ) current_lower_bound = -np.infty # Do one training iteration at a time so we can make sure that the # training log likelihood increases after each iteration. for _ in range(500): prev_lower_bound = current_lower_bound current_lower_bound = bgmm.fit(X).lower_bound_ assert_greater_equal(current_lower_bound, prev_lower_bound) if bgmm.converged_: break assert bgmm.converged_
def test_monotonic_likelihood(): # We check that each step of the EM without regularization improve # monotonically the training set likelihood rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components = rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] gmm = GaussianMixture(n_components=n_components, covariance_type=covar_type, reg_covar=0, warm_start=True, max_iter=1, random_state=rng, tol=1e-7) current_log_likelihood = -np.infty with warnings.catch_warnings(): warnings.simplefilter("ignore", ConvergenceWarning) # Do one training iteration at a time so we can make sure that the # training log likelihood increases after each iteration. for _ in range(600): prev_log_likelihood = current_log_likelihood try: current_log_likelihood = gmm.fit(X).score(X) except ConvergenceWarning: pass assert_greater_equal(current_log_likelihood, prev_log_likelihood) if gmm.converged_: break assert gmm.converged_
def check_min_weight_fraction_leaf(name, X, y): """Test if leaves contain at least min_weight_fraction_leaf of the training set""" ForestEstimator = FOREST_ESTIMATORS[name] rng = np.random.RandomState(0) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes in (None, 1000): for frac in np.linspace(0, 0.5, 6): est = ForestEstimator(min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0) if isinstance(est, (RandomForestClassifier, RandomForestRegressor)): est.bootstrap = False est.fit(X, y, sample_weight=weights) out = est.estimators_[0].tree_.apply(X) node_weights = np.bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf, "Failed with {0} " "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf))
def check_min_weight_fraction_leaf(name): X, y = hastie_X, hastie_y # Test if leaves contain at least min_weight_fraction_leaf of the # training set ForestEstimator = FOREST_ESTIMATORS[name] rng = np.random.RandomState(0) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for frac in np.linspace(0, 0.5, 6): est = ForestEstimator(min_weight_fraction_leaf=frac, n_estimators=1, random_state=0) if "RandomForest" in name: est.bootstrap = False est.fit(X, y, sample_weight=weights) out = est.estimators_[0].tree_.apply(X) node_weights = np.bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf, "Failed with {0} " "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf))
def test_min_weight_fraction_leaf(): """Test if leaves contain at least min_weight_fraction_leaf of the training set""" X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE)) y = iris.target rng = np.random.RandomState(0) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes, name, frac in product((None, 1000), ALL_TREES, np.linspace(0, 0.5, 6)): TreeEstimator = ALL_TREES[name] est = TreeEstimator(min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0) est.fit(X, y, sample_weight=weights) out = est.tree_.apply(X) node_weights = np.bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf, "Failed with {0} " "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf))
def test_min_max_scaler_1d(): """Test scaling of dataset along single axis""" rng = np.random.RandomState(0) X = rng.randn(5) X_orig_copy = X.copy() scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X_orig_copy) # Test with 1D list X = [0., 1., 2, 0.4, 1.] scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) # Constant feature. X = np.zeros(5) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert_greater_equal(X_scaled.min(), 0.) assert_less_equal(X_scaled.max(), 1.)
def check_threshold(birch_instance, threshold): """Use the leaf linked list for traversal""" current_leaf = birch_instance.dummy_leaf_.next_leaf_ while current_leaf: subclusters = current_leaf.subclusters_ for sc in subclusters: assert_greater_equal(threshold, sc.radius) current_leaf = current_leaf.next_leaf_
def test_pearsonr_mat(self): pear_mat = pearsonr_mat(self.mat) assert_equal(pear_mat.shape, (10, 10)) pear_mat = pearsonr_mat(self.mat, self.w_mat) assert_equal(pear_mat.shape, (10, 10)) assert_greater_equal(np.min(pear_mat), -1) assert_less_equal(np.max(pear_mat), 1)
def test_calibration_multiclass(): """Test calibration for multiclass """ # test multi-class setting with classifier that implements # only decision function clf = LinearSVC() X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42, centers=3, cluster_std=3.0) # Use categorical labels to check that CalibratedClassifierCV supports # them correctly target_names = np.array(['a', 'b', 'c']) y = target_names[y_idx] X_train, y_train = X[::2], y[::2] X_test, y_test = X[1::2], y[1::2] clf.fit(X_train, y_train) for method in ['isotonic', 'sigmoid']: cal_clf = CalibratedClassifierCV(clf, method=method, cv=2) cal_clf.fit(X_train, y_train) probas = cal_clf.predict_proba(X_test) assert_array_almost_equal(np.sum(probas, axis=1), np.ones(len(X_test))) # Check that log-loss of calibrated classifier is smaller than # log-loss of naively turned OvR decision function to probabilities # via softmax def softmax(y_pred): e = np.exp(-y_pred) return e / e.sum(axis=1).reshape(-1, 1) uncalibrated_log_loss = \ log_loss(y_test, softmax(clf.decision_function(X_test))) calibrated_log_loss = log_loss(y_test, probas) assert_greater_equal(uncalibrated_log_loss, calibrated_log_loss) # Test that calibration of a multiclass classifier decreases log-loss # for RandomForestClassifier X, y = make_blobs(n_samples=100, n_features=2, random_state=42, cluster_std=3.0) X_train, y_train = X[::2], y[::2] X_test, y_test = X[1::2], y[1::2] clf = RandomForestClassifier(n_estimators=10, random_state=42) clf.fit(X_train, y_train) clf_probs = clf.predict_proba(X_test) loss = log_loss(y_test, clf_probs) for method in ['isotonic', 'sigmoid']: cal_clf = CalibratedClassifierCV(clf, method=method, cv=3) cal_clf.fit(X_train, y_train) cal_clf_probs = cal_clf.predict_proba(X_test) cal_loss = log_loss(y_test, cal_clf_probs) assert_greater(loss, cal_loss)
def test_multiple_init(): # Test that multiple inits does not much worse than a single one rng = np.random.RandomState(0) n_samples, n_features, n_components = 50, 5, 2 X = rng.randn(n_samples, n_features) for cv_type in COVARIANCE_TYPE: train1 = GaussianMixture(n_components=n_components, covariance_type=cv_type, random_state=0).fit(X).score(X) train2 = GaussianMixture(n_components=n_components, covariance_type=cv_type, random_state=0, n_init=5).fit(X).score(X) assert_greater_equal(train2, train1)
def test_lda_score(): # Test LDA score for batch training # score should be higher after each iteration n_topics, X = _build_sparse_mtx() for method in ('online', 'batch'): lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method, total_samples=100, random_state=0) lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit_transform(X) score_1 = lda_1.score(X) lda_2.fit_transform(X) score_2 = lda_2.score(X) assert_greater_equal(score_2, score_1)
def test_lda_preplexity(): """ Test LDA preplexity for batch training preplexity should be lower after each iteration """ n_topics, alpha, eta, X = _build_sparse_mtx() lda_1 = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0) lda_2 = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0) distr_1 = lda_1.fit_transform(X, max_iters=1) prep_1 = lda_1.preplexity(X, distr_1, sub_sampling=False) distr_2 = lda_2.fit_transform(X, max_iters=10) prep_2 = lda_2.preplexity(X, distr_2, sub_sampling=False) assert_greater_equal(prep_1, prep_2)
def test_lda_score(method): # Test LDA score for batch training # score should be higher after each iteration n_components, X = _build_sparse_mtx() lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method=method, total_samples=100, random_state=0) lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit_transform(X) score_1 = lda_1.score(X) lda_2.fit_transform(X) score_2 = lda_2.score(X) assert_greater_equal(score_2, score_1)
def test_select_fdr_regression(): """ Test that fdr heuristic actually has low FDR. """ def single_fdr(alpha, n_informative, random_state): X, y = make_regression(n_samples=150, n_features=20, n_informative=n_informative, shuffle=False, random_state=random_state, noise=10) with warnings.catch_warnings(record=True): # Warnings can be raised when no features are selected # (low alpha or very noisy data) univariate_filter = SelectFdr(f_regression, alpha=alpha) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode='fdr', param=alpha).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() num_false_positives = np.sum(support[n_informative:] == 1) num_true_positives = np.sum(support[:n_informative] == 1) if num_false_positives == 0: return 0. false_discovery_rate = (num_false_positives / (num_true_positives + num_false_positives)) return false_discovery_rate for alpha in [0.001, 0.01, 0.1]: for n_informative in [1, 5, 10]: # As per Benjamini-Hochberg, the expected false discovery rate # should be lower than alpha: # FDR = E(FP / (TP + FP)) <= alpha false_discovery_rate = np.mean([ single_fdr(alpha, n_informative, random_state) for random_state in range(30) ]) assert_greater_equal(alpha, false_discovery_rate) # Make sure that the empirical false discovery rate increases # with alpha: if false_discovery_rate != 0: assert_greater(false_discovery_rate, alpha / 10)
def check_clustering(name, clusterer_orig, readonly_memmap=False): clusterer = clone(clusterer_orig) X, y = _create_small_ts_dataset() X, y = shuffle(X, y, random_state=7) X = TimeSeriesScalerMeanVariance().fit_transform(X) rng = np.random.RandomState(42) X_noise = X + (rng.randn(*X.shape) / 5) n_samples, n_features, dim = X.shape # catch deprecation and neighbors warnings if hasattr(clusterer, "n_clusters"): clusterer.set_params(n_clusters=3) set_random_state(clusterer) # fit clusterer.fit(X) # with lists clusterer.fit(X.tolist()) pred = clusterer.labels_ assert_equal(pred.shape, (n_samples,)) assert_greater(adjusted_rand_score(pred, y), 0.4) if clusterer._get_tags()['non_deterministic']: return set_random_state(clusterer) with warnings.catch_warnings(record=True): pred2 = clusterer.fit_predict(X) assert_array_equal(pred, pred2) # fit_predict(X) and labels_ should be of type int assert pred.dtype in [np.dtype('int32'), np.dtype('int64')] assert pred2.dtype in [np.dtype('int32'), np.dtype('int64')] # Add noise to X to test the possible values of the labels labels = clusterer.fit_predict(X_noise) # There should be at least one sample in every original cluster labels_sorted = np.unique(labels) assert_array_equal(labels_sorted, np.arange(0, 3)) # Labels should be less than n_clusters - 1 if hasattr(clusterer, 'n_clusters'): n_clusters = getattr(clusterer, 'n_clusters') assert_greater_equal(n_clusters - 1, labels_sorted[-1])
def test_prediction_proba(self): y_test_predicted = self.clf.predict_proba(self.X_test) assert_greater_equal(y_test_predicted.min(), 0) assert_less_equal(y_test_predicted.max(), 1) # check performance assert_greater(roc_auc_score(self.y_test, y_test_predicted[:, 1]), self.roc_floor) # check shape of integrity n_classes = len(np.unique(self.y_train)) assert_equal(y_test_predicted.shape, (self.X_test.shape[0], n_classes)) # check probability sum is 1 y_test_predicted_sum = np.sum(y_test_predicted, axis=1) assert_allclose(np.ones([self.X_test.shape[0], ]), y_test_predicted_sum)
def test_arpack_eigsh_initialization(): # Non-regression test that shows null-space computation is better with # initialization of eigsh from [-1,1] instead of [0,1] random_state = check_random_state(42) A = random_state.rand(50, 50) A = np.dot(A.T, A) # create s.p.d. matrix A = laplacian(A) + 1e-7 * np.identity(A.shape[0]) k = 5 # Test if eigsh is working correctly # New initialization [-1,1] (as in original ARPACK) # Was [0,1] before, with which this test could fail v0 = random_state.uniform(-1, 1, A.shape[0]) w, _ = eigsh(A, k=k, sigma=0.0, v0=v0) # Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest assert_greater_equal(w[0], 0)
def test_arpack_eigsh_initialization(): # Non-regression test that shows null-space computation is better with # initialization of eigsh from [-1,1] instead of [0,1] random_state = check_random_state(42) A = random_state.rand(50, 50) A = np.dot(A.T, A) # create s.p.d. matrix A = graph_laplacian(A) + 1e-7 * np.identity(A.shape[0]) k = 5 # Test if eigsh is working correctly # New initialization [-1,1] (as in original ARPACK) # Was [0,1] before, with which this test could fail v0 = random_state.uniform(-1,1, A.shape[0]) w, _ = eigsh(A, k=k, sigma=0.0, v0=v0) # Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest assert_greater_equal(w[0], 0)
def test_select_fdr_regression(): # Test that fdr heuristic actually has low FDR. def single_fdr(alpha, n_informative, random_state): X, y = make_regression( n_samples=150, n_features=20, n_informative=n_informative, shuffle=False, random_state=random_state, noise=10, ) with warnings.catch_warnings(record=True): # Warnings can be raised when no features are selected # (low alpha or very noisy data) univariate_filter = SelectFdr(f_regression, alpha=alpha) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() num_false_positives = np.sum(support[n_informative:] == 1) num_true_positives = np.sum(support[:n_informative] == 1) if num_false_positives == 0: return 0.0 false_discovery_rate = num_false_positives / (num_true_positives + num_false_positives) return false_discovery_rate for alpha in [0.001, 0.01, 0.1]: for n_informative in [1, 5, 10]: # As per Benjamini-Hochberg, the expected false discovery rate # should be lower than alpha: # FDR = E(FP / (TP + FP)) <= alpha false_discovery_rate = np.mean( [single_fdr(alpha, n_informative, random_state) for random_state in range(100)] ) assert_greater_equal(alpha, false_discovery_rate) # Make sure that the empirical false discovery rate increases # with alpha: if false_discovery_rate != 0: assert_greater(false_discovery_rate, alpha / 10)
def test_lda_perplexity(): # Test LDA perplexity for batch training # perplexity should be lower after each iteration n_topics, X = _build_sparse_mtx() for method in ('online', 'batch'): lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method, total_samples=100, random_state=0) lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method, total_samples=100, random_state=0) distr_1 = lda_1.fit_transform(X) perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False) distr_2 = lda_2.fit_transform(X) perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False) assert_greater_equal(perp_1, perp_2) perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True) perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True) assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
def test_lda_perplexity(method): # Test LDA perplexity for batch training # perplexity should be lower after each iteration n_components, X = _build_sparse_mtx() lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method=method, total_samples=100, random_state=0) lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10, learning_method=method, total_samples=100, random_state=0) lda_1.fit(X) perp_1 = lda_1.perplexity(X, sub_sampling=False) lda_2.fit(X) perp_2 = lda_2.perplexity(X, sub_sampling=False) assert_greater_equal(perp_1, perp_2) perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True) perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True) assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
def test_filter(): # Tests the filter function. n_clusters = 3 X = generate_clustered_data(n_clusters=n_clusters) # Parameters chosen specifically for this task. clust = OPTICS(eps=6.0, min_samples=4, metric='euclidean') # Run filter (before computing OPTICS) bool_memb = clust.filter(X, 0.5) idx_memb = clust.filter(X, 0.5, index_type='idx') # Test for equivalence between 'idx' and 'bool' extraction assert_equal(sum(bool_memb), len(idx_memb)) # Compute OPTICS clust.fit(X) clust.extract(0.5, clustering='dbscan') # core points from filter and extract should be the same within 1 point, # with extract occasionally underestimating due to start point of the # OPTICS algorithm. Here we test for at least 95% similarity in # classification of core/not core agree = sum(clust._is_core == bool_memb) assert_greater_equal(float(agree)/len(X), 0.95)
def check_min_weight_fraction_leaf(name, datasets, sparse=False): """Test if leaves contain at least min_weight_fraction_leaf of the training set""" if sparse: X = DATASETS[datasets]["X_sparse"].astype(np.float32) else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) TreeEstimator = ALL_TREES[name] # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)): est = TreeEstimator(min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0) est.fit(X, y, sample_weight=weights) if sparse: out = est.tree_.apply(X.tocsr()) else: out = est.tree_.apply(X) node_weights = np.bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf, "Failed with {0} " "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf))
def test_prediction_proba_unify(self): pred_proba = self.clf.predict_proba(self.X_test, method='unify') assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1)
def test_prediction_proba(self): pred_proba = self.clf.predict_proba(self.X_test) assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1) # check performance assert_greater(roc_auc_score(self.y_test, pred_proba), self.roc_floor)
def test_prediction_proba(self): pred_proba = self.clf.predict_proba(self.X_test) assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1)
def test_label_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) ideal_n_labels_per_fold = n_samples // n_folds len(np.unique(labels)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = LabelKFold(n_folds=n_folds) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in lkf.split(X, y, labels): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = [ 'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia' ] n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_labels_per_fold = n_samples // n_folds X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in lkf.split(X, y, labels): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(labels)) assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", next, LabelKFold(n_folds=3).split(X, y, labels))
def check_limits(value, lower_bound, upper_bound): assert_less_equal(lower_bound, value) assert_greater_equal(upper_bound, value)
def test_label_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) folds = cval.LabelKFold(labels, n_folds=n_folds).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in cval.LabelKFold(labels, n_folds=n_folds): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = [ "Albert", "Jean", "Bertrand", "Michel", "Jean", "Francis", "Robert", "Michel", "Rachel", "Lois", "Michelle", "Bernard", "Marion", "Laura", "Jean", "Rachel", "Franck", "John", "Gael", "Anna", "Alix", "Robert", "Marion", "David", "Tony", "Abel", "Becky", "Madmood", "Cary", "Mary", "Alexandre", "David", "Francis", "Barack", "Abdoul", "Rasha", "Xi", "Silvia", ] labels = np.asarray(labels, dtype=object) n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed folds = cval.LabelKFold(labels, n_folds=n_folds).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split for train, test in cval.LabelKFold(labels, n_folds=n_folds): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
def test_group_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_groups = 15 n_samples = 1000 n_splits = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed groups = rng.randint(0, n_groups, n_samples) ideal_n_groups_per_fold = n_samples // n_splits len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = GroupKFold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # Construct the test data groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']) n_groups = len(np.unique(groups)) n_samples = len(groups) n_splits = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_groups_per_fold = n_samples // n_splits X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # groups can also be a list cv_iter = list(lkf.split(X, y, groups.tolist())) for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups), cv_iter): assert_array_equal(train1, train2) assert_array_equal(test1, test2) # Should fail if there are more folds than groups groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", next, GroupKFold(n_splits=3).split(X, y, groups))
def test_assert_greater_equal(): assert_greater_equal(1, 0) assert_greater_equal(1, 1) assert_raises(AssertionError, assert_greater_equal, 0, 1)
def check_branching_factor(node, branching_factor): subclusters = node.subclusters_ assert_greater_equal(branching_factor, len(subclusters)) for cluster in subclusters: if cluster.child_: check_branching_factor(cluster.child_, branching_factor)
def test_group_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_groups = 15 n_samples = 1000 n_splits = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed groups = rng.randint(0, n_groups, n_samples) ideal_n_groups_per_fold = n_samples // n_splits len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = GroupKFold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # Construct the test data groups = np.array([ 'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia' ]) n_groups = len(np.unique(groups)) n_samples = len(groups) n_splits = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_groups_per_fold = n_samples // n_splits X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # groups can also be a list cv_iter = list(lkf.split(X, y, groups.tolist())) for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups), cv_iter): assert_array_equal(train1, train2) assert_array_equal(test1, test2) # Should fail if there are more folds than groups groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", next, GroupKFold(n_splits=3).split(X, y, groups))
def test_label_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) ideal_n_labels_per_fold = n_samples // n_folds len(np.unique(labels)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = LabelKFold(n_folds=n_folds) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in lkf.split(X, y, labels): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']) n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_labels_per_fold = n_samples // n_folds X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in lkf.split(X, y, labels): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(labels)) assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", next, LabelKFold(n_folds=3).split(X, y, labels))
def test_label_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) folds = cval.LabelKFold(labels, n_folds=n_folds).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in cval.LabelKFold(labels, n_folds=n_folds): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = [ 'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia' ] labels = np.asarray(labels, dtype=object) n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed folds = cval.LabelKFold(labels, n_folds=n_folds).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split for train, test in cval.LabelKFold(labels, n_folds=n_folds): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
def test_prediction_proba_linear(self): pred_proba = self.clf.predict_proba(self.X_test, proba_method='linear') assert_greater_equal(pred_proba.min(), 0) assert_less_equal(pred_proba.max(), 1)
def test_label_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) folds = cval.LabelKFold(labels, n_folds=n_folds).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in cval.LabelKFold(labels, n_folds=n_folds): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'] labels = np.asarray(labels, dtype=object) n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed folds = cval.LabelKFold(labels, n_folds=n_folds).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split for train, test in cval.LabelKFold(labels, n_folds=n_folds): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
def check_limits(value, low, high): # check if low <= value <= high assert_less_equal(low, value) assert_greater_equal(high, value)