def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check that an informative error is raised if we predict before fit
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this method.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert_greater_equal(adjusted_rand_score(Y, Y_pred), .95)
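(A side note: assert_raise_message came from scikit-learn's long-deprecated sklearn.utils.testing helpers. Under plain pytest the same pre-fit check could look like the following minimal sketch; the data here is made up for illustration.)

import numpy as np
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.mixture import BayesianGaussianMixture

X = np.random.RandomState(0).randn(20, 2)
bgmm = BayesianGaussianMixture(n_components=2)
# predict before fit must raise NotFittedError; `match` checks the
# error message against a regular expression.
with pytest.raises(NotFittedError, match="not fitted"):
    bgmm.predict(X)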
Example #2
def test_monotonic_likelihood():
    # Check that each step of variational inference without regularization
    # monotonically improves the lower bound on the training set
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        bgmm = BayesianGaussianMixture(
            n_components=2 * n_components,
            covariance_type=covar_type,
            warm_start=True,
            max_iter=1,
            random_state=rng,
            tol=1e-4,
        )
        current_lower_bound = -np.inf
        # Do one training iteration at a time so we can make sure that the
        # training log likelihood increases after each iteration.
        for _ in range(500):
            prev_lower_bound = current_lower_bound
            current_lower_bound = bgmm.fit(X).lower_bound_
            assert_greater_equal(current_lower_bound, prev_lower_bound)

            if bgmm.converged_:
                break
        assert bgmm.converged_
Example #3
def test_monotonic_likelihood():
    # Check that each step of EM without regularization monotonically
    # improves the training-set likelihood
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=7)
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=covar_type, reg_covar=0,
                              warm_start=True, max_iter=1, random_state=rng,
                              tol=1e-7)
        current_log_likelihood = -np.inf
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            # Do one training iteration at a time so we can make sure that the
            # training log likelihood increases after each iteration.
            for _ in range(600):
                prev_log_likelihood = current_log_likelihood
                try:
                    current_log_likelihood = gmm.fit(X).score(X)
                except ConvergenceWarning:
                    pass
                assert_greater_equal(current_log_likelihood,
                                     prev_log_likelihood)

                if gmm.converged_:
                    break

            assert gmm.converged_
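Both monotonicity tests above lean on the classical EM guarantee: with no regularization, an EM (or variational) update can never decrease the training objective,

    $\log p(X \mid \theta^{(t+1)}) \ge \log p(X \mid \theta^{(t)})$,

and the variational analogue holds for the evidence lower bound exposed as lower_bound_.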
Example #4
def check_min_weight_fraction_leaf(name, X, y):
    """Test if leaves contain at least min_weight_fraction_leaf of the
    training set"""
    ForestEstimator = FOREST_ESTIMATORS[name]
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        for frac in np.linspace(0, 0.5, 6):
            est = ForestEstimator(min_weight_fraction_leaf=frac,
                                  max_leaf_nodes=max_leaf_nodes,
                                  random_state=0)
            if isinstance(est, (RandomForestClassifier,
                                RandomForestRegressor)):
                est.bootstrap = False
            est.fit(X, y, sample_weight=weights)
            out = est.estimators_[0].tree_.apply(X)
            node_weights = np.bincount(out, weights=weights)
            # drop inner nodes
            leaf_weights = node_weights[node_weights != 0]
            assert_greater_equal(
                np.min(leaf_weights),
                total_weight * est.min_weight_fraction_leaf,
                "Failed with {0} "
                "min_weight_fraction_leaf={1}".format(
                    name, est.min_weight_fraction_leaf))
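The pattern shared by all the min_weight_fraction_leaf checks: `apply(X)` maps each sample to its leaf id, a weighted `np.bincount` then sums sample weight per leaf, and inner nodes show up as zero-weight bins. A minimal self-contained sketch of the same invariant on a single tree (assuming only scikit-learn and the iris data):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
weights = np.random.RandomState(0).rand(X.shape[0])

est = DecisionTreeClassifier(min_weight_fraction_leaf=0.1, random_state=0)
est.fit(X, y, sample_weight=weights)

leaf_ids = est.apply(X)                           # leaf index per sample
node_weights = np.bincount(leaf_ids, weights=weights)
leaf_weights = node_weights[node_weights != 0]    # inner nodes have weight 0
assert leaf_weights.min() >= 0.1 * weights.sum()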
Example #5
def check_min_weight_fraction_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain at least min_weight_fraction_leaf of the
    # training set
    ForestEstimator = FOREST_ESTIMATORS[name]
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for frac in np.linspace(0, 0.5, 6):
        est = ForestEstimator(min_weight_fraction_leaf=frac, n_estimators=1,
                              random_state=0)
        if "RandomForest" in name:
            est.bootstrap = False

        est.fit(X, y, sample_weight=weights)
        out = est.estimators_[0].tree_.apply(X)
        node_weights = np.bincount(out, weights=weights)
        # drop inner nodes
        leaf_weights = node_weights[node_weights != 0]
        assert_greater_equal(
            np.min(leaf_weights),
            total_weight * est.min_weight_fraction_leaf,
            "Failed with {0} "
            "min_weight_fraction_leaf={1}".format(
                name, est.min_weight_fraction_leaf))
Example #6
def test_min_weight_fraction_leaf():
    """Test if leaves contain at least min_weight_fraction_leaf of the
    training set"""
    X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE))
    y = iris.target
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes, name, frac in product((None, 1000),
                                              ALL_TREES,
                                              np.linspace(0, 0.5, 6)):
        TreeEstimator = ALL_TREES[name]
        est = TreeEstimator(min_weight_fraction_leaf=frac,
                            max_leaf_nodes=max_leaf_nodes,
                            random_state=0)
        est.fit(X, y, sample_weight=weights)
        out = est.tree_.apply(X)
        node_weights = np.bincount(out, weights=weights)
        # drop inner nodes
        leaf_weights = node_weights[node_weights != 0]
        assert_greater_equal(
            np.min(leaf_weights),
            total_weight * est.min_weight_fraction_leaf,
            "Failed with {0} "
            "min_weight_fraction_leaf={1}".format(
                name, est.min_weight_fraction_leaf))
Example #7
def test_min_max_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # Constant feature.
    X = np.zeros(5)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_greater_equal(X_scaled.min(), 0.)
    assert_less_equal(X_scaled.max(), 1.)
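For context, MinMaxScaler with the default feature_range=(0, 1) rescales each feature to span [0, 1]; note that recent scikit-learn releases require 2-D input, so the 1-D calls above only run on the older versions this test targets. A minimal sketch of the transform itself (illustrative data):

import numpy as np

X = np.array([[0.0], [1.0], [2.0], [0.4], [1.0]])
X_min, X_max = X.min(axis=0), X.max(axis=0)
# what MinMaxScaler(feature_range=(0, 1)) computes per feature
X_scaled = (X - X_min) / (X_max - X_min)
assert np.isclose(X_scaled.min(), 0.0) and np.isclose(X_scaled.max(), 1.0)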
Example #8
def check_threshold(birch_instance, threshold):
    """Use the leaf linked list for traversal"""
    current_leaf = birch_instance.dummy_leaf_.next_leaf_
    while current_leaf:
        subclusters = current_leaf.subclusters_
        for sc in subclusters:
            assert_greater_equal(threshold, sc.radius)
        current_leaf = current_leaf.next_leaf_
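Usage note: dummy_leaf_ is a private Birch attribute whose next_leaf_ chain threads through every leaf, so the helper above can visit all subclusters without walking the CF-tree recursively. A small usage sketch with made-up data (real Birch API, but private attributes are subject to change):

import numpy as np
from sklearn.cluster import Birch

X = np.random.RandomState(0).randn(200, 2)
brc = Birch(threshold=0.5, n_clusters=None).fit(X)
check_threshold(brc, 0.5)  # no leaf subcluster radius may exceed the threshold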
Example #9
    def test_pearsonr_mat(self):
        pear_mat = pearsonr_mat(self.mat)
        assert_equal(pear_mat.shape, (10, 10))

        pear_mat = pearsonr_mat(self.mat, self.w_mat)
        assert_equal(pear_mat.shape, (10, 10))

        assert_greater_equal(np.min(pear_mat), -1)
        assert_less_equal(np.max(pear_mat), 1)
Example #10
def test_calibration_multiclass():
    """Test calibration for multiclass """
    # test multi-class setting with classifier that implements
    # only decision function
    clf = LinearSVC()
    X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42,
                          centers=3, cluster_std=3.0)

    # Use categorical labels to check that CalibratedClassifierCV supports
    # them correctly
    target_names = np.array(['a', 'b', 'c'])
    y = target_names[y_idx]

    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)
    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=2)
        cal_clf.fit(X_train, y_train)
        probas = cal_clf.predict_proba(X_test)
        assert_array_almost_equal(np.sum(probas, axis=1), np.ones(len(X_test)))

        # Check that the log-loss of the calibrated classifier is smaller
        # than the log-loss of the raw OvR decision function naively turned
        # into probabilities via softmax
        def softmax(y_pred):
            e = np.exp(-y_pred)
            return e / e.sum(axis=1).reshape(-1, 1)

        uncalibrated_log_loss = \
            log_loss(y_test, softmax(clf.decision_function(X_test)))
        calibrated_log_loss = log_loss(y_test, probas)
        assert_greater_equal(uncalibrated_log_loss, calibrated_log_loss)

    # Test that calibration of a multiclass classifier decreases log-loss
    # for RandomForestClassifier
    X, y = make_blobs(n_samples=100, n_features=2, random_state=42,
                      cluster_std=3.0)
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    loss = log_loss(y_test, clf_probs)

    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=3)
        cal_clf.fit(X_train, y_train)
        cal_clf_probs = cal_clf.predict_proba(X_test)
        cal_loss = log_loss(y_test, cal_clf_probs)
        assert_greater(loss, cal_loss)
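Both halves of this test compare multiclass log-loss. As a reference, here is a minimal re-implementation of what log_loss computes for one-hot labels (my own sketch, not scikit-learn's code, which additionally renormalizes after clipping):

import numpy as np

def multiclass_log_loss(y_onehot, probas, eps=1e-15):
    # negative mean log-probability assigned to the true class
    p = np.clip(probas, eps, 1 - eps)
    return -np.mean(np.sum(y_onehot * np.log(p), axis=1))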
Example #11
def test_multiple_init():
    # Test that multiple inits do not perform worse than a single one
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 50, 5, 2
    X = rng.randn(n_samples, n_features)
    for cv_type in COVARIANCE_TYPE:
        train1 = GaussianMixture(n_components=n_components,
                                 covariance_type=cv_type,
                                 random_state=0).fit(X).score(X)
        train2 = GaussianMixture(n_components=n_components,
                                 covariance_type=cv_type,
                                 random_state=0, n_init=5).fit(X).score(X)
        assert_greater_equal(train2, train1)
Example #12
def test_lda_score():
    # Test LDA score for batch training
    # score should be higher after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)

        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
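Here score(X) is the approximate variational bound on the log-likelihood, so extra EM passes should not decrease it. The perplexity tests further below check the same statement through the standard relation (log-likelihood per word, negated and exponentiated):

    $\mathrm{perplexity}(X) = \exp\!\left(-\frac{\log p(X)}{\sum_{d,w} n_{dw}}\right)$

which is why a higher score corresponds to a lower perplexity.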
Example #13
def test_lda_preplexity():
    """
    Test LDA perplexity for batch training
    perplexity should be lower after each iteration
    """
    n_topics, alpha, eta, X = _build_sparse_mtx()
    lda_1 = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0)
    lda_2 = OnlineLDA(n_topics=n_topics, alpha=alpha, eta=eta, random_state=0)

    distr_1 = lda_1.fit_transform(X, max_iters=1)
    prep_1 = lda_1.preplexity(X, distr_1, sub_sampling=False)

    distr_2 = lda_2.fit_transform(X, max_iters=10)
    prep_2 = lda_2.preplexity(X, distr_2, sub_sampling=False)
    assert_greater_equal(prep_1, prep_2)
Example #14
def test_lda_score(method):
    # Test LDA score for batch training
    # score should be higher after each iteration
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit_transform(X)
    score_1 = lda_1.score(X)

    lda_2.fit_transform(X)
    score_2 = lda_2.score(X)
    assert_greater_equal(score_2, score_1)
Example #15
def test_select_fdr_regression():
    """
    Test that fdr heuristic actually has low FDR.
    """
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(n_samples=150,
                               n_features=20,
                               n_informative=n_informative,
                               shuffle=False,
                               random_state=random_state,
                               noise=10)

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression,
                                           mode='fdr',
                                           param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.
        false_discovery_rate = (num_false_positives /
                                (num_true_positives + num_false_positives))
        return false_discovery_rate

    for alpha in [0.001, 0.01, 0.1]:
        for n_informative in [1, 5, 10]:
            # As per Benjamini-Hochberg, the expected false discovery rate
            # should be lower than alpha:
            # FDR = E(FP / (TP + FP)) <= alpha
            false_discovery_rate = np.mean([
                single_fdr(alpha, n_informative, random_state)
                for random_state in range(30)
            ])
            assert_greater_equal(alpha, false_discovery_rate)

            # Make sure that the empirical false discovery rate increases
            # with alpha:
            if false_discovery_rate != 0:
                assert_greater(false_discovery_rate, alpha / 10)
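The quantity averaged over 30 random seeds above is exactly the Benjamini-Hochberg guarantee, with the convention that the ratio is 0 when nothing is selected:

    $\mathrm{FDR} = \mathbb{E}\!\left[\frac{FP}{TP + FP}\right] \le \alpha$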
Example #16
def check_clustering(name, clusterer_orig, readonly_memmap=False):

    clusterer = clone(clusterer_orig)
    X, y = _create_small_ts_dataset()
    X, y = shuffle(X, y, random_state=7)
    X = TimeSeriesScalerMeanVariance().fit_transform(X)
    rng = np.random.RandomState(42)
    X_noise = X + (rng.randn(*X.shape) / 5)

    n_samples, n_features, dim = X.shape
    # catch deprecation and neighbors warnings
    if hasattr(clusterer, "n_clusters"):
        clusterer.set_params(n_clusters=3)
    set_random_state(clusterer)

    # fit
    clusterer.fit(X)
    # with lists
    clusterer.fit(X.tolist())

    pred = clusterer.labels_
    assert_equal(pred.shape, (n_samples,))
    assert_greater(adjusted_rand_score(pred, y), 0.4)

    if clusterer._get_tags()['non_deterministic']:
        return

    set_random_state(clusterer)
    with warnings.catch_warnings(record=True):
        pred2 = clusterer.fit_predict(X)
    assert_array_equal(pred, pred2)

    # fit_predict(X) and labels_ should be of type int
    assert pred.dtype in [np.dtype('int32'), np.dtype('int64')]
    assert pred2.dtype in [np.dtype('int32'), np.dtype('int64')]

    # Add noise to X to test the possible values of the labels
    labels = clusterer.fit_predict(X_noise)

    # There should be at least one sample in every original cluster
    labels_sorted = np.unique(labels)
    assert_array_equal(labels_sorted, np.arange(0, 3))

    # Labels should be no greater than n_clusters - 1
    if hasattr(clusterer, 'n_clusters'):
        n_clusters = getattr(clusterer, 'n_clusters')
        assert_greater_equal(n_clusters - 1, labels_sorted[-1])
Example #17
    def test_prediction_proba(self):
        y_test_predicted = self.clf.predict_proba(self.X_test)
        assert_greater_equal(y_test_predicted.min(), 0)
        assert_less_equal(y_test_predicted.max(), 1)

        # check performance
        assert_greater(roc_auc_score(self.y_test, y_test_predicted[:, 1]),
                       self.roc_floor)

        # check the shape of the output
        n_classes = len(np.unique(self.y_train))
        assert_equal(y_test_predicted.shape, (self.X_test.shape[0], n_classes))

        # check probability sum is 1
        y_test_predicted_sum = np.sum(y_test_predicted, axis=1)
        assert_allclose(np.ones([self.X_test.shape[0], ]),
                        y_test_predicted_sum)
Example #18
File: test_utils.py  Project: AnAnteup/icp4
def test_arpack_eigsh_initialization():
    # Non-regression test that shows null-space computation is better with
    # initialization of eigsh from [-1,1] instead of [0,1]
    random_state = check_random_state(42)

    A = random_state.rand(50, 50)
    A = np.dot(A.T, A)  # create s.p.d. matrix
    A = laplacian(A) + 1e-7 * np.identity(A.shape[0])
    k = 5

    # Test if eigsh is working correctly
    # New initialization [-1,1] (as in original ARPACK)
    # Was [0,1] before, with which this test could fail
    v0 = random_state.uniform(-1, 1, A.shape[0])
    w, _ = eigsh(A, k=k, sigma=0.0, v0=v0)

    # Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest
    assert_greater_equal(w[0], 0)
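The closing assertion rests on a standard fact: for a symmetric positive semi-definite matrix $A$, every eigenpair $(\lambda, v)$ satisfies

    $\lambda = \frac{v^\top A v}{v^\top v} \ge 0$,

so the smallest eigenvalue returned by eigsh must be nonnegative up to numerical error (the $10^{-7}$ identity shift even makes it strictly positive).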
Example #19
def test_arpack_eigsh_initialization():
    # Non-regression test that shows null-space computation is better with 
    # initialization of eigsh from [-1,1] instead of [0,1]
    random_state = check_random_state(42)

    A = random_state.rand(50, 50)
    A = np.dot(A.T, A)  # create s.p.d. matrix
    A = graph_laplacian(A) + 1e-7 * np.identity(A.shape[0])
    k = 5

    # Test if eigsh is working correctly
    # New initialization [-1,1] (as in original ARPACK)
    # Was [0,1] before, with which this test could fail
    v0 = random_state.uniform(-1, 1, A.shape[0])
    w, _ = eigsh(A, k=k, sigma=0.0, v0=v0)

    # Eigenvalues of s.p.d. matrix should be nonnegative, w[0] is smallest
    assert_greater_equal(w[0], 0)
Example #20
def test_select_fdr_regression():
    # Test that fdr heuristic actually has low FDR.
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(
            n_samples=150,
            n_features=20,
            n_informative=n_informative,
            shuffle=False,
            random_state=random_state,
            noise=10,
        )

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.0
        false_discovery_rate = num_false_positives / (num_true_positives + num_false_positives)
        return false_discovery_rate

    for alpha in [0.001, 0.01, 0.1]:
        for n_informative in [1, 5, 10]:
            # As per Benjamini-Hochberg, the expected false discovery rate
            # should be lower than alpha:
            # FDR = E(FP / (TP + FP)) <= alpha
            false_discovery_rate = np.mean(
                [single_fdr(alpha, n_informative, random_state) for random_state in range(100)]
            )
            assert_greater_equal(alpha, false_discovery_rate)

            # Make sure that the empirical false discovery rate increases
            # with alpha:
            if false_discovery_rate != 0:
                assert_greater(false_discovery_rate, alpha / 10)
Example #21
def test_lda_perplexity():
    # Test LDA perplexity for batch training
    # perplexity should be lower after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        distr_1 = lda_1.fit_transform(X)
        perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)

        distr_2 = lda_2.fit_transform(X)
        perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #22
def test_lda_perplexity(method):
    # Test LDA perplexity for batch training
    # perplexity should be lower after each iteration
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit(X)
    perp_1 = lda_1.perplexity(X, sub_sampling=False)

    lda_2.fit(X)
    perp_2 = lda_2.perplexity(X, sub_sampling=False)
    assert_greater_equal(perp_1, perp_2)

    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
    assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #23
def test_lda_perplexity():
    # Test LDA perplexity for batch training
    # perplexity should be lower after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
                                          learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                          learning_method=method,
                                          total_samples=100, random_state=0)
        distr_1 = lda_1.fit_transform(X)
        perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)

        distr_2 = lda_2.fit_transform(X)
        perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #24
def test_filter():
    # Tests the filter function.

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    clust = OPTICS(eps=6.0, min_samples=4, metric='euclidean')
    # Run filter (before computing OPTICS)
    bool_memb = clust.filter(X, 0.5)
    idx_memb = clust.filter(X, 0.5, index_type='idx')
    # Test for equivalence between 'idx' and 'bool' extraction
    assert_equal(sum(bool_memb), len(idx_memb))
    # Compute OPTICS
    clust.fit(X)
    clust.extract(0.5, clustering='dbscan')
    # Core points from filter and extract should agree to within one point,
    # with extract occasionally underestimating due to the starting point of
    # the OPTICS algorithm. Here we test for at least 95% similarity in
    # classification of core/not core
    agree = sum(clust._is_core == bool_memb)
    assert_greater_equal(float(agree)/len(X), 0.95)
Example #25
def check_min_weight_fraction_leaf(name, datasets, sparse=False):
    """Test if leaves contain at least min_weight_fraction_leaf of the
    training set"""
    if sparse:
        X = DATASETS[datasets]["X_sparse"].astype(np.float32)
    else:
        X = DATASETS[datasets]["X"].astype(np.float32)
    y = DATASETS[datasets]["y"]

    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    TreeEstimator = ALL_TREES[name]

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
        est = TreeEstimator(min_weight_fraction_leaf=frac,
                            max_leaf_nodes=max_leaf_nodes,
                            random_state=0)
        est.fit(X, y, sample_weight=weights)

        if sparse:
            out = est.tree_.apply(X.tocsr())
        else:
            out = est.tree_.apply(X)

        node_weights = np.bincount(out, weights=weights)
        # drop inner nodes
        leaf_weights = node_weights[node_weights != 0]
        assert_greater_equal(
            np.min(leaf_weights),
            total_weight * est.min_weight_fraction_leaf,
            "Failed with {0} "
            "min_weight_fraction_leaf={1}".format(
                name, est.min_weight_fraction_leaf))
Example #27
    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
Example #28
    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_proba), self.roc_floor)
Example #29
    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
Example #30
    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
Example #31
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)

    ideal_n_labels_per_fold = n_samples // n_folds

    len(np.unique(labels))
    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = LabelKFold(n_folds=n_folds)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = [
        'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert',
        'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura',
        'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert',
        'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary',
        'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
        'Silvia'
    ]

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next,
                         LabelKFold(n_folds=3).split(X, y, labels))
Example #32
def check_limits(value, lower_bound, upper_bound):
    assert_less_equal(lower_bound, value)
    assert_greater_equal(upper_bound, value)
Example #33
    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
Example #34
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = [
        "Albert",
        "Jean",
        "Bertrand",
        "Michel",
        "Jean",
        "Francis",
        "Robert",
        "Michel",
        "Rachel",
        "Lois",
        "Michelle",
        "Bernard",
        "Marion",
        "Laura",
        "Jean",
        "Rachel",
        "Franck",
        "John",
        "Gael",
        "Anna",
        "Alix",
        "Robert",
        "Marion",
        "David",
        "Tony",
        "Abel",
        "Becky",
        "Madmood",
        "Cary",
        "Mary",
        "Alexandre",
        "David",
        "Francis",
        "Barack",
        "Abdoul",
        "Rasha",
        "Xi",
        "Silvia",
    ]
    labels = np.asarray(labels, dtype=object)

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
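Why the folds are only "approximately" the same size: LabelKFold/GroupKFold assign whole labels to folds, roughly by visiting labels from most to least frequent and dropping each into the currently lightest fold. A rough sketch of that greedy strategy (my own illustration, not the library code):

import numpy as np

def greedy_label_folds(labels, n_folds):
    # each label goes entirely into one fold: biggest labels first,
    # always into the fold that currently holds the fewest samples
    labels = np.asarray(labels)
    unique, counts = np.unique(labels, return_counts=True)
    fold_sizes = np.zeros(n_folds, dtype=int)
    label_to_fold = {}
    for idx in np.argsort(-counts):
        fold = int(np.argmin(fold_sizes))
        fold_sizes[fold] += counts[idx]
        label_to_fold[unique[idx]] = fold
    return np.array([label_to_fold[lab] for lab in labels])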
Example #35
def test_group_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_groups = 15
    n_samples = 1000
    n_splits = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    groups = rng.randint(0, n_groups, n_samples)

    ideal_n_groups_per_fold = n_samples // n_splits

    len(np.unique(groups))
    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = GroupKFold(n_splits=n_splits)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    for group in np.unique(groups):
        assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # Construct the test data
    groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'])

    n_groups = len(np.unique(groups))
    n_samples = len(groups)
    n_splits = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_groups_per_fold = n_samples // n_splits

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for group in np.unique(groups):
            assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # groups can also be a list
    cv_iter = list(lkf.split(X, y, groups.tolist()))
    for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups),
                                                cv_iter):
        assert_array_equal(train1, train2)
        assert_array_equal(test1, test2)

    # Should fail if there are more folds than groups
    groups = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(groups))
    assert_raises_regexp(ValueError, "Cannot have number of splits.*greater",
                         next, GroupKFold(n_splits=3).split(X, y, groups))
Example #36
def test_assert_greater_equal():
    assert_greater_equal(1, 0)
    assert_greater_equal(1, 1)
    assert_raises(AssertionError, assert_greater_equal, 0, 1)
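All of these snippets revolve around the same tiny helper, originally exposed by sklearn.utils.testing as a thin wrapper over unittest's assertGreaterEqual and removed in later scikit-learn releases. A roughly equivalent stand-in, assuming nothing beyond the standard library:

def assert_greater_equal(a, b, msg=None):
    # plain `assert a >= b` with an informative default message
    assert a >= b, msg or "%r is not greater than or equal to %r" % (a, b)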
Example #37
def check_branching_factor(node, branching_factor):
    subclusters = node.subclusters_
    assert_greater_equal(branching_factor, len(subclusters))
    for cluster in subclusters:
        if cluster.child_:
            check_branching_factor(cluster.child_, branching_factor)
Example #38
def test_group_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_groups = 15
    n_samples = 1000
    n_splits = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    groups = rng.randint(0, n_groups, n_samples)

    ideal_n_groups_per_fold = n_samples // n_splits

    len(np.unique(groups))
    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = GroupKFold(n_splits=n_splits)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    for group in np.unique(groups):
        assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # Construct the test data
    groups = np.array([
        'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert',
        'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura',
        'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert',
        'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary',
        'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
        'Silvia'
    ])

    n_groups = len(np.unique(groups))
    n_samples = len(groups)
    n_splits = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_groups_per_fold = n_samples // n_splits

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for group in np.unique(groups):
            assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # groups can also be a list
    cv_iter = list(lkf.split(X, y, groups.tolist()))
    for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups),
                                                cv_iter):
        assert_array_equal(train1, train2)
        assert_array_equal(test1, test2)

    # Should fail if there are more folds than groups
    groups = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(groups))
    assert_raises_regexp(ValueError, "Cannot have number of splits.*greater",
                         next,
                         GroupKFold(n_splits=3).split(X, y, groups))
Example #39
    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_proba), self.roc_floor)
Example #40
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)

    ideal_n_labels_per_fold = n_samples // n_folds

    len(np.unique(labels))
    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = LabelKFold(n_folds=n_folds)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'])

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_labels_per_fold = n_samples // n_folds

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, labels)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for label in np.unique(labels):
            assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in lkf.split(X, y, labels):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(labels))
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, LabelKFold(n_folds=3).split(X, y, labels))
Example #41
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = [
        'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert',
        'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura',
        'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert',
        'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary',
        'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi',
        'Silvia'
    ]
    labels = np.asarray(labels, dtype=object)

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
Example #42
def check_branching_factor(node, branching_factor):
    subclusters = node.subclusters_
    assert_greater_equal(branching_factor, len(subclusters))
    for cluster in subclusters:
        if cluster.child_:
            check_branching_factor(cluster.child_, branching_factor)
Example #43
def test_assert_greater_equal():
    assert_greater_equal(1, 0)
    assert_greater_equal(1, 1)
    assert_raises(AssertionError, assert_greater_equal, 0, 1)
Example #44
    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, proba_method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
Example #45
def test_label_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_labels = 15
    n_samples = 1000
    n_folds = 5

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    labels = rng.randint(0, n_labels, n_samples)
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    labels = np.asarray(labels, dtype=object)
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Construct the test data
    labels = ['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
              'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
              'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
              'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
              'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
              'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis',
              'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia']
    labels = np.asarray(labels, dtype=object)

    n_labels = len(np.unique(labels))
    n_samples = len(labels)
    n_folds = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    folds = cval.LabelKFold(labels, n_folds=n_folds).idxs
    ideal_n_labels_per_fold = n_samples // n_folds

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(labels))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_labels_per_fold))

    # Check that each label appears only in 1 fold
    for label in np.unique(labels):
        assert_equal(len(np.unique(folds[labels == label])), 1)

    # Check that no label is on both sides of the split
    for train, test in cval.LabelKFold(labels, n_folds=n_folds):
        assert_equal(len(np.intersect1d(labels[train], labels[test])), 0)

    # Should fail if there are more folds than labels
    labels = np.array([1, 1, 1, 2, 2])
    assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
Example #46
def check_limits(value, low, high):
    # check if low <= value <= high
    assert_less_equal(low, value)
    assert_greater_equal(high, value)