Пример #1
0
def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
    """Ensure discretize surfaces a LinAlgError instead of retrying forever.

    Non-regression test for #21380
    """
    # Stub out np.linalg.svd with a version that never converges, so every
    # restart inside discretize fails.
    def always_failing_svd(*_args, **_kwargs):
        raise LinAlgError()

    monkeypatch.setattr(np.linalg, "svd", always_failing_svd)

    data = np.ones((10, 4))
    # discretize must give up and re-raise rather than looping endlessly.
    with pytest.raises(LinAlgError, match="SVD did not converge"):
        discretize(data)
Пример #2
0
def __regularized_spectral_clustering(adj_matrix,
                                      tau,
                                      n_clusters,
                                      algo="scan"):
    """Cluster a graph using the spectrum of its regularized Laplacian.

    :param adj_matrix: adjacency matrix of the graph; [m][n] > 0 means an
        edge of that weight between nodes m and n
    :param tau: regularization constant used when building the Laplacian
    :param n_clusters: number of clusters to partition into
    :param algo: separation algorithm, either "kmeans++" or "scan"
    :return: (labels, number of clustering iterations,
        size of the smallest cluster found)
    """
    from sklearn.cluster import k_means
    from sklearn.cluster._spectral import discretize

    laplacian = __regularized_laplacian_matrix(adj_matrix, tau)
    eigen_values, eigen_vectors = __eigen_solver(laplacian,
                                                 n_clusters=n_clusters)

    if algo == "kmeans++":
        _, labels, _, num_iterations = k_means(eigen_vectors,
                                               n_clusters=n_clusters,
                                               return_n_iter=True)
    elif n_clusters == 2:
        # Two-way split: the sign of each entry of the eigenvector for the
        # second-smallest eigenvalue decides the partition.
        second_index = np.argsort(eigen_values)[1]
        second_vector = eigen_vectors.T[second_index]
        labels = [1 if entry > 0 else 0 for entry in second_vector]
        num_iterations = 1
    else:
        # k-way split: discretize over all eigenvectors.
        labels = discretize(eigen_vectors)
        num_iterations = 20  # assume worst case scenario that it tooks 20 restarts

    # NOTE(review): this size computation only makes sense for binary 0/1
    # labels; for n_clusters > 2 it is a rough proxy — confirm intent.
    ones_count = np.sum(labels)
    smallest_cluster_size = min(ones_count, abs(ones_count - len(labels)))
    return labels, num_iterations, smallest_cluster_size
Пример #3
0
def test_discretize(n_samples):
    """Check that discretize recovers labels from a noisy assignment matrix.

    Builds a one-hot class-indicator matrix from random labels, perturbs it
    with Gaussian noise, and asserts that discretize maps it back to labels
    that closely match the originals (adjusted Rand index > 0.8).
    """
    random_state = np.random.RandomState(seed=8)
    for n_class in range(2, 10):
        # random class labels
        y_true = random_state.randint(0, n_class + 1, n_samples)
        # `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `float` is the drop-in replacement.
        y_true = np.array(y_true, float)
        # noise class assignment matrix
        y_indicator = sparse.coo_matrix(
            (np.ones(n_samples), (np.arange(n_samples), y_true)),
            shape=(n_samples, n_class + 1))
        y_true_noisy = (y_indicator.toarray() +
                        0.1 * random_state.randn(n_samples, n_class + 1))
        # random_state must be passed by keyword: discretize's second
        # positional parameter is `copy`, so passing it positionally would
        # silently discard the RNG and make the test non-deterministic.
        y_pred = discretize(y_true_noisy, random_state=random_state)
        assert adjusted_rand_score(y_true, y_pred) > 0.8