예제 #1
0
def spectralcluster(A,
                    n_cluster,
                    n_neighbors=6,
                    random_state=None,
                    eigen_tol=0.0):
    """Spectral clustering of a graph given its affinity matrix.

    Follows the ARPACK path of a spectral embedding that keeps the first
    eigenvector (the equivalent of ``drop_first=False``) and then runs
    k-means on the embedded points.

    Parameters
    ----------
    A : array-like or sparse matrix
        Affinity / adjacency matrix handed to ``graph_laplacian``.
    n_cluster : int
        Number of eigenvectors to compute and number of k-means clusters.
    n_neighbors : int, optional
        Not used by this function; kept so existing callers keep working.
    random_state : int seed, RandomState instance, or None
        Seeds both the ARPACK start vector and k-means, so that a fixed
        seed gives a reproducible result.
    eigen_tol : float, optional
        Tolerance passed to ``eigsh``.

    Returns
    -------
    eigenvalues : array
        Eigenvalues exactly as returned by ``eigsh`` for the negated
        Laplacian.
    y : array
        Embedded samples, one row per node and one column per eigenvector.
    centroids, labels
        First two values returned by ``k_means`` on ``y``.
    """
    random_state = check_random_state(random_state)

    # dd is the diagonal returned together with the normalized Laplacian.
    laplacian, dd = graph_laplacian(A, normed=True, return_diag=True)
    # Set the diagonal of the Laplacian before the eigenvalue decomposition.
    laplacian = _set_diag(laplacian, 1)

    # Negate in place and use shift-invert mode around sigma=1.0 with
    # which='LM' (largest magnitude after the shift-invert transform).
    laplacian *= -1
    # Explicit start vector: without it ARPACK picks its own, which makes
    # repeated calls with the same random_state give different embeddings.
    v0 = random_state.uniform(-1, 1, laplacian.shape[0])
    eigenvalues, eigenvectors = eigsh(laplacian,
                                      k=n_cluster,
                                      sigma=1.0,
                                      which='LM',
                                      tol=eigen_tol,
                                      v0=v0)
    # The slice reverses the row order (the start index is clamped to the
    # last row); rows are then rescaled by dd.
    y = eigenvectors.T[n_cluster::-1] * dd
    y = _deterministic_vector_sign_flip(y)[:n_cluster].T

    centroids, labels, _ = k_means(y, n_cluster, random_state=random_state)

    return eigenvalues, y, centroids, labels
예제 #2
0
def get_laplacian_eig(adjacency, dims, normed=True, random_state=None):
    """Return eigenvalues and a spectral embedding of a graph Laplacian.

    Parameters
    ----------
    adjacency : array-like or sparse matrix
        Graph passed to ``sparse.csgraph.laplacian``.
    dims : int
        Number of eigenpairs requested from ``eigsh``.
    normed : bool, optional
        Forwarded to ``sparse.csgraph.laplacian``.
    random_state : int seed, RandomState instance, or None
        Source of the ARPACK start vector.

    Returns
    -------
    lambdas : array
        Eigenvalues as returned by ``eigsh`` for the negated Laplacian.
    embedding : array
        Sign-normalised eigenvectors, one row per node.
    """
    rng = check_random_state(random_state)

    lap, diag = sparse.csgraph.laplacian(
        adjacency, normed=normed, return_diag=True)
    lap = _set_diag(lap, 1, True)
    # Negated in place; eigsh then works in shift-invert mode around 1.0.
    lap *= -1

    start = rng.uniform(-1, 1, lap.shape[0])
    lambdas, vectors = eigsh(
        lap, k=dims, sigma=1.0, which='LM', tol=0.0, v0=start)

    # Reverse the row order of the transposed eigenvectors, rescale by the
    # returned diagonal, then fix the sign of every row.
    flipped = _deterministic_vector_sign_flip(vectors.T[dims::-1] * diag)
    return lambdas, flipped[:dims].T

    
예제 #3
0
def predict_k(affinity_matrix):
    """Estimate the number of clusters from the largest eigengap.

    Computes N - 1 eigenvalues of the normalized Laplacian of
    ``affinity_matrix`` (N being its first dimension) and returns the
    1-based position of the eigenvalue sitting just below the widest gap
    between consecutive eigenvalues. Returns 1 when no gap is positive.
    """
    lap_normed, _ = graph_laplacian(affinity_matrix, normed=True, return_diag=True)
    lap = _set_diag(lap_normed, 1, norm_laplacian=True)

    n_eigs = affinity_matrix.shape[0] - 1

    spectrum, _ = eigsh(-lap, k=n_eigs, which="LM", sigma=1.0, maxiter=5000)
    # Undo the negation and reverse the order returned by eigsh.
    spectrum = -spectrum[::-1]

    widest = 0
    widest_at = 0
    for position, (lower, upper) in enumerate(zip(spectrum[:-1], spectrum[1:])):
        step = upper - lower
        if step > widest:
            widest, widest_at = step, position

    return widest_at + 1
예제 #4
0
def spectralcluster(A, n_cluster, n_neighbors=6, random_state=None, eigen_tol=0.0):
    """Cluster graph nodes: spectral embedding followed by k-means.

    Mirrors the ARPACK path of a spectral embedding that keeps the first
    eigenvector (``drop_first=False``).

    Parameters
    ----------
    A : array-like or sparse matrix
        Affinity / adjacency matrix handed to ``graph_laplacian``.
    n_cluster : int
        Number of eigenvectors to compute and number of k-means clusters.
    n_neighbors : int, optional
        Not used by this function; kept so existing callers keep working.
    random_state : int seed, RandomState instance, or None
        Seeds both the ARPACK start vector and k-means, making results
        reproducible for a fixed seed.
    eigen_tol : float, optional
        Tolerance passed to ``eigsh``.

    Returns
    -------
    eigenvalues : array
        Eigenvalues as returned by ``eigsh`` for the negated Laplacian.
    y : array
        Embedded samples, one row per node and one column per eigenvector.
    centroids, labels
        First two values returned by ``k_means`` on ``y``.
    """
    random_state = check_random_state(random_state)

    # dd is the diagonal returned together with the normalized Laplacian.
    laplacian, dd = graph_laplacian(A, normed=True, return_diag=True)
    # Set the diagonal of the Laplacian before the eigenvalue decomposition.
    laplacian = _set_diag(laplacian, 1)

    # Negate in place; eigsh runs in shift-invert mode around sigma=1.0.
    laplacian *= -1
    # Seeded start vector: ARPACK's own choice is not tied to random_state,
    # so without this the embedding can change from call to call.
    v0 = random_state.uniform(-1, 1, laplacian.shape[0])
    eigenvalues, eigenvectors = eigsh(laplacian, k=n_cluster,
                                      sigma=1.0, which='LM',
                                      tol=eigen_tol, v0=v0)
    # Reverse the row order (slice start is clamped to the last row) and
    # rescale by dd.
    y = eigenvectors.T[n_cluster::-1] * dd
    y = _deterministic_vector_sign_flip(y)[:n_cluster].T

    centroids, labels, _ = k_means(y, n_cluster, random_state=random_state)

    return eigenvalues, y, centroids, labels
예제 #5
0
def predict_k(affinity_matrix):
    """
    Estimate the number of clusters with the eigengap heuristic.

    Parameters
    ----------
    affinity_matrix : array-like or sparse matrix, shape: (n_samples, n_samples)
        Adjacency matrix; each entry measures the similarity between two
        data points.

    Returns
    -------
    k : integer
        Estimated number of clusters: the 1-based position of the
        eigenvalue just below the widest gap between consecutive
        eigenvalues (1 when no gap is positive).

    Notes
    -----
    N - 1 eigenvalues are requested, which may be slow for large inputs.
    The original note for disconnected graphs reads "zero component as
    single cluster"; that behaviour is not checked by this function.

    References
    ----------
    A Tutorial on Spectral Clustering, 2007
        Luxburg, Ulrike
        http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf

    """
    # With normed=True, L = D^(-1/2) * (D - A) * D^(-1/2); otherwise
    # L = D - A. The normalized form is the recommended one.
    lap_normed, _ = graph_laplacian(affinity_matrix, normed=True, return_diag=True)
    lap = _set_diag(lap_normed, 1)

    # Ask for N - 1 eigenvalues.
    n_eigs = affinity_matrix.shape[0] - 1

    # Shift-invert mode, see
    # http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
    # The normalized Laplacian has eigenvalues between 0 and 2, so I - L
    # has eigenvalues between -1 and 1.
    spectrum, _ = eigsh(-lap, k=n_eigs, which="LM", sigma=1.0, maxiter=5000)
    # Reverse the order and undo the sign inversion.
    spectrum = -spectrum[::-1]

    widest = 0
    widest_at = 0
    for position, (lower, upper) in enumerate(zip(spectrum[:-1], spectrum[1:])):
        step = upper - lower
        if step > widest:
            widest, widest_at = step, position

    return widest_at + 1
예제 #6
0
def predict_k(affinity_matrix):
    """
    Predict the number of clusters from the largest eigengap.

    Parameters
    ----------
    affinity_matrix : array-like or sparse matrix, shape: (n_samples, n_samples)
        Adjacency matrix whose entries measure the similarity between
        pairs of data points.

    Returns
    -------
    k : integer
        Estimated number of clusters, i.e. how many eigenvalues lie at or
        below the lower end of the widest gap between consecutive
        eigenvalues (1 when no gap is positive).

    Notes
    -----
    The eigensolver is asked for N - 1 eigenvalues, which can be slow.
    The original note for graphs that are not fully connected reads "zero
    component as single cluster"; this function does not test
    connectivity itself.

    References
    ----------
    A Tutorial on Spectral Clustering, 2007
        Luxburg, Ulrike
        http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf

    """
    # normed=True gives L = D^(-1/2) * (D - A) * D^(-1/2) instead of
    # L = D - A; the normalized variant is recommended.
    normalized, _ = graph_laplacian(affinity_matrix,
                                    normed=True,
                                    return_diag=True)
    operator = _set_diag(normalized, 1)

    # N - 1 eigenvalues are requested.
    how_many = affinity_matrix.shape[0] - 1

    # Shift-invert mode (see
    # http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html).
    # Eigenvalues of the normalized Laplacian lie in [0, 2], those of
    # I - L in [-1, 1].
    values, _ = eigsh(-operator,
                      k=how_many,
                      which="LM",
                      sigma=1.0,
                      maxiter=5000)
    # Reverse and invert the sign.
    values = -values[::-1]

    best_gap = 0
    best_start = 0
    for start in range(values.size - 1):
        current = values[start + 1] - values[start]
        if current > best_gap:
            best_gap = current
            best_start = start

    return best_start + 1
예제 #7
0
def spectral_embedding(laplacian,
                       n_components=8,
                       eigen_solver=None,
                       random_state=None,
                       eigen_tol=1e-20,
                       drop_first=False):
    """Project the samples on the first eigenvectors of a graph Laplacian.

    Variation of scikit-learn's ``spectral_embedding`` that receives an
    already computed Laplacian instead of an adjacency / affinity matrix.

    The Laplacian should be symmetric so that the eigenvector decomposition
    works as expected.

    Parameters
    ----------
    laplacian : array-like or sparse matrix, shape: (n_samples, n_samples)
        The laplacian matrix of the graph to embed.
        NOTE(review): the arpack path sets the diagonal and negates the
        matrix in place (``laplacian *= -1``), so the caller's object may be
        modified - pass a copy if it is needed afterwards.

    n_components : integer, optional, default 8
        The dimension of the projection subspace.

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}, default None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. ``None`` selects 'arpack'. 'amg' is replaced by
        arpack when the Laplacian is not sparse or when
        ``n_nodes < 5 * n_components``.

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used to initialise the lobpcg
        iteration (eigen_solver 'amg' or 'lobpcg').

    eigen_tol : float, optional, default=1e-20
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    drop_first : bool, optional, default=False
        Whether to drop the first eigenvector. When True, one extra
        eigenvector is computed and the first one is discarded.

    Returns
    -------
    embedding : array, shape=(n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If ``eigen_solver`` is unknown, if 'amg' is requested without pyamg
        installed, or if a lobpcg-based solver yields a single-row
        embedding.

    References
    ----------
    * http://en.wikipedia.org/wiki/LOBPCG

    * Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method
      Andrew V. Knyazev
      http://dx.doi.org/10.1137%2FS1064827500366124
    """

    try:
        from pyamg import smoothed_aggregation_solver
    except ImportError:
        if eigen_solver == "amg":
            raise ValueError("The eigen_solver was set to 'amg', but pyamg is "
                             "not available.")

    if eigen_solver is None:
        eigen_solver = 'arpack'
    elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
        raise ValueError("Unknown value for eigen_solver: '%s'."
                         "Should be 'amg', 'arpack', or 'lobpcg'" %
                         eigen_solver)

    random_state = check_random_state(random_state)

    n_nodes = laplacian.shape[0]
    # Whether to drop the first eigenvector
    if drop_first:
        n_components = n_components + 1

    # Diagonal of the input Laplacian, used below to rescale eigenvectors.
    dd = laplacian.diagonal()

    if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
        (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            laplacian *= -1
            lambdas, diffusion_map = eigsh(laplacian,
                                           k=n_components,
                                           sigma=1.0,
                                           which='LM',
                                           tol=eigen_tol)
            embedding = diffusion_map.T[n_components::-1] * dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == 'amg':
        # This branch is an ``elif`` on purpose: when arpack was substituted
        # for amg above, amg must not run again on the negated Laplacian and
        # overwrite the arpack result.
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian,
                                dtype=np.float64,
                                accept_sparse=True)
        laplacian = _set_diag(laplacian, 1)
        ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
        M = ml.aspreconditioner()
        X = random_state.rand(laplacian.shape[0], n_components + 1)
        X[:, 0] = dd.ravel()
        lambdas, diffusion_map = lobpcg(laplacian,
                                        X,
                                        M=M,
                                        tol=1.e-12,
                                        largest=False)
        embedding = diffusion_map.T * dd
        if embedding.shape[0] == 1:
            raise ValueError

    # Separate ``if`` so that the arpack fallback above (which sets
    # eigen_solver to "lobpcg") is picked up here.
    if eigen_solver == "lobpcg":
        # lobpcg needs double precision floats
        laplacian = check_array(laplacian,
                                dtype=np.float64,
                                accept_sparse=True)
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.isspmatrix(laplacian):
                laplacian = laplacian.toarray()
            lambdas, diffusion_map = eigh(laplacian)
            embedding = diffusion_map.T[:n_components] * dd
        else:
            laplacian = _set_diag(laplacian, 1)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian,
                                            X,
                                            tol=1e-15,
                                            largest=False,
                                            maxiter=2000)
            embedding = diffusion_map.T[:n_components] * dd
            if embedding.shape[0] == 1:
                raise ValueError

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
from sklearn.utils.sparsetools import connected_components
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import graph_laplacian
import numpy as np
from sklearn.utils.arpack import eigsh

# Exploratory script: estimate the spectrum of a negated k-NN graph
# Laplacian in two ways (TruncatedSVD vs. ARPACK eigsh), print both, then
# fit a SpectralEmbedding on the same features.
# NOTE(review): `service`, `spectral_embedding_` and `SpectralEmbedding` are
# not imported in the visible part of this file - confirm they are in scope.
app = service.prodbox.CinemaService()

# Feature matrix from the project service (meaning of 15 not visible here).
X = app.getWeightedSearchFeatures(15)

# k-NN graph built with 10 as second argument, and its Laplacian
# (second positional argument True is presumably `normed` - verify).
graph = kneighbors_graph(X, 10)
lap = graph_laplacian(graph, True)

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components = 30, algorithm="arpack")
# Set the Laplacian diagonal to 1, then fit the SVD on the negated matrix.
lap = spectral_embedding_._set_diag(lap, 1)
svd.fit(-lap)

# Diagonal of V * (-L) * V^T for the fitted components V.
# NOTE(review): relies on .todense() returning np.matrix so that `*` is a
# matrix product rather than an element-wise one - confirm.
eigenvalues = np.diag(svd.components_ * (-lap).todense() * svd.components_.T)

# Reference values: 30 eigenvalues of -L from eigsh in shift-invert mode
# around sigma=1.
eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
print(eigenvalues)

print(eigenvalues2)

se = SpectralEmbedding(n_components = 30, eigen_solver='arpack', affinity="nearest_neighbors")
se.fit(X)

app.quit()

# TODO: check budget distribution, draw budget conditionally
예제 #9
0
                          axis='index') * 100  # calculate row percentage
# NOTE(review): this example is an excerpt - the statement that ends on the
# previous line and the definitions of `percent` and `matrix` are not visible.
percent = percent.drop(['total'], axis=1)  # drop total column

# Collect a (column name, column mean) pair for every column of `matrix`.
cat_perc = []
for cat in matrix.columns:
    cat_tuple = (cat, matrix[cat].mean())
    cat_perc.append(cat_tuple)
# sort category percentages in ascending order of the mean
cat_perc = sorted(cat_perc, key=lambda x: x[1])

graph = cosine_similarity(matrix)  # use cosine similarity, as in Noulas et al.

# https://github.com/mingmingyang/auto_spectral_clustering/blob/master/autosp.py
# how to calculate spectral clusters
norm_laplacian, dd = graph_laplacian(graph, normed=True, return_diag=True)
laplacian = _set_diag(norm_laplacian, 1, norm_laplacian=True)
# One eigenvalue fewer than the number of rows of the similarity graph.
n_components = graph.shape[0] - 1

# Eigen-decomposition of the negated Laplacian in shift-invert mode
# (sigma=1.0, which="LM").
eigenvalues, eigenvectors = eigsh(-laplacian,
                                  k=n_components,
                                  which="LM",
                                  sigma=1.0,
                                  maxiter=5000)
# Reverse the order and undo the negation.
eigenvalues = -eigenvalues[::-1]

# Track the widest gap between consecutive eigenvalues.
# NOTE(review): `gap_pre_index` is initialised but never updated in the
# visible lines; the loop body appears to be cut off after `max_gap = gap`.
max_gap = 0
gap_pre_index = 0
for i in range(1, eigenvalues.size):
    gap = eigenvalues[i] - eigenvalues[i - 1]
    if gap > max_gap:
        max_gap = gap
예제 #10
0
    def spectral_embedding(self,
                           adjacency,
                           n_components=8,
                           eigen_solver=None,
                           random_state=None,
                           eigen_tol=0.0,
                           drop_first=True):
        """Spectral embedding of a graph that also returns the eigenvalues.

        See the original at
        https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/manifold/spectral_embedding_.py#L133

        Customizations relative to the original:

        1. the eigenvalues (``lambdas``) are returned with the embedding;
        2. ``norm_laplacian`` is always True.

        Parameters
        ----------
        adjacency : array-like or sparse matrix
            Adjacency / affinity matrix; passed through ``check_symmetric``.
        n_components : int, default 8
            Dimension of the projection subspace.
        eigen_solver : {None, 'arpack', 'lobpcg', 'amg'}, default None
            ``None`` selects 'arpack'. 'amg' needs pyamg and is replaced by
            arpack when the Laplacian is not sparse or when
            ``n_nodes < 5 * n_components``.
        random_state : int seed, RandomState instance, or None
            Used to initialise the lobpcg iteration ('amg' and 'lobpcg').
        eigen_tol : float, default 0.0
            Tolerance passed to ``eigsh`` in the arpack path.
        drop_first : bool, default True
            When True, one extra eigenvector is computed and the first one
            is dropped from the returned embedding.

        Returns
        -------
        embedding : array
            One row per node, one column per kept eigenvector.
        lambdas : array
            Eigenvalues exactly as produced by the solver that ran. In the
            arpack path they belong to the negated Laplacian; in the small
            problem lobpcg shortcut ``eigh`` returns the full spectrum.

        Raises
        ------
        ValueError
            If ``eigen_solver`` is unknown, if 'amg' is requested without
            pyamg, or if a lobpcg-based solver yields a single-row embedding.
        """
        norm_laplacian = True
        adjacency = check_symmetric(adjacency)

        try:
            from pyamg import smoothed_aggregation_solver
        except ImportError:
            if eigen_solver == "amg":
                raise ValueError(
                    "The eigen_solver was set to 'amg', but pyamg is "
                    "not available.")

        if eigen_solver is None:
            eigen_solver = 'arpack'
        elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
            raise ValueError("Unknown value for eigen_solver: '%s'."
                             "Should be 'amg', 'arpack', or 'lobpcg'" %
                             eigen_solver)

        random_state = check_random_state(random_state)

        n_nodes = adjacency.shape[0]
        # Whether to drop the first eigenvector
        if drop_first:
            n_components = n_components + 1

        if not _graph_is_connected(adjacency):
            warnings.warn("Graph is not fully connected, spectral embedding"
                          " may not work as expected.")

        laplacian, dd = graph_laplacian(adjacency,
                                        normed=norm_laplacian,
                                        return_diag=True)
        if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
            (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)):
            # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
            # for details see the source code in scipy:
            # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
            # /lobpcg/lobpcg.py#L237
            # or matlab:
            # http://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
            laplacian = _set_diag(laplacian, 1, norm_laplacian)

            # Here we'll use shift-invert mode for fast eigenvalues
            # (see http://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
            #  for a short explanation of what this means)
            # Because the normalized Laplacian has eigenvalues between 0 and 2,
            # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
            # when finding eigenvalues of largest magnitude (keyword which='LM')
            # and when these eigenvalues are very large compared to the rest.
            # For very large, very sparse graphs, I - L can have many, many
            # eigenvalues very near 1.0.  This leads to slow convergence.  So
            # instead, we'll use ARPACK's shift-invert mode, asking for the
            # eigenvalues near 1.0.  This effectively spreads-out the spectrum
            # near 1.0 and leads to much faster convergence: potentially an
            # orders-of-magnitude speedup over simply using keyword which='LA'
            # in standard mode.
            try:
                # We are computing the opposite of the laplacian inplace so as
                # to spare a memory allocation of a possibly very large array
                laplacian *= -1
                lambdas, diffusion_map = eigsh(laplacian,
                                               k=n_components,
                                               sigma=1.0,
                                               which='LM',
                                               tol=eigen_tol)
                embedding = diffusion_map.T[n_components::-1] * dd

            except RuntimeError:
                # When submatrices are exactly singular, an LU decomposition
                # in arpack fails. We fallback to lobpcg
                eigen_solver = "lobpcg"
                # Revert the laplacian to its opposite to have lobpcg work
                laplacian *= -1

        elif eigen_solver == 'amg':
            # This branch is an ``elif`` on purpose: when arpack was
            # substituted for amg above, amg must not run again on the
            # negated Laplacian and overwrite the arpack result.
            # Use AMG to get a preconditioner and speed up the eigenvalue
            # problem.
            if not sparse.issparse(laplacian):
                warnings.warn("AMG works better for sparse matrices")
            # lobpcg needs double precision floats
            laplacian = check_array(laplacian,
                                    dtype=np.float64,
                                    accept_sparse=True)
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
            M = ml.aspreconditioner()
            X = random_state.rand(laplacian.shape[0], n_components + 1)
            X[:, 0] = dd.ravel()
            lambdas, diffusion_map = lobpcg(laplacian,
                                            X,
                                            M=M,
                                            tol=1.e-12,
                                            largest=False)
            embedding = diffusion_map.T * dd
            if embedding.shape[0] == 1:
                raise ValueError

        # Separate ``if`` so that the arpack fallback above (which sets
        # eigen_solver to "lobpcg") is picked up here.
        if eigen_solver == "lobpcg":
            # lobpcg needs double precision floats
            laplacian = check_array(laplacian,
                                    dtype=np.float64,
                                    accept_sparse=True)
            if n_nodes < 5 * n_components + 1:
                # see note above under arpack why lobpcg has problems with small
                # number of nodes
                # lobpcg will fallback to eigh, so we short circuit it
                if sparse.isspmatrix(laplacian):
                    laplacian = laplacian.toarray()
                lambdas, diffusion_map = eigh(laplacian)
                embedding = diffusion_map.T[:n_components] * dd
            else:
                laplacian = _set_diag(laplacian, 1, norm_laplacian)
                # We increase the number of eigenvectors requested, as lobpcg
                # doesn't behave well in low dimension
                X = random_state.rand(laplacian.shape[0], n_components + 1)
                X[:, 0] = dd.ravel()
                lambdas, diffusion_map = lobpcg(laplacian,
                                                X,
                                                tol=1e-15,
                                                largest=False,
                                                maxiter=2000)
                embedding = diffusion_map.T[:n_components] * dd
                if embedding.shape[0] == 1:
                    raise ValueError

        embedding = _deterministic_vector_sign_flip(embedding)
        if drop_first:
            return embedding[1:n_components].T, lambdas
        else:
            return embedding[:n_components].T, lambdas
예제 #11
0
from sklearn.utils.sparsetools import connected_components
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import graph_laplacian
import numpy as np
from sklearn.utils.arpack import eigsh

# Exploratory script: estimate the spectrum of a negated k-NN graph
# Laplacian in two ways (TruncatedSVD vs. ARPACK eigsh), print both, then
# fit a SpectralEmbedding on the same features.
# NOTE(review): `service`, `spectral_embedding_` and `SpectralEmbedding` are
# not imported in the visible part of this file - confirm they are in scope.
app = service.prodbox.CinemaService()

# Feature matrix from the project service (meaning of 15 not visible here).
X = app.getWeightedSearchFeatures(15)

# k-NN graph built with 10 as second argument, and its Laplacian
# (second positional argument True is presumably `normed` - verify).
graph = kneighbors_graph(X, 10)
lap = graph_laplacian(graph, True)

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=30, algorithm="arpack")
# Set the Laplacian diagonal to 1, then fit the SVD on the negated matrix.
lap = spectral_embedding_._set_diag(lap, 1)
svd.fit(-lap)

# Diagonal of V * (-L) * V^T for the fitted components V.
# NOTE(review): relies on .todense() returning np.matrix so that `*` is a
# matrix product rather than an element-wise one - confirm.
eigenvalues = np.diag(svd.components_ * (-lap).todense() * svd.components_.T)

# Reference values: 30 eigenvalues of -L from eigsh in shift-invert mode
# around sigma=1.
eigenvalues2, _ = eigsh(-lap, k=30, which='LM', sigma=1)
print(eigenvalues)

print(eigenvalues2)

se = SpectralEmbedding(n_components=30,
                       eigen_solver='arpack',
                       affinity="nearest_neighbors")
se.fit(X)

app.quit()