Example #1
def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a cluster (choice of "kmeans" or "spectral")
    Make a plot of all word-vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)

    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
Example #2
def spectral_clustering(matrix, N):
    spectral = SpectralClustering(n_clusters=N)
    clusters = spectral.fit_predict(matrix)
    res = [[] for _ in range(N)]
    for i, c in enumerate(clusters):
        res[c].append(i)
    return res
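A minimal, hedged usage sketch for the helper above; the data and cluster count are invented:

import numpy as np
points = np.random.RandomState(0).rand(12, 2)  # toy data
groups = spectral_clustering(points, 3)        # three lists of row indices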
def create_word2vec_cluster(word2vec_model):
    word_vectors = word2vec_model.syn0
    num_clusters = word_vectors.shape[0] // 1000  # SpectralClustering requires an integer
    spectral_cluster_model = SpectralClustering(n_clusters=num_clusters)
    idx = spectral_cluster_model.fit_predict(word_vectors)
    pickle.dump(spectral_cluster_model, open(r"C:\Ofir\Tau\Machine Learning\Project\project\k_means_model.pkl", "wb"))
    return spectral_cluster_model
Example #4
def call_spectral(num_cluster ,mode_, data, update_flag):
    X = StandardScaler().fit_transform(data)
    spectral = SpectralClustering(n_clusters=num_cluster, eigen_solver='arpack',
                                  affinity='precomputed')
    connectivity = kneighbors_graph(X, n_neighbors=10)
    connectivity = 0.5 * (connectivity + connectivity.T)
    spectral.fit(connectivity)
    labels = spectral.labels_

    if update_flag:
        return labels


    label_dict = {}
    for count, label in enumerate(labels):
        label_dict[str(count)] = float(label)
    print(label_dict)

    unique_dict = {}
    for count, uniq in enumerate(np.unique(labels)):
        print(uniq)
        unique_dict[str(count)] = float(uniq)
    print(unique_dict)

    return label_dict, unique_dict
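A hedged call of call_spectral above (toy data; the mode_ argument is accepted but unused by the function):

import numpy as np
labels = call_spectral(4, None, np.random.RandomState(0).rand(60, 3), update_flag=True)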
def main(cm_file, perm_file, steps, labels_file, limit_classes=None):
    """Run optimization and generate output."""
    # Load confusion matrix
    with open(cm_file) as f:
        cm = json.load(f)
        cm = np.array(cm)

    # Load labels
    if os.path.isfile(labels_file):
        with open(labels_file, "r") as f:
            labels = json.load(f)
    else:
        labels = list(range(len(cm)))

    n_clusters = 14  # hyperparameter
    spectral = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    spectral.fit(cm)
    if hasattr(spectral, 'labels_'):
        y_pred = spectral.labels_.astype(int)  # np.int was removed from NumPy
    else:
        y_pred = spectral.predict(cm)
    sscore = silhouette_score(cm, y_pred)
    print("silhouette_score={} with {} clusters"
          .format(sscore, n_clusters))
    grouping = [[] for _ in range(n_clusters)]
    for label, y in zip(labels, y_pred):
        grouping[y].append(label)
    for group in grouping:
        print("  {}: {}".format(len(group), group))
def spectral_clustering(G, graph_name, num_clusters):
    # TODO: determine the number of clusters automatically
    subgraphs = []
    write_directory = os.path.join(Constants.SPECTRAL_PATH,graph_name)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)
    nodeList = list(G.nodes())
    matrix_data = nx.to_numpy_array(G, nodelist=nodeList)  # to_numpy_matrix was removed in networkx 3.0
    spectral = SpectralClustering(n_clusters=2,  # note: the num_clusters argument is ignored here
                                  eigen_solver='arpack',
                                  affinity="rbf")
    spectral.fit(matrix_data)
    label = spectral.labels_
    clusters = {}
    
    for nodeIndex, nodeLabel in enumerate(label):
        if nodeLabel not in clusters:
            clusters[nodeLabel] = []
        clusters[nodeLabel].append(nodeList[nodeIndex])
        
    #countNodes is used to test whether we have all the nodes in the clusters 
   
    for clusterIndex, subGraphNodes in enumerate(clusters.keys()):
        subgraph = G.subgraph(clusters[subGraphNodes])
        subgraphs.append(subgraph)
        nx.write_gexf(subgraph, os.path.join(write_directory,graph_name+str(clusterIndex)+"_I"+Constants.GEXF_FORMAT))
        #countNodes = countNodes + len(clusters[subGraphNodes])
    return subgraphs
def fast_app_spe_cluster(data, label, k, n_cluster):
    # k-means picks the representative (center) points
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    k_means = KMeans(n_clusters=k)
    k_means.fit(data)
    y_centers = k_means.cluster_centers_
    # get the correspondence table
    x_to_centers_table = list()
    m = len(data)
    for i in range(m):
        min_distance = np.inf
        min_index = None
        for j in range(k):
            i_j_dis = np.sum((data[i, :] - y_centers[j, :]) ** 2)
            if min_distance > i_j_dis:
                min_index = j
                min_distance = i_j_dis
        x_to_centers_table.append(min_index)
    # spectral cluster
    spe_cluster = SpectralClustering(n_clusters=n_cluster)
    spe_cluster.fit(y_centers)
    spe_label = spe_cluster.labels_
    # get m-way cluster membership
    x_label = list()
    for i in range(m):
        x_label.append(spe_label[x_to_centers_table[i]])
    spend_time = time.perf_counter() - start_time
    print("elapsed time: %f seconds" % spend_time)
    return x_label
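A hedged usage of fast_app_spe_cluster on toy blobs (the label argument is accepted but unused by the function):

from sklearn.datasets import make_blobs
data, y = make_blobs(n_samples=400, centers=3, random_state=0)
x_label = fast_app_spe_cluster(data, y, k=40, n_cluster=3)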
Example #8
    def compute_centroid_set(self, **kwargs):

        INPUT_ITR = subset_iterator(X=self.docv, m=self.subcluster_m, repeats=self.subcluster_repeats)

        kn = self.subcluster_kn
        clf = SpectralClustering(n_clusters=kn, affinity="precomputed")

        C = []

        for X in INPUT_ITR:
            # Remove any rows that have zero vectors
            bad_row_idx = (X ** 2).sum(axis=1) == 0
            X = X[~bad_row_idx]
            A = cosine_affinity(X)

            labels = clf.fit_predict(A)

            # Compute the centroids
            (N, dim) = X.shape
            centroids = np.zeros((kn, dim))

            for i in range(kn):
                idx = labels == i
                mu = X[idx].mean(axis=0)
                mu /= np.linalg.norm(mu)
                centroids[i] = mu

            C.append(centroids)

        return np.vstack(C)
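cosine_affinity() is not defined in the snippet above; a plausible stand-in (an assumption, not the source's helper) shifts cosine similarity into [0, 1] so it can serve as a precomputed affinity:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_affinity(X):
    # map cosine similarity from [-1, 1] into [0, 1]
    return (cosine_similarity(X) + 1.0) / 2.0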
def spectral_clustering(k, X, G, W=None, run_times=5):
    if W is None:
        W = np.eye(len(X))
    W2 = np.sqrt(W)
    Gtilde = W2.dot(G.dot(W2))
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(Gtilde)
    return zh
Example #10
 def run(self, features, number_of_clusters=2, restarts=10, delta=3.0):
     if number_of_clusters == 1:
         result = numpy.zeros(len(features), dtype=numpy.int32)
         return [result]
     classifier = SpectralClustering(n_clusters=number_of_clusters, n_init=restarts)  # 'k=' was the pre-0.17 parameter name
     similarity = get_similarity(features, delta)
     classifier.fit(similarity)
     return [classifier.labels_]
Example #11
def spectral_clustering2(similarity, concepts=2, euclid=False):
    if euclid:
        model = SpectralClustering(n_clusters=concepts, affinity='nearest_neighbors')
        return model.fit_predict(similarity)
    else:
        model = SpectralClustering(n_clusters=concepts, affinity='precomputed')
        similarity[similarity < 0] = 0
        return model.fit_predict(similarity)
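Hedged usage of spectral_clustering2 showing both branches (toy inputs invented for illustration):

import numpy as np
pts = np.random.RandomState(1).rand(20, 2)
labels_a = spectral_clustering2(pts, concepts=2, euclid=True)   # raw feature rows
sim = pts @ pts.T                                               # toy similarity matrix
labels_b = spectral_clustering2(sim, concepts=2, euclid=False)  # precomputed affinity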
Example #12
def get_coregulatory_states(corr_matrices, similarity_matrix, n_clusters):
    spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = spectral.fit_predict(similarity_matrix)

    coreg_states = {}
    for ci in np.unique(labels):
        coreg_states[ci] = corr_matrices[labels == ci, :, :].mean(axis=0)
    return coreg_states, labels
def spectral(k, X, G, run_times=10):
    """Spectral clustering from sklearn library. 
    run_times is the number of times the algorithm is gonna run with different
    initializations.
    
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(G)
    return zh
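A minimal sketch of calling spectral() above with a precomputed RBF affinity (data invented):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.RandomState(0).rand(30, 2)
G = rbf_kernel(X, gamma=1.0)   # precomputed affinity matrix
zh = spectral(3, X, G, run_times=10)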
Example #14
def dist_spectral(x, y):

    # pair up the coordinate vectors; `dataset` comes from the enclosing scope
    plot = []
    for s in range(dataset.shape[0]):
        plot.append(np.array([x[s], y[s]]))
    plot = np.array(plot)
    spectral = SpectralClustering(n_clusters=3, eigen_solver='arpack', affinity="nearest_neighbors")
    clusters = spectral.fit_predict(plot)
    return clusters
Example #15
def spectral_clustering(S,X,config):
    '''
    Computes spectral clustering with a cosine affinity built from X;
    the input similarity matrix S is accepted but unused here.
    Returns the labels associated with the clustering.
    '''
    from sklearn.cluster import SpectralClustering

    nk = int(config["n_clusters"])
    clf = SpectralClustering(affinity='cosine',n_clusters=nk)
    return clf.fit_predict(X)
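Hedged usage: config is assumed to be any mapping with an "n_clusters" entry; note that the function clusters X and ignores S:

import numpy as np
X = np.random.RandomState(0).rand(25, 4)
labels = spectral_clustering(S=None, X=X, config={"n_clusters": 3})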
Example #16
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
Example #17
def cluster_faces_CNN(name = '9_8913259@N03', img_list = 'faces_list.txt'):
    root = '/Users/wangyufei/Documents/Study/intern_adobe/face_recognition_CNN/'+name + '/'
    f = open(root + model_name + 'similarity_matrix.cPickle', 'rb')  # binary mode for pickle
    affinity_matrix = cPickle.load(f)
    f.close()

    f = SpectralClustering(affinity='precomputed', n_clusters=min(8, affinity_matrix.shape[0] - 1), eigen_solver = 'arpack', n_neighbors=min(5, affinity_matrix.shape[0]))
    a = f.fit_predict(affinity_matrix)

    groups = {}
    temp = zip(a, range(len(a)))
    for i in temp:
        if i[0] not in groups:
            groups[i[0]] = [i[1]]
        else:
            groups[i[0]].append(i[1])
    unique_person_id = []
    for kk in groups:
        min_similarity = np.inf
        max_similarity = -np.inf
        mean_similarity = 0
        this_group_ids = groups[kk]
        for j in range(len(this_group_ids)):
            for i in range(j + 1, len(this_group_ids)):
                temp = affinity_matrix[this_group_ids[i],this_group_ids[j]]
                if temp < min_similarity:
                    min_similarity = temp
                if temp > max_similarity:
                    max_similarity = temp
                mean_similarity += temp
        mean_similarity /= max(1, len(this_group_ids)*(len(this_group_ids) - 1) / 2)
        print(len(this_group_ids), mean_similarity, max_similarity, min_similarity)
        if mean_similarity > 0.5:
            unique_person_id.append(kk)
    important_person = []
    for i in unique_person_id:
        important_person.append([i, len(groups[i])])
    important_person.sort(key = lambda x:x[1], reverse=True)
    in_path = root + img_list
    imgs_list = []
    with open(in_path, 'r') as data:
        for line in data:
            line = line[:-1]
            imgs_list.append(line.split('/')[-1])

    temp = zip(a, imgs_list)
    face_groups = {}
    for i in temp:
        if i[0] not in face_groups:
            face_groups[i[0]] = [i[1]]
        else:
            face_groups[i[0]].append(i[1])

    create_face_group_html_CNN(name, face_groups, important_person)
def spectral(k, X, G, z, run_times=10):
    """Spectral clustering from sklearn library. 
    run_times is the number of times the algorithm is gonna run with different
    initializations.
    
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(G)
    a = metric.accuracy(z, zh)
    v = metric.variation_information(z, zh)
    return a, v
Example #19
	def run(self, k):
		if self.data_is_kernel:
			clf = SpectralClustering(n_clusters=k, gamma=self.gammav, affinity='precomputed')	
			self.allocation = clf.fit_predict(self.X)
			self.kernel = self.X
		else:
			clf = SpectralClustering(n_clusters=k, gamma=self.gammav)		#, affinity='precomputed'
			self.allocation = clf.fit_predict(self.X)
			self.kernel = clf.affinity_matrix_
	
		return self.allocation
def spectral_clustering(crime_rows, column_names, num_clusters, affinity='rbf', n_neighbors=0,
        assign_labels='kmeans'):
    """
        n_clusters : integer, optional
            The dimension of the projection subspace.
        affinity : string, array-like or callable, default 'rbf'
            If a string, this may be one of 'nearest_neighbors', 'precomputed', 'rbf'
            or one of the kernels supported by sklearn.metrics.pairwise_kernels.
            Only kernels that produce similarity scores 
                (non-negative values that increase with similarity) should be used. 
                This property is not checked by the clustering algorithm.
        gamma : float
            Scaling factor of RBF, polynomial, exponential chi^2 and sigmoid affinity kernel. 
            Ignored for affinity='nearest_neighbors'.
        degree : float, default=3
            Degree of the polynomial kernel. Ignored by other kernels.
        coef0 : float, default=1
            Zero coefficient for polynomial and sigmoid kernels. Ignored by other kernels.
        n_neighbors : integer
            Number of neighbors to use when constructing the affinity matrix 
            using the nearest neighbors method. Ignored for affinity='rbf'.
        n_init : int, optional, default: 10
            Number of time the k-means algorithm will be run with different 
                centroid seeds. 
            The final results will be the best output of n_init consecutive runs in 
                terms of inertia.
        assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
            The strategy to use to assign labels in the embedding space. 
            There are two ways to assign labels after the laplacian embedding. 
            k-means can be applied and is a popular choice. 
            But it can also be sensitive to initialization. 
            Discretization is another approach which is less sensitive to 
            random initialization.
        kernel_params : dictionary of string to any, optional
            Parameters (keyword arguments) and values for kernel passed 
                as callable object. Ignored by other kernels.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    #crime_xy = [crime[1:] for crime in crime_rows]
    spectral_clustering = SpectralClustering(
            n_clusters=num_clusters, 
            affinity=affinity, 
            n_neighbors=n_neighbors, 
            assign_labels=assign_labels)
    print("Running spectral clustering....")
    print("length crimexy")
    print(len(crime_xy))
    spectral_clustering_labels = spectral_clustering.fit_predict(
            random_sampling(crime_xy, num_samples=3000))
    print("Formatting......")
    return _format_clustering(spectral_clustering_labels, crime_xy, crime_info,
            column_names, num_clusters=num_clusters)
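A hypothetical call, assuming crime_rows holds (x, y, extra...) tuples and that the random_sampling and _format_clustering helpers come from the surrounding module:

rows = [(0.10, 0.20, "theft"), (0.50, 0.90, "assault"), (0.30, 0.40, "theft")]  # in practice many more rows
clusters = spectral_clustering(rows, column_names=["x", "y", "type"],
                               num_clusters=2, affinity='rbf')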
Example #21
def predictSpectralClustering(X, y, n=2, val='rbf'):
    ranX, ranY = shuffle(X, y, random_state=0)
    X = ranX[:600, ]  # use the shuffled copies (the original sliced the unshuffled arrays)
    y = ranY[:600, ]
    sc = SpectralClustering(n_clusters=n)
    results = sc.fit_predict(X)
    gini = compute_gini(results)
    if n == 2:
        same = calculate_score(results, y)
        opp = calculate_score(results, y, True)
        return (results, max(same, opp), gini)
    else:
        return (results, 0, gini)
def run_clustering(methods, cases):
    true_method_groups = [m[1] for m in methods]
    edge_model = GraphLassoCV(alphas=4, n_refinements=5, n_jobs=3, max_iter=100)
    edge_model.fit(cases)
    CV = edge_model.covariance_
    
    num_clusters=3
    spectral = SpectralClustering(n_clusters=num_clusters,affinity='precomputed') 
    spectral.fit(np.asarray(CV))
    spec_sort=np.argsort(spectral.labels_)
    
    for i, m in enumerate(methods):
        print("%s:%d\t%s" % (m[1], spectral.labels_[i], m[0]))
    print("Adj. Rand Score: %f" % adjusted_rand_score(spectral.labels_, true_method_groups))
Example #23
def spectral_clustering(vectors: list, num_rows, k):
    matrix = []
    ## num_rows X len(vectors)
    for s in range(num_rows):
        row = []
        for v in vectors:
            row.append(v[s])
        matrix.append(np.array(row))

    matrix = np.array(matrix)

    spectral = SpectralClustering(n_clusters=k, eigen_solver='arpack', affinity="nearest_neighbors")
    clusters = spectral.fit_predict(matrix)
    return clusters
Example #24
def eval_k(max_k):
    a_score, idx = [], []
    for k in range(2, max_k + 1):
        print('k={}'.format(k))
        est = SpectralClustering(n_clusters=k, affinity='nearest_neighbors')
#         est = SpectralClustering(n_clusters=k, affinity='rbf', gamma=0.00001)
        est.fit(x)
        ari = metrics.adjusted_rand_score(y, est.labels_)
        print(ari)
        a_score.append(ari)
        idx.append(k)
    pl.plot(idx, a_score)
    pl.xlabel('# of clusters')
    pl.ylabel('ARI')
    pl.show()
Example #25
    def _small_partition(self, data):
        _logger.debug("Running _small_partition on %s observations", len(data))

        similarity = self._get_similarity(data, sparse = self.sparse_similarity)
        _logger.debug("Spectral clustering")
        spc_obj = SpectralClustering(n_clusters=2, affinity='precomputed',
            assign_labels='discretize')
        partition = spc_obj.fit_predict(similarity)
        _logger.debug("Done spectral clustering")

        sizes = [len(partition[partition == x]) for x in [0, 1]]
        _logger.debug("Result of _small_partition: #0: {}, #1: {}" \
            .format(*sizes))

        return partition
Example #26
def spectral(X, num_clusters):
    """
    Spectral Clustering on X for response y
    Returns array of cluster groups
    """
    model = SpectralClustering(
        n_clusters=num_clusters,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        n_neighbors=4,
        assign_labels="discretize",
    )
    cleanX = preprocessing.scale(X.to_numpy())  # as_matrix() was removed in pandas 1.0
    model.fit(cleanX)
    return model.labels_
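Hedged usage of the spectral() helper in Example #26 above, on a small toy DataFrame:

import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.RandomState(0).rand(40, 3))
groups = spectral(df, num_clusters=2)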
def spectral(x, num_clusters):
  spec = SpectralClustering(
    affinity='rbf', # 'rbf'
    n_clusters=num_clusters,
    n_init=10,
    assign_labels='kmeans', 
    gamma=1.0, 
    degree=3, 
    coef0=1
  )
  spec.fit(x)

  c = spec.labels_
  k = len(np.unique(c))

  return spec, (None, c, k)
Example #28
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]],
                      cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, sp.fit, X)
Example #29
def test_affinities():
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01
                      )
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    assert_warns_message(UserWarning, 'not fully connected', sp.fit, X)
    assert_equal(adjusted_rand_score(y, sp.labels_), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    X = check_random_state(10).rand(10, 5) * 10

    kernels_available = kernel_metrics()
    for kern in kernels_available:
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kern != 'additive_chi2':
            sp = SpectralClustering(n_clusters=2, affinity=kern,
                                    random_state=0)
            labels = sp.fit(X).labels_
            assert_equal((X.shape[0],), labels.shape)

    sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1,
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal((X.shape[0],), labels.shape)

    def histogram(x, y, **kwargs):
        """Histogram kernel implemented as a callable."""
        assert_equal(kwargs, {})    # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal((X.shape[0],), labels.shape)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, sp.fit, X)
Example #30
def compute_spectral_clustering(n_vertex, edge_list, n_clusters):

    from sklearn.cluster import SpectralClustering

    clst = SpectralClustering(n_clusters, affinity="precomputed")

    adjacency_matrix = tf.compute_adjacency_matrix(n_vertex, edge_list)

    t = time.time()
    labels = clst.fit_predict(adjacency_matrix)  # fit_predict takes no n_clusters argument
    exectime = time.time() - t

    labels = tf.compute_normal_labels(labels)

    clusters = tf.compute_clusters_from_labels(labels)

    return labels, clusters, exectime
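tf.compute_adjacency_matrix is not shown above; a minimal dense stand-in for an undirected edge list might look like this (an assumption, not the source's tf module):

import numpy as np

def compute_adjacency_matrix(n_vertex, edge_list):
    A = np.zeros((n_vertex, n_vertex))
    for u, v in edge_list:
        A[u, v] = A[v, u] = 1.0   # symmetric, unweighted
    return A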
Example #31
 def dbscan(self):
     # DBSCAN Clustering
     self.Clusters = DBSCAN(eps=0.3)
     self.clusterIndex = self.Clusters.fit_predict(self.values)
     #self.centers = self.DBSCANClusters.core_sample_indices_
     self.output()
Example #32
     clustering = KMeans(n_clusters=k, random_state=0).fit(zOut)
     listResult = clustering.predict(zOut)
 elif args.clustering_method == 'LouvainB':
     listResult, size = generateLouvainCluster(edgeList)
     k = len(np.unique(listResult))
     print('Louvain cluster: ' + str(k))
     k = int(k * resolution) if k > 3 else 2
     clustering = Birch(n_clusters=k).fit(zOut)
     listResult = clustering.predict(zOut)
 elif args.clustering_method == 'KMeans':
     clustering = KMeans(n_clusters=args.n_clusters,
                         random_state=0).fit(zOut)
     listResult = clustering.predict(zOut)
 elif args.clustering_method == 'SpectralClustering':
     clustering = SpectralClustering(n_clusters=args.n_clusters,
                                     assign_labels="discretize",
                                     random_state=0).fit(zOut)
     listResult = clustering.labels_.tolist()
 elif args.clustering_method == 'AffinityPropagation':
     clustering = AffinityPropagation().fit(zOut)
     listResult = clustering.predict(zOut)
 elif args.clustering_method == 'AgglomerativeClustering':
     clustering = AgglomerativeClustering().fit(zOut)
     listResult = clustering.labels_.tolist()
 elif args.clustering_method == 'AgglomerativeClusteringK':
     clustering = AgglomerativeClustering(
         n_clusters=args.n_clusters).fit(zOut)
     listResult = clustering.labels_.tolist()
 elif args.clustering_method == 'Birch':
     clustering = Birch(n_clusters=args.n_clusters).fit(zOut)
     listResult = clustering.predict(zOut)
    'SimplePP': SimplePPEncoder(),
    'CESAMOEncoder': CESAMOEncoder(),
    'CENG': CENGEncoder(verbose=0)
}
"""END: Import encoders"""

import random
"""START: Import models"""
try:
    from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering  # Birch DBSCAN
except ImportError:
    raise Exception('Scikit-Learn 0.22.2+ not available')

Models = {
    'K-Means': KMeans(n_clusters=n_clusters),
    'Spectral': SpectralClustering(n_clusters=n_clusters,
                                   eigen_solver='lobpcg'),
    'Agglomerative': AgglomerativeClustering(n_clusters=n_clusters)
}
#'DBSCAN': DBSCAN(eps=0.3, min_samples=15)}
"""END: Import models"""

# Performance evaluation function
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import calinski_harabasz_score as chs
from sklearn.metrics import silhouette_score as sil
import time


def performance(encoder, models, K):
Example #34
ax.matshow(a)
locs, labels = plt.xticks(range(size), empty)
plt.setp(labels, rotation=90)
plt.yticks(range(size), empty)
plt.show()

# In[24]:

from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score

x_score = []
y_score = []

for i in range(2, 10):  # get silhouette scores for n_clusters from 2 to 9
    tmp_clf = SpectralClustering(n_clusters=i, affinity='precomputed')
    tmp_clf.fit(a)
    score = silhouette_score(a, tmp_clf.labels_, metric='precomputed')
    x_score.append(i)
    y_score.append(score)

plt.subplots(figsize=(10, 10))
plt.plot(x_score, y_score)
plt.grid()
plt.show()

# In[25]:

clusters_count = 3
clusters = [[] for i in range(clusters_count)]
clf = SpectralClustering(n_clusters=clusters_count,
Example #35
def spectral_cluster(k, X):
    from sklearn.cluster import SpectralClustering
    y_pred = SpectralClustering(n_clusters=k, gamma=0.1).fit_predict(X)
    return y_pred
Example #36
    # Show the dataset
    sns.set()

    fig, ax = plt.subplots(figsize=(12, 8))

    ax.scatter(data[:, 0], data[:, 1])
    ax.set_xlabel(r'$x_0$', fontsize=14)
    ax.set_ylabel(r'$x_1$', fontsize=14)

    plt.show()

    # Perform the clustering
    km = KMeans(n_clusters=2, random_state=1000)
    sc = SpectralClustering(n_clusters=2,
                            affinity='rbf',
                            gamma=2.0,
                            random_state=1000)

    Y_pred_km = km.fit_predict(data)
    Y_pred_sc = sc.fit_predict(data)

    # Show the results
    fig, ax = plt.subplots(1, 3, figsize=(20, 6), sharey=True)

    ax[0].scatter(data[:, 0], data[:, 1], c='b', s=5)

    ax[1].scatter(data[Y_pred_sc == 0, 0],
                  data[Y_pred_sc == 0, 1],
                  marker='o',
                  s=5,
                  c='b',
Example #37
def test_clustering(df,
                    gmms,
                    title="",
                    save_to_file=False,
                    highlight_point=None):
    # preprocessing
    df_train = copy.deepcopy(df)
    df_train.drop('attack', axis=1, inplace=True)
    df_train.drop('difficulty', axis=1, inplace=True)

    # from about 30 dimension to 2 dimension
    proj = reduction.gmm_reduction(df_train, headers, gmms)
    cproj = copy.deepcopy(proj)

    # data_per_true_labels : try to make sort of dictionary per each label
    data_per_true_labels = []
    for i in range(len(attacks)):
        data_per_true_labels.append([])

    true_attack_types = df["attack"].values.tolist()
    for i, d in enumerate(cproj):
        data_per_true_labels[true_attack_types[i]].append(d)

    A = affinity.get_affinity_matrix(cproj,
                                     metric_method=distance.cosdist,
                                     knn=8)

    k = predict_k(A)
    print "supposed k : " + str(k)

    lim = int(len(df) * 0.01)
    lim = 12  # overrides the 1% heuristic above with a fixed value
    #    if lim < 3 or lim > 10 :
    #        lim = 10
    k = lim
    print("Total number of clusters : " + str(k))

    sc = SpectralClustering(n_clusters=k,
                            affinity="precomputed",
                            assign_labels="kmeans").fit(A)
    res = sc.labels_

    # cluster data set
    clusters = [0] * k
    clusters_data = []
    clusters_xmean = [-1] * k
    clusters_ymean = [-1] * k
    clusters_xstd = [-1] * k
    clusters_ystd = [-1] * k
    for i in range(k):
        clusters_data.append([])
    for i, p in enumerate(cproj):
        true_label = true_attack_types[i]
        if true_label == model.attack_normal:
            clusters[res[i]] = clusters[res[i]] + 1
        else:
            clusters[res[i]] = clusters[res[i]] - 1
        clusters_data[res[i]].append(p)

    # cluster recheck with density
    for i, cluster in enumerate(clusters):
        p = clusters_data[i]
        x = np.array([t[0] for t in p])
        y = np.array([t[1] for t in p])
        clusters_xmean[i] = np.mean(x)
        clusters_ymean[i] = np.mean(y)
        clusters_xstd[i] = np.std(x)
        clusters_ystd[i] = np.std(y)

    ds = []
    for i, cluster in enumerate(clusters):
        if cluster > 0:
            d = check_abnormal_with_density(clusters_xmean[i],
                                            clusters_ymean[i],
                                            clusters_xstd[i], clusters_ystd[i],
                                            len(clusters_data[i]))
            ds.append(d)
            if 0 > d:
                clusters[i] = -99999
        else:
            ds.append(None)
    print("ds")
    print(ds)
Example #38
# (array([21, 23, 25, 33, 37, 41], dtype=int64),)0
# (array([ 7,  9, 11, 13, 15, 17, 27, 29, 31], dtype=int64),)1
# (array([42, 44], dtype=int64),)2
# (array([18, 38, 46], dtype=int64),)3
# (array([ 5, 19, 43, 45, 47], dtype=int64),)4
# (array([ 6,  8, 10, 12, 14, 16], dtype=int64),)5
# (array([20, 22, 24, 32, 36, 40], dtype=int64),)6
# (array([28, 30, 34], dtype=int64),)7
# (array([ 0,  2,  4, 26], dtype=int64),)8
# (array([ 1,  3, 35, 39], dtype=int64),)9

x1,y1,z1=[],[],[]

for n_neighbors in (11,12,13,14,15,16,17,18,19,20):  # 4,5,6,7,8,9,10,11,12,13,14,15,16
    for k in (10,11,12,13,14,15,16,17,18,19,20):
        y_pred = SpectralClustering(affinity='nearest_neighbors', n_clusters=k, n_neighbors=n_neighbors).fit_predict(X)
        print("Calinski-Harabasz Score with n_neighbors=", n_neighbors, "n_clusters=", k, "score:", metrics.calinski_harabasz_score(X, y_pred))
        x1.append(n_neighbors)
        y1.append(k)
        z1.append(metrics.calinski_harabasz_score(X, y_pred))

print(x1,y1,z1)

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# create the figure, a 3D axes object, and a 3D scatter plot
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(x1,y1,z1)
class SpectralClusteringPrimitive(TransformerPrimitiveBase[Inputs, Outputs,
                                                           Hyperparams]):
    '''
        Primitive that applies the sklearn spectral clustering algorithm to unsupervised,
        supervised, or semi-supervised datasets.

        Training inputs: D3M dataframe with features and labels, and D3M indices

        Outputs: D3M dataframe with cluster predictions and D3M indices. Cluster labels are of "suggestTarget" semantic type if
        the task_type hyperparameter is clustering, and "Attribute" if the task_type is classification.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "d13a4529-f0ba-44ee-a867-e0fdbb71d6e2",
        'version':
        __version__,
        'name':
        "tsne",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering', 'Graph Clustering'],
        'source': {
            'name':
            __author__,
            'contact':
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.clustering.spectral_graph.SpectralClustering',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.SPECTRAL_CLUSTERING,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self.sc = SC(n_clusters=self.hyperparams['n_clusters'],
                     n_init=self.hyperparams['n_init'],
                     n_neighbors=self.hyperparams['n_neighbors'],
                     affinity=self.hyperparams['affinity'],
                     random_state=self.random_seed)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter
            For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe
        """

        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        X_test = inputs.drop(columns=list(inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = select_rows(inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sc_df = d3m_DataFrame(
            pandas.DataFrame(self.sc.fit_predict(X_test),
                             columns=['cluster_labels']))

        # attach metadata for the new cluster-label column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0),
                                               col_dict)

        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ),
                                               df_dict)

        return CallResult(inputs.append_columns(sc_df))
Example #40
import numpy as np
import scipy
import random
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

eight = np.array(([-3, -2, -2, -2, 1, 1, 2, 4], [0, 4, -1, -2, 4, 2, -4,
                                                 -3])).T
eight = eight[[7, 6, 2, 0, 3, 1, 5, 4], :]

random.seed(11)

sc = SpectralClustering(n_clusters=2,
                        eigen_solver="arpack",
                        affinity="rbf",
                        random_state=11).fit(eight)

scipy.linalg.eigh(sc.affinity_matrix_)

covm = np.cov(eight[np.where(sc.labels_ == 0)][:, 0],
              eight[np.where(sc.labels_ == 0)][:, 1])
eigva = np.sqrt(np.linalg.eig(covm)[0])
eigve = np.linalg.eig(covm)[1]

covm1 = np.cov(eight[np.where(sc.labels_ == 1)][:, 0],
               eight[np.where(sc.labels_ == 1)][:, 1])
eigva1 = np.sqrt(np.linalg.eig(covm1)[0])
eigve1 = np.linalg.eig(covm1)[1]

fig, ax = plt.subplots(figsize=(10, 10))
Example #41
if (algo == 'MiniBatch1000'):
    kmeans = MiniBatchKMeans(
        n_clusters=n_cluster,
        batch_size=1000,
    ).fit(word_embeddings)

    y_kmeans = kmeans.predict(word_embeddings)
    print(y_kmeans)
    pca(y_kmeans)

if (algo == 'Spectral'):
    clustering = SpectralClustering(n_clusters=n_cluster,
                                    assign_labels="discretize",
                                    random_state=0).fit(word_embeddings)
    labels = clustering.labels_
    print(labels)
    pca(labels)

if (algo == 'Agglomerative'):
    cluster = AgglomerativeClustering(n_clusters=n_cluster,
                                      affinity='euclidean',
                                      linkage='ward')
    cluster.fit_predict(word_embeddings)
    labels = cluster.labels_
    pca(labels)

if (algo == 'BIRCH'):
    brc = Birch(n_clusters=n_cluster)
Example #42
class PCA_and_Spectral():

    # Create adjacency matrix
    with open('/Users/kat/Desktop/Kaggle/Graph.csv', 'r') as csvfile1:  # text mode for csv in Python 3
        graphreader = csv.reader(csvfile1, delimiter=' ', quotechar='|')
        adjgraph = np.empty((6000, 6000))
        adjgraph.fill(0)
        for row in graphreader:
            arr = row[0].split(",")
            adjgraph[int(arr[0]) - 1][int(arr[1]) - 1] = 1
            adjgraph[int(arr[1]) - 1][int(arr[0]) - 1] = 1

    # Get features data into newEF matrix
    with open('/Users/kat/Desktop/Kaggle/Extracted_features.csv',
              'r') as csvfile3:
        EF = csv.reader(csvfile3, delimiter=' ', quotechar='|')
        newEF = []
        for row in EF:
            arr = row[0].split(",")
            arr2 = np.asarray(arr)
            arr3 = arr2.astype(float)  # np.float was removed from NumPy
            newEF.append(arr3)

    # PCA reduce features data to 800 dim (instead of 1084)
    pca = PCA(n_components=800)
    red_pca = pca.fit_transform(newEF)

    # spectral clustering on adjacency matrix
    spectral = SpectralClustering(10, affinity="precomputed")
    new_plot = spectral.fit_predict(
        adjgraph)  #6000 x 1 Array with cluster labels

    matching = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }

    # get cluster matchings for first 60 points
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'r') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            findClust = new_plot[int(arr[0]) - 1]
            matching[int(arr[1])].append(
                [int(arr[0]), red_pca[int(arr[0]) - 1], findClust])

    clusters = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }

    # Get points of each cluster
    for i in range(1, 6001):
        findClust = new_plot[i - 1]
        clusters[findClust].append(red_pca[i - 1])

    #for i in range(10):
    #    print "item is " + str(i)
    #    for item in matching[i]:
    #        print item[2]

    finalmatches = {0: 5, 1: 2, 2: 9, 3: 3, 4: 6, 5: 8, 6: 7, 7: 4, 8: 1, 9: 0}

    # match clusters to digits
    adjustedcluster = {}
    for i in range(10):
        index = finalmatches[i]
        adjustedcluster[i] = clusters[index]

    filtered_features = []
    filtered_features_idx = []

    # save clusters for digits 1 and 6
    cluster_2_digit_1 = adjustedcluster[1]
    cluster_7_digit_6 = adjustedcluster[6]

    # filter out clusters for digits 1 and 6
    for i in range(len(new_plot)):
        if not new_plot[i] == 2 and not new_plot[i] == 7:
            filtered_features.append(red_pca[i])
            filtered_features_idx.append(i + 1)

    centroids_pca_8_clusters = []

    # get initial centroids of the 8 digits based on seed
    for i in range(10):
        newarray = []
        if not i == 1 and not i == 6:
            for j in range(len(matching[i])):
                newarray.append(np.asarray(matching[i][j][1]))
            newa = np.asarray(newarray)
            centroids_pca_8_clusters.append(newa.mean(axis=0))

    centroids_pca_8_clusters = np.asarray(centroids_pca_8_clusters)

    # do kmeans to clean up 8 clusters
    kmeans_8 = KMeans(
        n_clusters=8,
        init=centroids_pca_8_clusters).fit_predict(filtered_features)

    kmeans_matching = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }

    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'r') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            if not int(arr[1]) == 1 and not int(arr[1]) == 6:
                try:
                    idx = filtered_features_idx.index(int(arr[0]))
                    kmeans_matching[int(arr[1])].append(
                        [int(arr[0]), red_pca[int(arr[0]) - 1], kmeans_8[idx]])
                except ValueError:
                    pass

    #for i in range(10):
    #    print "item is " + str(i)
    #    for j in range(len(kmeans_matching[i])):
    #        print kmeans_matching[i][j][2]

    clusters_kmeans = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }

    # Get points of each cluster from kmeans
    for i in range(len(kmeans_8)):
        findClust = kmeans_8[i]
        idx = filtered_features_idx[i]
        clusters_kmeans[findClust].append(red_pca[idx - 1])

    finalmatches_kmeans = {0: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5, 8: 6, 9: 7}

    # match clusters to digits
    adjustedcluster_kmeans = {}
    for i in range(10):
        if not i == 1 and not i == 6:
            index = finalmatches_kmeans[i]
            adjustedcluster_kmeans[i] = clusters_kmeans[index]

    # get features of digit 1 from spectral
    cluster_2_digit_1 = np.asarray(cluster_2_digit_1)
    digit_1_centroid = cluster_2_digit_1.mean(axis=0)

    # get features of digit 6 from spectral
    cluster_7_digit_6 = np.asarray(cluster_7_digit_6)
    digit_6_centroid = cluster_7_digit_6.mean(axis=0)

    cluster_centers = []

    # calculate the cluster center for each cluster (digit)
    for i in range(10):
        if i == 1:
            cluster_centers.append(digit_1_centroid)
        elif i == 6:
            cluster_centers.append(digit_6_centroid)
        else:
            newa = np.asarray(adjustedcluster_kmeans[i])
            cluster_centers.append(newa.mean(axis=0))

    finalclusters = [[0 for i in range(2)] for j in range(4001)]
    finalclusters[0][0] = 'Id'
    finalclusters[0][1] = 'Label'

    for i in range(1, 4001):
        finalclusters[i][0] = 6000 + i
        newdist = []
        for j in range(10):
            newdist.append(
                dist.euclidean(red_pca[i + 5999], cluster_centers[j]))
        label = np.argmin(newdist)
        finalclusters[i][1] = label

    with open('submission10.csv', "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(finalclusters)
"""
Created on Fri Jan 08 17:34:11 2016

@author: wu34
"""

from sklearn.cluster import SpectralClustering
import visSimilarityMat
import utilise

Domain = ['DietType', 'ActType']

# dist selects the similarity measure; the default is TFIDFCosin
# jaccard, novelJaccard, TFIDFCosin, TFIDFEclud, TFCosin, TFEclud
dist = 'TFEclud'
for domain in Domain:
    dietSimilarity_dict = {}
    if domain == 'DietItem':
        Similarity_dict = utilise.SimilarityDict(domain, dist)
    elif domain == 'ActItem':
        Similarity_dict = utilise.SimilarityDict(domain, dist)
    elif domain == 'DietType':
        Similarity_dict = utilise.SimilarityDict(domain, dist)
    elif domain == 'ActType':
        Similarity_dict = utilise.SimilarityDict(domain, dist)
    X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    af = SpectralClustering(affinity="precomputed").fit(X)
    labels = af.labels_
    print(labels)
Example #44
y_ID_2 = []

# generate X_train using X_id
#ID_list = df.iloc[:, 0]
ID_bipart = df_bipart.iloc[:, 0].astype(str)  # .ix was removed in pandas 1.0
data_bipart = df_bipart.iloc[:, 1:]
print('number of nodes ' + str(len(ID_bipart)))

# need to determine how to choose num_group
num_group = num_cluster(data_bipart)
#num_group = 5
print('number of groups ' + str(num_group))
#kmeans_bipart = KMeans(n_clusters=num_group, random_state=0).fit(data_bipart)
#labels_bipart = kmeans_bipart.labels
random.seed(17)
labels_bipart = SpectralClustering(num_group, gamma=0.7,
                                   affinity='rbf').fit(data_bipart).labels_
# get ktruth's group
#k_groups = kTruth_groups (ID_bipart, labels_bipart, kTruth)

# add new column group to data
df_bipart['group'] = labels_bipart
#############
#group data by their 'group'
#df_bipart = df_bipart.sort_values('group')

#divide by group i
# @i means i is a variable in group
global_y = 0
global_len = 0
global_truth = []
global_fitted = []
Example #45
kmed = KMedoids(2).fit_predict(data)

# In[10]:

plt.scatter(data[:, 0], data[:, 1], c=kmed, s=5, cmap="autumn")

# # Spectral Clustering Algorithm

# In[11]:

from sklearn.cluster import SpectralClustering

# In[12]:

clust = SpectralClustering(2).fit_predict(data)

# In[13]:

plt.scatter(data[:, 0], data[:, 1], c=clust, s=5, cmap="autumn")

# * Can we estimate k?
#     * No: Affinity Propagation
#     * Yes: Can we use Euclidean distance?
#         * Yes: K-Means
#         * No: Do we need central (medoid) values?
#             * Yes: K-Medoids
#             * No: Are the data linearly separable?
#                 * Yes: Agglomerative clustering
#                 * No: Spectral clustering
# (a code sketch of this decision tree follows below)
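A code sketch of the decision tree above (my own mapping of the comments to sklearn estimators, not the author's; KMedoids comes from the scikit-learn-extra package):

from sklearn.cluster import (AffinityPropagation, KMeans,
                             AgglomerativeClustering, SpectralClustering)

def choose_clustering(k_known, euclidean_ok, need_medoids, linearly_separable, k=2):
    if not k_known:
        return AffinityPropagation()
    if euclidean_ok:
        return KMeans(n_clusters=k)
    if need_medoids:
        from sklearn_extra.cluster import KMedoids  # scikit-learn-extra
        return KMedoids(n_clusters=k)
    if linearly_separable:
        return AgglomerativeClustering(n_clusters=k)
    return SpectralClustering(n_clusters=k)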
Example #46
    data1 = np.vstack((np.cos(t), np.sin(t))).T
    data2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T
    data3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T
    data = np.vstack((data1, data2, data3))

    n_clusters = 3
    m = euclidean_distances(data, squared=True)

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle('Spectral Clustering', fontsize=20)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))
    for i, s in enumerate(np.logspace(-2, 0, 6)):
        print(s)
        af = np.exp(-m**2 / (s**2)) + 1e-6
        model = SpectralClustering(n_clusters=n_clusters,
                                   affinity='precomputed',
                                   assign_labels='kmeans',
                                   random_state=1)
        y_hat = model.fit_predict(af)
        plt.subplot(2, 3, i + 1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0],
                        data[cur, 1],
                        s=40,
                        c=clr,
                        edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing


newsgroups_train = fetch_20newsgroups(subset='train')
labels = newsgroups_train.target

vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')
X = vectorizer.fit_transform(newsgroups_train.data)
Y = preprocessing.normalize(X, norm='l1', axis=1, copy=True, return_norm=False)

#-------------- Kernelized k-means (via spectral clustering) -----------------
km = SpectralClustering(n_clusters=20, gamma=0.01, affinity='rbf')
km.fit(Y)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=1000))

#Performance
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print("NMI:%0.3f" % metrics.normalized_mutual_info_score(labels,km.labels_))
print("AMI:%0.3f" %metrics.adjusted_mutual_info_score(labels,km.labels_))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("FMI:%0.3f" % metrics.fowlkes_mallows_score(labels, km.labels_))
Example #48
for i in [0,2,4,5]:
    le = preprocessing.LabelEncoder()
    le.fit(traindata3.iloc[:,i])
    #print(le.classes_)
    traindata3.iloc[:,i]=le.transform(traindata3.iloc[:,i])
#print(traindata3.head())
for i in [0,2,4,5]:
    le = preprocessing.LabelEncoder()
    le.fit(traindata4.iloc[:,i])
    #print(le.classes_)
    traindata4.iloc[:,i]=le.transform(traindata4.iloc[:,i])
#print(traindata4.head())

data=pd.concat([traindata1,traindata2,traindata3,traindata4])

model=SpectralClustering(n_clusters=2)
model.fit(data)
labels=model.labels_

data=data.assign(label=labels)

X=data.iloc[:,0:6]
Y=data.iloc[:,6]
#print(X.head())
#print(Y.head())
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
traindata = np.array(X)
trainlabel = np.array(Y)
traindata, testdata, trainlabel, testlabel = model_selection.train_test_split(traindata,trainlabel , test_size=0.5)
#print(testdata.shape)
Example #49
    savemat('data_' + str(n) + '.mat', {
        'train_x': train_x,
        'train_y': train_y
    })
    os.chdir('../')

    ## Perform KMeans
    km = KMeans(n_clusters=nClass, init='k-means++', n_init=10)
    ypred = km.fit_predict(train_x)
    nmi_km[n] = metrics.adjusted_mutual_info_score(train_y, ypred)
    ari_km[n] = metrics.adjusted_rand_score(train_y, ypred)

    ## Perform spectral clustering
    sc = SpectralClustering(n_clusters=nClass,
                            n_init=10,
                            gamma=0.1,
                            affinity='rbf',
                            assign_labels='kmeans')
    ypred = sc.fit_predict(train_x)
    nmi_sc[n] = metrics.adjusted_mutual_info_score(train_y, ypred)
    ari_sc[n] = metrics.adjusted_rand_score(train_y, ypred)

    train_set = train_x, train_y
    dataset = [train_set, train_set, train_set]

    f = gzip.open('toy.pkl.gz', 'wb')
    cPickle.dump(dataset, f, protocol=2)
    f.close()
    ## Perform non-joint SAE+KM
    nmi_nj[n], ari_nj[n] = test_SdC_NJ(lbd=0,
                                       finetune_lr=.01,
from kernels.dataset_generators import Generator

np.random.seed(0)

nsamples = 100
X, y = Generator.generate(dataset_name="manual_circles", n_samples=nsamples)
reds = y == 0
blues = y == 1

# My KKmeans
kkm_model_rbf = KernelKMeans(n_clusters=2, max_iter=200, kernel="rbf")
kkm_clusters = kkm_model_rbf.fit(X)

# scikit-learn spectral clustering
spectrual_clusters = SpectralClustering(n_clusters=2,
                                        affinity='nearest_neighbors',
                                        assign_labels='kmeans').fit_predict(X)
print('y: ', y)
plt.figure()
plt.subplot(3, 1, 1)
plt.title("Original Datasset")
plt.scatter(X[:, 0], X[:, 1], s=15, linewidth=0, c=y, cmap='flag')
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

plt.subplot(3, 1, 2)
plt.title("Kkmeans(rbf, gamma =0.1, clusters =2)")
plt.scatter(X[:, 0], X[:, 1], s=15, linewidth=0, c=kkm_clusters, cmap='flag')
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
Example #51

correction(X_train, y_train, km)

# In[63]:

from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering

# In[64]:

algorithms = []
algorithms.append(KMeans(n_clusters=2, random_state=1))
algorithms.append(AffinityPropagation())
algorithms.append(
    SpectralClustering(n_clusters=2,
                       random_state=1,
                       affinity='nearest_neighbors'))
algorithms.append(AgglomerativeClustering(n_clusters=2))

# In[67]:

data = []
for algo in algorithms:
    algo.fit(X_train)
    data.append(({
        'ARI':
        metrics.adjusted_rand_score(y_train, algo.labels_),
        'AMI':
        metrics.adjusted_mutual_info_score(y_train, algo.labels_),
        'Homogeneity':
        metrics.homogeneity_score(y_train, algo.labels_),
t0 = time()
X_lle = clf.fit_transform(X_train_normalized)
plot_embedding_v2(X_lle, X_train, "LLE (time %.2fs)" % (time() - t0))


### CLUSTERING ###

X_iso = pd.DataFrame(X_iso) 
print(X_iso)

print(X_iso.columns)



# Building the clustering model
spectral_model = SpectralClustering(n_clusters=5, affinity='nearest_neighbors')
# Training the model and storing the predicted cluster labels
labels_sp = spectral_model.fit_predict(X_iso)

plt.scatter(X_iso.iloc[:,0] , X_iso.iloc[:,1], c=labels_sp, cmap = 'rainbow')
plt.show()


X_iso = pd.DataFrame(X_iso)
df_labels = df_origin.iloc[1:, 0].values


print(df_labels)
print(df_labels[0])

Example #53
             metrics.silhouette_score(X, labels,
                                      metric='euclidean'),
             ))

#**************************error analysis**************************************
from sklearn.metrics.cluster import contingency_matrix
x = labels #actual labels
y = clusters #predicted labels
error_analysis = contingency_matrix(x, y)


#**************************Plot************************************************
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
#
#from sklearn.datasets import make_moons
#X,y = make_moons(200, noise=.05, random_state=0)
#
#labels = KMeans(2, random_state=0).fit_predict(X)
#plt.scatter(X[:,0], X[:, 1], c=labels, s=50,cmap='viridis');

from sklearn.datasets import make_moons
X,Y = make_moons(200, noise=.05, random_state=0)
from sklearn.cluster import SpectralClustering
model = SpectralClustering(n_clusters=10, affinity='nearest_neighbors',
                           assign_labels='kmeans')
plottinglabels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=plottinglabels,
            s=50, cmap='viridis');
Example #54
adjacency_matrix[2, [0, 1, 5, 4]] = 1

# cluster 2
adjacency_matrix[6, [1, 9]] = 1

# cluster 3
adjacency_matrix[9, [12, 13]] = 1
adjacency_matrix[13, [9, 11, 12]] = 1

transp = np.transpose(adjacency_matrix)
print(np.where(adjacency_matrix - transp))
print(adjacency_matrix)

nb_datapoints = adjacency_matrix.shape[0]
dataset = [x for x in range(nb_datapoints)]

# CHANGE HERE
# choose a relevant number of clusters
nb_clusters = 2

sc = SpectralClustering(nb_clusters, affinity='precomputed')

# apply the Spectral Clustering to the adjacency matrix
sc.fit_predict(adjacency_matrix)

# print the clusters
for cluster_index in range(nb_clusters):
    cluster = np.where(sc.labels_ == cluster_index)[0]
    print("cluster {}".format(cluster_index))
    print(cluster)
Example #55
        import numpy as np
        from sklearn.cluster import SpectralClustering
        fragment_names = list(fragments.keys())
        nfragments = len(fragment_names)
        n_clusters = 30
        affinity_matrix = np.ones([nfragments, nfragments], np.float32)
        for i in range(nfragments):
            shapeFunc = oeshape.OEAnalyticShapeFunc()
            shapeFunc.SetupRef(fragments[fragment_names[i]])
            result = oeshape.OEOverlapResults()
            for j in range(i+1,nfragments):
                shapeFunc.Overlap(fragments[fragment_names[j]], result)
                overlap = result.GetTanimoto()
                affinity_matrix[i,j] = overlap
                affinity_matrix[j,i] = overlap
        clustering = SpectralClustering(n_clusters=n_clusters, affinity='precomputed').fit(affinity_matrix)
        unique_fragment_names = [ index for index in range(n_clusters) ]
        for fragment_index, cluster_index in enumerate(clustering.labels_):
            unique_fragment_names[cluster_index] = fragment_names[fragment_index]
        fragments = { fragment_name : fragments[fragment_name] for fragment_name in unique_fragment_names }

    print('Computing overlap scores...')
    OVERLAP_THRESHOLD = 0.4
    molecules = list()
    directories = glob('Files/x*')
    max_overlap = 0.0
    for directory in tqdm(directories):
        _, docked_fragment = os.path.split(directory)
        with oechem.oemolistream(os.path.join(directory, 'poses.mol2')) as ifs:
            molecule = oechem.OEGraphMol()
            index = 1
Example #56
        classes = dataset[:, 0]
        dataset = np.delete(dataset, 0, axis=1)
        dataset = np.asarray(dataset, dtype=float)  # np.float was removed from NumPy
    else:
        classes = dataset[:, -1]
        dataset = np.delete(dataset, -1, axis=1)
        dataset = np.asarray(dataset, dtype=float)
    return dataset, classes


dataset, classes = loadData(filepath="./BERT/ATT_DPTC.txt",
                            has_id=None,
                            class_position='last')

spectral = SpectralClustering(n_clusters=len(set(classes)),
                              affinity="nearest_neighbors",
                              n_neighbors=10,
                              gamma=2.0)
pred_y = spectral.fit_predict(dataset)
print(pred_y)

classify = defaultdict(list)
for k, va in [(v, i) for i, v in enumerate(pred_y)]:
    classify[k].append(va)
classify = dict(classify)
print(classify)

# accuracy
acc = 0
for i in classify.values():
    acc += Counter(np.array(classes)[i]).most_common(1)[0][1]
print("准确率:%.10f" % (acc / len(pred_y)))
Example #57
 def spectral(self):
     # Spectral Clustering
     self.Clusters = SpectralClustering(n_clusters=self.k,
                                        affinity='nearest_neighbors')
     self.clusterIndex = self.Clusters.fit_predict(self.values)
     self.output()
fig.scatter(X[y_pred == 3, 0],
            X[y_pred == 3, 1],
            X[y_pred == 3, 2],
            s=20,
            c='black',
            marker='o',
            label='Cluster4')

# Accuracy of 91

# # Using Spectral Clustering on links Dataset

# In[37]:

clustering = SpectralClustering(n_clusters=4,
                                assign_labels="discretize",
                                random_state=0).fit(Y_link)
y_pred = clustering.labels_

# In[40]:

np.save('result_94.2.npy', y_pred)

# In[41]:

X = Y_link
f = plt.figure(1, figsize=(14, 14))
fig = f.add_subplot(1, 1, 1, projection='3d')
fig.scatter(X[y_pred == 0, 0],
            X[y_pred == 0, 1],
            X[y_pred == 0, 2],
        float(row[9]),
        float(row[10]),
        float(row[11]),
        float(row[12]),
        float(row[13])
    ])
    #X.append([float(row[2]), float(row[3])])
    y.append(row[0])
#print(X)
#print(y)

# In[103]:

from sklearn.cluster import SpectralClustering
y_pred = SpectralClustering(n_clusters=3,
                            affinity='poly',
                            degree=2,
                            gamma=0.0000955).fit_predict(X)
from sklearn import metrics
print("Adjusted Rand index", metrics.adjusted_rand_score(y, y_pred))
print("Mutual Information based scores", metrics.adjusted_mutual_info_score(y, y_pred))
print("V-measure", metrics.v_measure_score(y, y_pred))
print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(X, y_pred))

result = y_pred
print(y_pred)

# In[96]:

colors = []
colors.append('red')
def cluster(img, groups):
    normalizedimg = Utils.normalize(img)
    spectral = SpectralClustering(n_clusters=groups, eigen_solver='amg')  # 'amg' requires the pyamg package

    grp = spectral.fit_predict(normalizedimg)
    return grp