Example #1
def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a clustering method (either "kmeans" or "spectral"),
    make a plot of all word vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)

    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:,0], X_transf[:,1],  rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
Example #2
def spectral_clustering2(similarity, concepts=2, euclid=False):
    if euclid:
        model = SpectralClustering(n_clusters=concepts, affinity='nearest_neighbors')
        return model.fit_predict(similarity)
    else:
        model = SpectralClustering(n_clusters=concepts, affinity='precomputed')
        similarity[similarity < 0] = 0
        return model.fit_predict(similarity)
	def run(self, k):
		if self.data_is_kernel:
			clf = SpectralClustering(n_clusters=k, gamma=self.gammav, affinity='precomputed')	
			self.allocation = clf.fit_predict(self.X)
			self.kernel = self.X
		else:
			clf = SpectralClustering(n_clusters=k, gamma=self.gammav)		#, affinity='precomputed'
			self.allocation = clf.fit_predict(self.X)
			self.kernel = clf.affinity_matrix_
	
		return self.allocation
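
# A hedged usage sketch (not from the projects above) of the precomputed-affinity
# pattern both snippets rely on: SpectralClustering(affinity='precomputed')
# expects a square, non-negative similarity matrix.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import cosine_similarity

feats = np.random.rand(100, 16)        # hypothetical feature vectors
sim = cosine_similarity(feats)         # (100, 100) similarity matrix
sim[sim < 0] = 0                       # affinities must be non-negative
demo_labels = SpectralClustering(n_clusters=4, affinity='precomputed').fit_predict(sim)
print(demo_labels[:10])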
Example #4
    def compute_centroid_set(self, **kwargs):

        INPUT_ITR = subset_iterator(X=self.docv, m=self.subcluster_m, repeats=self.subcluster_repeats)

        kn = self.subcluster_kn
        clf = SpectralClustering(n_clusters=kn, affinity="precomputed")

        C = []

        for X in INPUT_ITR:
            # Remove any rows that have zero vectors
            bad_row_idx = (X ** 2).sum(axis=1) == 0
            X = X[~bad_row_idx]
            A = cosine_affinity(X)

            labels = clf.fit_predict(A)

            # Compute the centroids
            (N, dim) = X.shape
            centroids = np.zeros((kn, dim))

            for i in range(kn):
                idx = labels == i
                mu = X[idx].mean(axis=0)
                mu /= np.linalg.norm(mu)
                centroids[i] = mu

            C.append(centroids)

        return np.vstack(C)
Example #5
def spectral_clustering(matrix, N):
    spectral = SpectralClustering(n_clusters=N)
    clusters = spectral.fit_predict(matrix)
    res = [[] for _ in range(N)]
    for i, c in enumerate(clusters):
        res[c].append(i)
    return res
def create_word2vec_cluster(word2vec_model):
    word_vectors = word2vec_model.syn0
    num_clusters = word_vectors.shape[0] // 1000  # integer division: n_clusters must be an int
    spectral_cluster_model = SpectralClustering(n_clusters=num_clusters)
    idx = spectral_cluster_model.fit_predict(word_vectors)
    pickle.dump(spectral_cluster_model, open(r"C:\Ofir\Tau\Machine Learning\Project\project\k_means_model.pkl", "wb"))
    return spectral_cluster_model
def spectral_clustering(k, X, G, W=None, run_times=5):
    if W is None:
        W = np.eye(len(X))
    W2 = np.sqrt(W)
    Gtilde = W2.dot(G.dot(W2))
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(Gtilde)
    return zh
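
# A hedged usage sketch for the weighted variant above (the data and the RBF
# kernel choice are assumptions, not from the original project): G is a
# precomputed Gram/affinity matrix and W holds optional per-point weights.
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X_demo = np.random.rand(50, 4)
G_demo = rbf_kernel(X_demo)            # affinity (Gram) matrix
W_demo = np.eye(len(X_demo))           # uniform weights; swap in real weights
zh_demo = spectral_clustering(4, X_demo, G_demo, W=W_demo, run_times=5)
print(np.bincount(zh_demo))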
Example #8
def get_coregulatory_states(corr_matrices, similarity_matrix, n_clusters):
    spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = spectral.fit_predict(similarity_matrix)

    coreg_states = {}
    for ci in np.unique(labels):
        coreg_states[ci] = corr_matrices[labels == ci, :, :].mean(axis=0)
    return coreg_states, labels
Example #9
def dist_spectral(x, y):

    plot = []
    for s in range(dataset.shape[0]):
        plot.append(np.array([x[s], y[s]]))
    plot = np.array(plot)
    spectral = SpectralClustering(n_clusters=3, eigen_solver='arpack', affinity="nearest_neighbors")
    clusters = spectral.fit_predict(plot)
    return clusters
def spectral(k, X, G, run_times=10):
    """Spectral clustering from sklearn library. 
    run_times is the number of times the algorithm is gonna run with different
    initializations.
    
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(G)
    return zh
Example #11
def spectral_clustering(S,X,config):
    '''
    Computes spectral clustering from an input similarity matrix.
    Returns the labels associated with the clustering.
    '''
    from sklearn.cluster import SpectralClustering

    nk = int(config["n_clusters"])
    clf = SpectralClustering(affinity='cosine',n_clusters=nk)
    return clf.fit_predict(X)
Example #12
def cluster_faces_CNN(name = '9_8913259@N03', img_list = 'faces_list.txt'):
    root = '/Users/wangyufei/Documents/Study/intern_adobe/face_recognition_CNN/'+name + '/'
    f = open(root + model_name + 'similarity_matrix.cPickle','r')
    affinity_matrix = cPickle.load(f)
    f.close()

    f = SpectralClustering(affinity='precomputed', n_clusters=min(8, affinity_matrix.shape[0] - 1), eigen_solver = 'arpack', n_neighbors=min(5, affinity_matrix.shape[0]))
    a = f.fit_predict(affinity_matrix)

    groups = {}
    temp = zip(a, xrange(len(a)))
    for i in temp:
        if i[0] not in groups:
            groups[i[0]] = [i[1]]
        else:
            groups[i[0]].append(i[1])
    unique_person_id = []
    for kk in groups:
        min_similarity = np.Inf
        max_similarity = -np.Inf
        mean_similarity = 0
        this_group_ids = groups[kk]
        for j in xrange(len(this_group_ids)):
            for i in xrange(j+1, len(this_group_ids)):
                temp = affinity_matrix[this_group_ids[i],this_group_ids[j]]
                if temp < min_similarity:
                    min_similarity = temp
                if temp > max_similarity:
                    max_similarity = temp
                mean_similarity += temp
        mean_similarity /= max(1, len(this_group_ids)*(len(this_group_ids) - 1) / 2)
        print len(this_group_ids), mean_similarity, max_similarity, min_similarity
        if mean_similarity > 0.5:
            unique_person_id.append(kk)
    important_person = []
    for i in unique_person_id:
        important_person.append([i, len(groups[i])])
    important_person.sort(key = lambda x:x[1], reverse=True)
    in_path = root + img_list
    imgs_list = []
    with open(in_path, 'r') as data:
        for line in data:
            line = line[:-1]
            imgs_list.append(line.split('/')[-1])

    temp = zip(a, imgs_list)
    face_groups = {}
    for i in temp:
        if i[0] not in face_groups:
            face_groups[i[0]] = [i[1]]
        else:
            face_groups[i[0]].append(i[1])

    create_face_group_html_CNN(name, face_groups, important_person)
def spectral(k, X, G, z, run_times=10):
    """Spectral clustering from sklearn library. 
    run_times is the number of times the algorithm is gonna run with different
    initializations.
    
    """
    sc = SpectralClustering(k, affinity='precomputed', n_init=run_times)
    zh = sc.fit_predict(G)
    a = metric.accuracy(z, zh)
    v = metric.variation_information(z, zh)
    return a, v
def spectral_clustering(crime_rows, column_names, num_clusters, affinity='rbf', n_neighbors=0,
        assign_labels='kmeans'):
    """
        n_clusters : integer, optional
            The dimension of the projection subspace.
        affinity : string, array-like or callable, default ‘rbf’
            If a string, this may be one of ‘nearest_neighbors’, ‘precomputed’, ‘rbf’ 
            or one of the kernels supported by sklearn.metrics.pairwise_kernels.
            Only kernels that produce similarity scores 
                (non-negative values that increase with similarity) should be used. 
                This property is not checked by the clustering algorithm.
        gamma : float
            Scaling factor of RBF, polynomial, exponential chi^2 and sigmoid affinity kernel. 
            Ignored for affinity='nearest_neighbors'.
        degree : float, default=3
            Degree of the polynomial kernel. Ignored by other kernels.
        coef0 : float, default=1
            Zero coefficient for polynomial and sigmoid kernels. Ignored by other kernels.
        n_neighbors : integer
            Number of neighbors to use when constructing the affinity matrix 
            using the nearest neighbors method. Ignored for affinity='rbf'.
        n_init : int, optional, default: 10
            Number of time the k-means algorithm will be run with different 
                centroid seeds. 
            The final results will be the best output of n_init consecutive runs in 
                terms of inertia.
        assign_labels : {‘kmeans’, ‘discretize’}, default: ‘kmeans’
            The strategy to use to assign labels in the embedding space. 
            There are two ways to assign labels after the laplacian embedding. 
            k-means can be applied and is a popular choice. 
            But it can also be sensitive to initialization. 
            Discretization is another approach which is less sensitive to 
            random initialization.
        kernel_params : dictionary of string to any, optional
            Parameters (keyword arguments) and values for kernel passed 
                as callable object. Ignored by other kernels.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    #crime_xy = [crime[1:] for crime in crime_rows]
    spectral_clustering = SpectralClustering(
            n_clusters=num_clusters, 
            affinity=affinity, 
            n_neighbors=n_neighbors, 
            assign_labels=assign_labels)
    print("Running spectral clustering....")
    print("length crimexy")
    print(len(crime_xy))
    spectral_clustering_labels = spectral_clustering.fit_predict(
            random_sampling(crime_xy, num_samples=3000))
    print("Formatting......")
    return _format_clustering(spectral_clustering_labels, crime_xy, crime_info,
            column_names, num_clusters=num_clusters)
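
# A small synthetic sketch (an addition; blobs data, not the crime rows used above)
# illustrating the assign_labels choice the docstring describes: 'kmeans' vs
# 'discretize' label assignment in the spectral embedding space.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.datasets import make_blobs

pts, _ = make_blobs(n_samples=300, centers=3, random_state=0)
for strategy in ('kmeans', 'discretize'):
    strat_labels = SpectralClustering(n_clusters=3, affinity='rbf',
                                      assign_labels=strategy).fit_predict(pts)
    print(strategy, np.bincount(strat_labels))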
Example #15
def predictSpectralClustering(X, y, n=2, val='rbf'):
	ranX, ranY = shuffle(X, y, random_state=0)
	X = X[:600,]
	y = y[:600,]
	sc = SpectralClustering(n_clusters=n)
	results = sc.fit_predict(X)
	gini = compute_gini(results)
	if n == 2:
		same = calculate_score(results, y)
		opp = calculate_score(results, y, True)
		return (results, max(same, opp), gini)
	else:
		return (results, 0, gini)
Example #16
def spectral_clustering(vectors: list, num_rows, k):
    matrix = []
    ## num_rows X len(vectors)
    for s in range(num_rows):
        row = []
        for v in vectors:
            row.append(v[s])
        matrix.append(np.array(row))

    matrix = np.array(matrix)

    spectral = SpectralClustering(n_clusters=k, eigen_solver='arpack', affinity="nearest_neighbors")
    clusters = spectral.fit_predict(matrix)
    return clusters
Example #17
    def _small_partition(self, data):
        _logger.debug("Running _small_partition on %s observations", len(data))

        similarity = self._get_similarity(data, sparse = self.sparse_similarity)
        _logger.debug("Spectral clustering")
        spc_obj = SpectralClustering(n_clusters = 2, affinity = 'precomputed',
            assign_labels = 'discretize')
        partition = spc_obj.fit_predict(similarity)
        _logger.debug("Done spectral clustering")

        sizes = [len(partition[partition == x]) for x in [0, 1]]
        _logger.debug("Result of _small_partition: #0: {}, #1: {}" \
            .format(*sizes))

        return partition
Example #18
def compute_spectral_clustering(n_vertex, edge_list, n_clusters):

    from sklearn.cluster import SpectralClustering

    clst = SpectralClustering(n_clusters, affinity="precomputed")

    adjacency_matrix = tf.compute_adjacency_matrix(n_vertex, edge_list)

    t = time.time()
    labels = clst.fit_predict(adjacency_matrix, n_clusters)
    exectime = time.time() - t

    labels = tf.compute_normal_labels(labels)

    clusters = tf.compute_clusters_from_labels(labels)

    return labels, clusters, exectime
Example #19
def bench_cluster(X, y, pca_n_comp):
    n = len(np.unique(y))
    pca = PCA(pca_n_comp)
    X_ = pca.fit_transform(X)
    sc = SpectralClustering(n)
    km = KMeans(n)
    sc_pred = sc.fit_predict(X_)
    km_pred = km.fit_predict(X_)
    distances = PairwiseDistances(X_.tolist())
    distances = ExplicitDistances(distances)
    singlel_pred = fcluster(linkage(ssd.squareform(distances.distances)), n, criterion='maxclust')
    print "single-linkage clustering prediction:", singlel_pred
    print "single-linkage clustering score:", adjusted_rand_score(y, singlel_pred), mutual_info_score(y, singlel_pred)
    print "spectral clustering prediction:", sc_pred
    print "spectral clustering score:", adjusted_rand_score(y, sc_pred), mutual_info_score(y, sc_pred)
    print "kmeans clustering prediction", km_pred
    print "kmeans clustering score:", adjusted_rand_score(y, km_pred), mutual_info_score(y, km_pred)
    print "ground truth labels", y
Example #20
File: preql.py  Project: adammendoza/loom
    def cluster(
            self,
            rows_to_cluster=None,
            seed_rows=None,
            cluster_count=None,
            nearest_neighbors=10):
        if seed_rows is None:
            seed_rows = self._query_server.sample(
                [None for _ in self.feature_names],
                sample_count=SAMPLE_COUNT)
        row_limit = len(seed_rows) ** 2 + 1
        similar_string = StringIO(self.similar(seed_rows, row_limit=row_limit))
        similar = numpy.genfromtxt(
            similar_string,
            delimiter=',',
            skip_header=0)
        similar = similar.clip(0., 5.)
        similar = numpy.exp(similar)
        clustering = SpectralClustering(
            n_clusters=cluster_count,
            affinity='precomputed')
        labels = clustering.fit_predict(similar)

        if rows_to_cluster is None:
            return zip(labels, seed_rows)
        else:
            row_labels = []
            for row in rows_to_cluster:
                similar_scores = self.similar(
                    [row],
                    seed_rows,
                    row_limit=row_limit)
                similar_scores = numpy.genfromtxt(
                    StringIO(similar_scores),
                    delimiter=',',
                    skip_header=0)
                assert len(similar_scores) == len(labels)
                label_scores = zip(similar_scores, labels)
                top = sorted(label_scores, reverse=True)[:nearest_neighbors]
                label_counts = Counter(zip(*top)[1]).items()
                top_label = sorted(label_counts, key=lambda x: -x[1])[0][0]
                row_labels.append(top_label)
            return zip(row_labels, rows_to_cluster)
def clusterSentencesandConsolidate():
	ClusterFile = open("../../Temp/SentencesToCluster.txt",'r')
	documents = ClusterFile.readlines()
	ClusterFile.close()
	line_count = len(documents)
	vectorizer = TfidfVectorizer(stop_words='english')
	X = vectorizer.fit_transform(documents)

	noOfClusters = line_count/10
	#####
	model = SpectralClustering(n_clusters=noOfClusters,eigen_solver='arpack',eigen_tol=0.01,assign_labels = 'discretize')
	y = model.fit_predict(X)

	clusterSentenceIndex = []
	for i in xrange(len(y)):
		temp = []
		temp.append(y[i])
		temp.append(documents[i])

		clusterSentenceIndex.append(temp)

	clusterSentenceIndex.sort()

	# Writing to the file
#	outputIndexFile = open('../../Temp/sentence-sluster-sorted-index.txt','w')
#	for i in xrange(len(clusterSentenceIndex)):
#		if int(clusterSentenceIndex[i][0]) >= 0:
#			line = clusterSentenceIndex[i][1] +'$'+clusterSentenceIndex[i][0]+'\n'
#			outputIndexFile.write(line)
#	outputIndexFile.close()		

## Consolidate into different clusterd
	
	cluster_to_sentence_dict = defaultdict(list)
	for each_line in clusterSentenceIndex:
		cluster,sentence=each_line
		if cluster in cluster_to_sentence_dict:
			cluster_to_sentence_dict[cluster].append(sentence)
		else:
			cluster_to_sentence_dict[cluster] = [sentence]

	return cluster_to_sentence_dict		
Example #22
def Spectral(Aff,k):
    '''***************Imports****************'''
    ##################################################################
    import os, sys, inspect, time  # @UnusedImport
    sys.path.insert(0, 'C:\Users\user\Anaconda\Lib\site-packages')
    from sklearn.cluster import SpectralClustering  # @UnresolvedImport @UnusedImport
    
    ##################################################################
    
    '''***************Spectral***************'''
    ##################################################################

    print "clustering with Spectral clustering, k = " +str(k)
    end = time.time()
    estimator = SpectralClustering(n_clusters=k,affinity='precomputed')
    labels = estimator.fit_predict(Aff)
    ##################################################################
    
    end2 = time.time()
    print "model time is %s seconds " %str(int(end2-end))
    print "%s clusters found" %str(len(set(labels)))
    return labels
Example #23
    def compute_meta_centroid_set(self, **kwargs):

        C = self.load_centroid_dataset("subcluster_centroids")
        print "Intermediate clusters", C.shape

        # By eye, it looks like the top 60%-80% of the
        # remaining clusters are stable...

        nc = int(self.subcluster_pcut * self.subcluster_kn)
        clf = SpectralClustering(n_clusters=nc, affinity="precomputed")

        S = cosine_affinity(C)
        labels = clf.fit_predict(S)

        meta_clusters = []
        meta_cluster_size = []
        for i in range(labels.max() + 1):
            idx = labels == i
            mu = C[idx].mean(axis=0)
            mu /= np.linalg.norm(mu)
            meta_clusters.append(mu)
            meta_cluster_size.append(idx.sum())

        return meta_clusters
Example #24
#%%
df = pd.read_csv('../../data/Stage2DataFiles/RegularSeasonCompactResults.csv')
teams = pd.read_csv('../../data/Stage2DataFiles/Teams.csv')
df = pd.merge(df,
              teams[['TeamID', 'TeamName']],
              left_on='WTeamID',
              right_on='TeamID')
del df['TeamID']
df = df.rename(columns={'TeamName': 'TmName'})
df = pd.merge(df,
              teams[['TeamID', 'TeamName']],
              left_on='LTeamID',
              right_on='TeamID')
del df['TeamID']
df = df.rename(columns={'TeamName': 'OppName'})
df = df.loc[df['Season'] == 2018]

# %%
g = nx.Graph()
edges = [tuple(x) for x in df[['TmName', 'OppName']].to_numpy()]
g.add_edges_from(edges)
A = nx.to_numpy_matrix(g)
teamList = g.nodes()

# %%
clustering = SpectralClustering(affinity='precomputed')
labels = clustering.fit_predict(A)
results = pd.DataFrame({'TeamName': teamList, 'cluster': labels})

# %%
Example #25
# author: Jie.Hu

# spectral clustering
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.cluster import SpectralClustering

# define dataset
X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           random_state=4)
# define the model
model = SpectralClustering(n_clusters=2)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = np.unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = np.where(yhat == cluster)
    # create scatter of these samples
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
Example #26
def get_clusters(data, k):
	model = SpectralClustering(n_clusters=k, gamma = 0.3)
	# model = DPGMM(n_components = k)
	return model.fit_predict(data)
Example #27
plt.figure(2)
kmeans = KMeans(n_clusters=2)
kmeans.fit(points)
clusters_kmeans = kmeans.predict(points)
 

plt.scatter(x,y, c=clusters_kmeans, s=50);


# draw the cluster centers
centers=kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);


plt.title("kmeans")
print("Kliknij w obrazek..")
plt.waitforbuttonpress()
plt.figure(3)

 

model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', assign_labels='kmeans')
labels = model.fit_predict(points)
plt.scatter(points[:, 0], points[:, 1], c=labels, s=50, cmap='viridis');
plt.title("Spectral Clustering")



plt.show()
Example #28
## Clustering

print()
print("Compute clusters")

scores_spc_nn_pca = []
labels_spc_nn_pca = []

spc = SpectralClustering(n_clusters=5,
                         affinity="nearest_neighbors",
                         n_neighbors=5,
                         n_jobs=3)

for i, da in enumerate(data_pca):
    labels = spc.fit_predict(da)
    score = nmi_score(labels_true, labels)

    scores_spc_nn_pca.append(score)
    labels_spc_nn_pca.append(labels)

    print(i, score)
    if score > 0.99:
        print("x")

max_i = np.argmax(scores_spc_nn_pca)
max_labels = labels_spc_nn_pca[max_i]

print()
print("Best score:", scores_spc_nn_pca[max_i])
print(
Example #29
def writeSpectralClustering(X, number, objectsNames):
    clustering = SpectralClustering(n_clusters=number, affinity='nearest_neighbors')
    results = np.array(clustering.fit_predict(X))

    resuldDF = pd.DataFrame({IMAGES:objectsNames, CLUSTERS:results})
    resuldDF.to_csv(FOLDER + "/" + SPECTRALCLUSTERING+"_"+str(number)+".csv", index=False)
Example #30
def main():
    x,label = iter(all_loader).next()
    print('x:',x.shape, 'label:',label.shape)

    model = Lenet()
    criteon1 = nn.CrossEntropyLoss()
    criteon2 = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(),lr=1e-3)
    print(model)
    for epoch in range(500):
        model.train()
        for batchidx, (x, label) in enumerate(all_loader):
            # x: [b,1,100,100]
            # label: [b]
            #print(label)
            if int(label) == -1:
                print(label)
                logits = model(x,-1,True)
                print(logits.shape)
                print(x.shape)
                loss = 0.00001*criteon2(logits, x)
            else:
                logits = model(x,label, True)
                loss = criteon1(logits, label)

            # logits: [b, 10]
            # label: [b]
            # loss: tensor scalar

            # print(logits)
            # print(label)


            # backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(epoch, loss.item())

        model.eval()
        with torch.no_grad():
            # test

            # use the conv and fc_unit1 parts of the model
            encoder = torch.randn(301, 184)
            label = torch.randn(301)
            for x,y in encoder_loader:
                #x,y = iter(all_loader).next()
                with torch.no_grad():
                    x_encoder = model(x,y,False)
                    # label.append(y)
                    # encoder.append(x_encoder)
                    label = y
                    encoder = x_encoder
            encoder = encoder.numpy()
            label = label.numpy()
            #print(encoder.shape,label.shape)

            from sklearn.cluster import SpectralClustering
            from sklearn.metrics import adjusted_rand_score
            from sklearn.metrics import normalized_mutual_info_score
            from sklearn.metrics.pairwise import cosine_similarity
            #simatrix = np.arange(len(encoder) ** 2, dtype=float).reshape(len(encoder), -1)
            simatrix = 0.5 * cosine_similarity(encoder) + 0.5
            SC = SpectralClustering(affinity='precomputed', assign_labels='discretize')#, random_state=100)
            label1 = SC.fit_predict(simatrix)
            print('epoch:',epoch)
            print('label:',label.shape)
            ARI = adjusted_rand_score(label, label1)
            NMI = normalized_mutual_info_score(label, label1)
            # if ARI > 0.9:
            #     print("Spectral clustering ARI:", ARI)
            #
            # if NMI > 0.9:
            #     print("Spectral clustering NMI:", NMI)
            print("Spectral clustering ARI:", ARI)
            print("Spectral clustering NMI:", NMI)


            # k-means clustering
            from sklearn.metrics import adjusted_rand_score
            from sklearn.metrics import normalized_mutual_info_score
            from sklearn.cluster import KMeans
            from sklearn import metrics

            label1 = KMeans(n_clusters=11).fit_predict(encoder)
            ARI = adjusted_rand_score(label, label1)
            NMI = normalized_mutual_info_score(label, label1)
            # if ARI > 0.9:
            #     print("k-means:ARI", ARI)
            #
            # if NMI > 0.9:
            #     print("k-means:NMI", NMI)
            print("k-means:ARI", ARI)
            print("k-means:NMI", NMI)
Example #31
def main(FILE_ANALISE, N_CLUSTERS=2):
    # Feature Extraction
    def extract_features(y, sr, window, hop, n_mfcc):
        mfcc = librosa.feature.mfcc(
                y=y, sr=sr, hop_length=int(hop*sr),
                n_fft=int(window*sr), n_mfcc=n_mfcc, dct_type=2)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
        stacked = np.vstack((mfcc, mfcc_delta, mfcc_delta2))
        return stacked.T

    # code modified for compactness
    # original code
    # https://github.com/wiseman/py-webrtcvad/blob/master/example.py
    def write_wave(path, audio, sample_rate):
        with contextlib.closing(wave.open(path, 'wb')) as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio)

    class Frame(object):
        def __init__(self, bytes, timestamp, duration):
            self.bytes = bytes
            self.timestamp = timestamp
            self.duration = duration

    def frame_generator(frame_duration_ms, audio, sample_rate):
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        while offset + n < len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n

    def vad_collector(
            sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
        num_padding_frames = int(padding_duration_ms / frame_duration_ms)
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        voiced_frames = []
        for frame in frames:
            is_speech = vad.is_speech(frame.bytes, sample_rate)
            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > 0.9 * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        voiced_frames.append(f)
                    ring_buffer.clear()
            else:
                voiced_frames.append(frame)
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len(
                        [f for f, speech in ring_buffer if not speech])
                if num_unvoiced > 0.9 * ring_buffer.maxlen:
                    triggered = False
                    yield b''.join([f.bytes for f in voiced_frames])
                    ring_buffer.clear()
                    voiced_frames = []
        if voiced_frames:
            yield b''.join([f.bytes for f in voiced_frames])

    def map_adaptation(
            gmm, data, max_iterations=300,
            likelihood_threshold=1e-20, relevance_factor=16):
        N = data.shape[0]
        D = data.shape[1]
        K = gmm.n_components

        mu_new = np.zeros((K, D))
        n_k = np.zeros((K, 1))

        mu_k = gmm.means_
        cov_k = gmm.covariances_
        pi_k = gmm.weights_

        old_likelihood = gmm.score(data)
        new_likelihood = 0
        iterations = 0
        while(abs(
                old_likelihood - new_likelihood) >
                likelihood_threshold and iterations < max_iterations):
            iterations += 1
            old_likelihood = new_likelihood
            z_n_k = gmm.predict_proba(data)
            n_k = np.sum(z_n_k, axis=0)

            for i in range(K):
                temp = np.zeros((1, D))
                for n in range(N):
                    temp += z_n_k[n][i]*data[n, :]
                mu_new[i] = (1/n_k[i])*temp

            adaptation_coefficient = n_k/(n_k + relevance_factor)
            for k in range(K):
                mu_k[k] = (
                        adaptation_coefficient[k] * mu_new[k]) + (
                                (1 - adaptation_coefficient[k]) * mu_k[k])
            gmm.means_ = mu_k

            log_likelihood = gmm.score(data)
            new_likelihood = log_likelihood
            print(log_likelihood)
        return gmm

    # Settings
    SR = 16000  # sample rate
    N_MFCC = 13  # number of MFCC to extract
    N_FFT = 0.032  # length of the FFT window in seconds
    HOP_LENGTH = 0.010  # number of samples between successive frames in sec

    N_COMPONENTS = 16  # number of gaussians
    COVARINACE_TYPE = 'full'  # cov type for GMM

    y = []
    # LOAD_SIGNAL = False
    LOAD_SIGNAL = True
    if LOAD_SIGNAL:
        y, sr = librosa.load(FILE_ANALISE, sr=SR)
        pre_emphasis = 0.97
        y = np.append(y[0], y[1:] - pre_emphasis * y[:-1])

    # MAKE_CHUNKS = False
    MAKE_CHUNKS = True

    if MAKE_CHUNKS:
        vad = webrtcvad.Vad(2)
        audio = np.int16(y/np.max(np.abs(y)) * 32768)

        frames = frame_generator(10, audio, sr)
        frames = list(frames)
        segments = vad_collector(sr, 50, 200, vad, frames)

        if not os.path.exists('data/chunks'):
            os.makedirs('data/chunks')

        for i, segment in enumerate(segments):
            chunk_name = 'data/chunks/chunk-%003d.wav' % (i,)
            write_wave(
                    chunk_name, segment[0: len(segment)-int(100*sr/1000)], sr)

    # extract MFCC, first and second derivatives
    FEATURES_FROM_FILE = True
    # FEATURES_FROM_FILE = False

    feature_file_name = 'data/param/features_{0}.pkl'.format(N_MFCC)

    if FEATURES_FROM_FILE:
        ubm_features = pickle.load(open(feature_file_name, 'rb'))
    else:
        ubm_features = extract_features(
                np.array(y), sr, window=N_FFT, hop=HOP_LENGTH, n_mfcc=N_MFCC)
        ubm_features = preprocessing.scale(ubm_features)
        pickle.dump(ubm_features, open(feature_file_name, "wb"))

    # UBM Train
    UBM_FROM_FILE = True
    # UBM_FROM_FILE = False

    ubm_file_name = 'data/param/ubm_{0}_{1}_{2}MFCC.pkl'.format(
            N_COMPONENTS, COVARINACE_TYPE, N_MFCC)

    if UBM_FROM_FILE:
        ubm = pickle.load(open(ubm_file_name, 'rb'))
    else:
        ubm = GaussianMixture(
                n_components=N_COMPONENTS, covariance_type=COVARINACE_TYPE)
        ubm.fit(ubm_features)
        pickle.dump(ubm, open(ubm_file_name, "wb"))
    # print(ubm.score(ubm_features))

    SV = []
    num_chunk = len(listdir(os.getcwd()+'\data\chunks'))
    for i in range(num_chunk):
        clear_output(wait=True)
        fname = 'data/chunks/chunk-%003d.wav' % (i,)
        # print('UBM MAP adaptation for {0}'.format(fname))
        y_, sr_ = librosa.load(fname, sr=None)
        f_ = extract_features(
                y_, sr_, window=N_FFT, hop=HOP_LENGTH, n_mfcc=N_MFCC)
        f_ = preprocessing.scale(f_)
        gmm = copy.deepcopy(ubm)
        gmm = map_adaptation(gmm, f_, max_iterations=1, relevance_factor=16)
        sv = gmm.means_.flatten()
        try:
            sv = preprocessing.scale(sv)
        except:
            pass
        SV.append(sv)

    SV = np.array(SV)
    clear_output()
    # print(SV.shape)

    def rearrange(labels, n):
        seen = set()
        distinct = [x for x in labels if x not in seen and not seen.add(x)]
        correct = [i for i in range(n)]
        dict_ = dict(zip(distinct, correct))
        return [x if x not in dict_ else dict_[x] for x in labels]

    sc = SpectralClustering(n_clusters=N_CLUSTERS, affinity='cosine')
    labels = sc.fit_predict(SV)
    labels = rearrange(labels, N_CLUSTERS)
    print('Processing complete.')
    return labels
n_class = 5
Xs, labels = load_UCImultifeature(select_labeled=list(range(n_class)),
                                  views=[0, 1])

###############################################################################
# Singleview spectral clustering
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Cluster each view separately and compute nmi

s_spectral = SpectralClustering(n_clusters=n_class,
                                random_state=RANDOM_SEED,
                                n_init=100)

for i in range(len(Xs)):
    s_clusters = s_spectral.fit_predict(Xs[i])
    s_nmi = nmi_score(labels, s_clusters, average_method='arithmetic')
    print('Single-view View {0:d} NMI Score: {1:.3f}\n'.format(i + 1, s_nmi))

# Concatenate the multiple views into a single view and produce clusters
s_data = np.hstack(Xs)
s_clusters = s_spectral.fit_predict(s_data)

s_nmi = nmi_score(labels, s_clusters)
print('Single-view Concatenated NMI Score: {0:.3f}\n'.format(s_nmi))

###############################################################################
# Co-Regularized multiview spectral clustering
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Use the MultiviewSpectralClustering instance to cluster the data
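#
# The multiview call itself is not shown here; a hedged sketch of what it might
# look like, assuming mvlearn's MultiviewSpectralClustering (the parameter values
# below are assumptions, not the original script's settings):

from mvlearn.cluster import MultiviewSpectralClustering

m_spectral = MultiviewSpectralClustering(n_clusters=n_class,
                                         random_state=RANDOM_SEED,
                                         n_init=100)
m_clusters = m_spectral.fit_predict(Xs)
m_nmi = nmi_score(labels, m_clusters)
print('Multi-view NMI Score: {0:.3f}\n'.format(m_nmi))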
import scipy as sp
from centroid import all_separate, initialize, plot_graph, find_location_smallmask
import numpy as np
from mayavi import mlab
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from sklearn.linear_model import LogisticRegression

nSubjects = 40
training_data = np.array([])
training_labels = np.array([])
R_all, vertices, faces, mask, rho, rho_1 = initialize()
nCluster = 3
SC = SpectralClustering(n_clusters=nCluster, affinity='precomputed')
labels = SC.fit_predict(rho)
label = np.zeros(vertices.shape[0], dtype=float)
label[mask] = labels + 1

temp_d = R_all[mask, :39 * 1200]
temp_rho = np.corrcoef(temp_d)
temp_rho[~np.isfinite(temp_rho)] = 0
temp_labels = SC.fit_predict(temp_rho)
temp_label = np.zeros(vertices.shape[0], dtype=float)
temp_label[mask] = temp_labels + 1
mlab.triangular_mesh(vertices[:, 0],
                     vertices[:, 1],
                     vertices[:, 2],
                     faces,
                     representation='surface',
                     opacity=1,
                     scalars=np.float64(temp_label))
mlab.gcf().scene.parallel_projection = True
def intuitive_semi_supervised(userId, file_path, inputlabels, k_min, k_max,
                              num_cluster, assignLabels, seed, method):
    # labels = pd.read_csv(label_path)
    # label_list = labels["Labels"].to_list()
    label_list = inputlabels.to_list()
    total_len = len(label_list)
    unknown_label = -1
    total_labeled = 0
    optimal_accuracy = 0
    optimal_k_min = 0
    optimal_k_max = 0
    kmer_table = pd.DataFrame(data={})
    output_df = pd.DataFrame(data={})
    for lab in label_list:
        if lab != unknown_label:
            total_labeled = total_labeled + 1
    res = [0] * total_len
    if assignLabels == "none":
        for i in range(k_min, k_max + 1):
            for j in range(i, k_max + 1):
                temp_k_min = i
                temp_k_max = j
                kmer_table, output_df = get_kmer_table(file_path, temp_k_min,
                                                       temp_k_max)
                spectral_clustering = SpectralClustering(
                    n_clusters=num_cluster,
                    assign_labels="kmeans",
                    random_state=seed)
                labels = spectral_clustering.fit_predict(kmer_table)
                correct_count = 0
                for k in range(len(label_list)):
                    if label_list[k] != unknown_label:
                        if label_list[k] == labels[k]:
                            correct_count += 1
                temp_accuracy = correct_count / total_labeled
                if temp_accuracy > optimal_accuracy:
                    optimal_accuracy = temp_accuracy
                    optimal_k_min = i
                    optimal_k_max = j
                    res = labels
        for i in range(k_min, k_max + 1):
            for j in range(i, k_max + 1):
                temp_k_min = i
                temp_k_max = j
                kmer_table, output_df = get_kmer_table(file_path, temp_k_min,
                                                       temp_k_max)
                spectral_clustering = SpectralClustering(
                    n_clusters=num_cluster,
                    assign_labels="discretize",
                    random_state=seed)
                labels = spectral_clustering.fit_predict(kmer_table)
                correct_count = 0
                for k in range(len(label_list)):
                    if label_list[k] != unknown_label:
                        if label_list[k] == labels[k]:
                            correct_count += 1
                temp_accuracy = correct_count / total_labeled
                if temp_accuracy > optimal_accuracy:
                    optimal_accuracy = temp_accuracy
                    optimal_k_min = i
                    optimal_k_max = j
                    res = labels
        # update parameters for front end
        new_params = {
            'accuracy': optimal_accuracy,
            'k_min': optimal_k_min,
            'k_max': optimal_k_max
        }
        update_parameters(userId, new_params)
    else:
        for i in range(k_min, k_max + 1):
            for j in range(i, k_max + 1):
                temp_k_min = i
                temp_k_max = j
                kmer_table, output_df = get_kmer_table(file_path, temp_k_min,
                                                       temp_k_max)
                spectral_clustering = SpectralClustering(
                    n_clusters=num_cluster,
                    assign_labels=assignLabels,
                    random_state=seed)
                labels = spectral_clustering.fit_predict(kmer_table)
                correct_count = 0
                temp_accuracy = 0
                for k in range(len(label_list)):
                    if label_list[k] != unknown_label:
                        if label_list[k] == labels[k]:
                            correct_count += 1
                temp_accuracy = correct_count / total_labeled
                if temp_accuracy > optimal_accuracy:
                    optimal_accuracy = temp_accuracy
                    optimal_k_min = i
                    optimal_k_max = j
                    res = labels
        # update parameters for front end
        new_params = {
            'accuracy': optimal_accuracy,
            'k_min': optimal_k_min,
            'k_max': optimal_k_max
        }
        update_parameters(userId, new_params)
    plot_div = plotly_dash_show_plot(userId, kmer_table, res,
                                     "Semi-supervised Spectral Clustering",
                                     method)
    output_df.insert(0, "Labels", res)
    return [[output_df], [plot_div]]
Example #35
    def spectral_init(self):
        from sklearn.cluster import SpectralClustering
        spectral_clust = SpectralClustering(n_clusters=self.k, affinity='precomputed')
        spectral_clust.fit(self.A)
        self.Z = spectral_clust.fit_predict(self.A)
Example #36
def supervised_clu(feature, rmMulti, trial):
    (part1Pos, part1Neg, part2Pos, part2Neg, part3Pos, part3Neg, part4Pos,
     part4Neg, part5Pos, part5Neg, globalPos,
     globalNeg) = data_selection(feature, rmMulti)
    sumpurity = 0
    sumfone = 0
    for i in range(0, trial):
        print '#', i + 1, 'trial!!!'
        pos_dataset = dic2List(
            globalPos
        )  # dic2List(part1Pos) + dic2List(part2Pos) + dic2List(part3Pos) + dic2List(part4Pos) + dic2List(part5Pos)  #
        neg_dataset = dic2List(
            globalNeg
        )  # dic2List(part1Neg) + dic2List(part2Neg) + dic2List(part3Neg) + dic2List(part4Neg) + dic2List(part5Neg)  #
        # print len(pos_dataset)

        num_pos_sample = int(0.3 * len(pos_dataset))
        num_neg_sample = num_pos_sample

        (posPicked, posNotPicked) = takingSamples(pos_dataset,
                                                  num=num_pos_sample)
        (negPicked, negNotPicked) = takingSamples(neg_dataset,
                                                  num=num_neg_sample)
        # print len(posPicked),len(negPicked)
        # print posPicked, posNotPicked

        # train_X = pd.DataFrame(mat2arr(list2Dic(posPicked).values() + list2Dic(negPicked).values()))
        train_X = pd.DataFrame(
            list2Dic(posPicked).values() + list2Dic(negPicked).values())
        train_y = np.array(
            [1 for i in range(len(list2Dic(posPicked).values()))] +
            [0 for i in range(len(list2Dic(negPicked).values()))])
        print len(train_X), len(train_y)

        reg = RFC(n_estimators=200, max_features='log2')
        model = reg.fit(train_X, train_y)
        # print 'model ready!'

        # print 'get affinity matrix...'
        matrixVal = {}
        for item in posPicked:
            matrixVal[str(item.keys()[0])] = 1
        for item in negPicked:
            matrixVal[str(item.keys()[0])] = 0

        test_X = posNotPicked + negNotPicked
        modelIn = list2Dic(test_X)
        test_Y = model.predict_proba(modelIn.values())[:, 1]
        for i in range(0, len(modelIn)):
            matrixVal[modelIn.keys()[i]] = test_Y[i]

        # print matrixVal.keys()
        # print map(eval,matrixVal.keys())
        # print matrixVal.values()
        # print size
        row = []
        col = []
        docMap = {}
        mapDoc = {}
        size = 0
        for pair in map(eval, matrixVal.keys()):
            for doc in pair:
                if not docMap.has_key(doc):
                    docMap[doc] = size
                    mapDoc[size] = doc
                    size += 1
        # print mapDoc
        # print docMap
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[0]])
            col.append(docMap[pair[1]])
        for pair in map(eval, matrixVal.keys()):
            row.append(docMap[pair[1]])
            col.append(docMap[pair[0]])
        data = matrixVal.values() + matrixVal.values()
        # print size
        affinity = csc_matrix((data, (row, col)), shape=(size, size)).toarray()
        # print 'affinity matrix get!'

        # print 'run clustering...'
        # groundTruth = json.loads(open('groundTruth.json').read())
        # groundTruth = json.loads(open('rmMultiGroundTruth.json').read()) # some documents appears in one part only once, but multiple time in global
        groundTruth = json.loads(open('rmMultiGroundTruthNew.json').read(
        ))  # rmMultiGroundTruthNew.json is for simply combining all parts only
        # groundTruth = json.loads(open('part1CluInd.json').read())
        # groundTruth = json.loads(open('rmMultiPart5CluInd.json').read())
        num_clu = len(groundTruth)
        # print num_clu
        model = SC(n_clusters=num_clu, affinity='precomputed')
        res = model.fit_predict(affinity)
        # print res
        # print len(res), len(set(res))

        resDic = {}
        for i in range(len(res)):
            if not resDic.has_key(res[i]):
                resDic[res[i]] = []
                resDic[res[i]].append(mapDoc[i])
            else:
                resDic[res[i]].append(mapDoc[i])
        result = resDic.values()

        purVal = purity(result, groundTruth)
        (pre, rec, fone) = fmeasure(result, groundTruth)
        sumpurity += purVal
        sumfone += fone
        print 'purity %.4f' % purVal, 'precision: %.4f' % pre, 'recall: %.4f' % rec, 'f1: %.4f' % fone

        return (sumpurity, sumfone)
Example #37
File: train.py  Project: lavoiems/Cats-UDT
def train(args):
    parameters = vars(args)
    train_loader1, test_loader1 = args.loaders1
    train_loader2, test_loader2 = args.loaders2

    models = define_models(**parameters)
    initialize(models, args.reload, args.save_path, args.model_path)

    ssx = args.ssx.to(args.device)
    ssx.eval()

    zxs, labelsx = get_initial_zx(train_loader1, ssx, args.device)
    zys, labelsy = get_initial_zx(train_loader2, ssx, args.device)

    sc = SpectralClustering(args.nc, affinity='sigmoid', gamma=1.7)
    clusters = sc.fit_predict(zxs.cpu().numpy())
    clusters = torch.from_numpy(clusters).to(args.device)

    classifier = models['classifier'].to(args.device)
    discriminator = models['discriminator'].to(args.device)
    classifier.apply(he_init)
    discriminator.apply(he_init)
    print(classifier)
    print(discriminator)

    optim_discriminator = optim.Adam(discriminator.parameters(),
                                     lr=args.lr,
                                     betas=(args.beta1, args.beta2))
    optim_classifier = optim.Adam(classifier.parameters(),
                                  lr=args.lr,
                                  betas=(args.beta1, args.beta2))
    optims = {
        'optim_discriminator': optim_discriminator,
        'optim_classifier': optim_classifier
    }

    iteration = infer_iteration(
        list(models.keys())[0], args.reload, args.model_path, args.save_path)
    t0 = time.time()
    for i in range(iteration, args.iterations):
        classifier.train()
        discriminator.train()

        perm = torch.randperm(len(zxs))
        ix = perm[:args.train_batch_size]
        zx = zxs[ix]
        perm = torch.randperm(len(zys))
        iy = perm[:args.train_batch_size]
        zy = zys[iy]

        optim_discriminator.zero_grad()
        d_loss = disc_loss(zx, zy, discriminator, classifier.x, classifier.mlp,
                           args.device)
        d_loss.backward()
        optim_discriminator.step()

        perm = torch.randperm(len(zxs))
        ix = perm[:args.train_batch_size]
        zx = zxs[ix]
        label = clusters[ix].long()
        perm = torch.randperm(len(zys))
        iy = perm[:args.train_batch_size]
        zy = zys[iy]

        optim_classifier.zero_grad()
        c_loss = classification_loss(zx, label, classifier)
        tcw_loss = classification_target_loss(zy, classifier)
        dw_loss = embed_div_loss(zx, zy, discriminator, classifier.x,
                                 classifier.mlp, args.device)
        m_loss1 = mixup_loss(zx, classifier, args.device)
        m_loss2 = mixup_loss(zy, classifier, args.device)
        (args.cw * c_loss).backward()
        (args.tcw * tcw_loss).backward()
        (args.dw * dw_loss).backward()
        (args.smw * m_loss1).backward()
        (args.tmw * m_loss2).backward()
        optim_classifier.step()

        if i % args.evaluate == 0:
            print('Iter: %s' % i, time.time() - t0)
            classifier.eval()

            class_map = evaluate_cluster(args.visualiser, i, args.nc, zxs,
                                         labelsx, classifier, f'x',
                                         args.device)
            evaluate_cluster_accuracy(args.visualiser, i, zxs, labelsx,
                                      class_map, classifier, f'x', args.device)
            evaluate_cluster_accuracy(args.visualiser, i, zys, labelsy,
                                      class_map, classifier, f'y', args.device)

            save_path = args.save_path
            with open(os.path.join(save_path, 'c_loss'), 'a') as f:
                f.write(f'{i},{c_loss.cpu().item()}\n')
            with open(os.path.join(save_path, 'tcw_loss'), 'a') as f:
                f.write(f'{i},{tcw_loss.cpu().item()}\n')
            with open(os.path.join(save_path, 'dw_loss'), 'a') as f:
                f.write(f'{i},{dw_loss.cpu().item()}\n')
            with open(os.path.join(save_path, 'm_loss1'), 'a') as f:
                f.write(f'{i},{m_loss1.cpu().item()}\n')
            with open(os.path.join(save_path, 'm_loss2'), 'a') as f:
                f.write(f'{i},{m_loss2.cpu().item()}\n')
            with open(os.path.join(save_path, 'd_loss2'), 'a') as f:
                f.write(f'{i},{d_loss.cpu().item()}\n')
            args.visualiser.plot(c_loss.cpu().detach().numpy(),
                                 title='Source classifier loss',
                                 step=i)
            args.visualiser.plot(tcw_loss.cpu().detach().numpy(),
                                 title='Target classifier cross entropy',
                                 step=i)
            args.visualiser.plot(dw_loss.cpu().detach().numpy(),
                                 title='Classifier marginal divergence',
                                 step=i)
            args.visualiser.plot(m_loss1.cpu().detach().numpy(),
                                 title='Source mix up loss',
                                 step=i)
            args.visualiser.plot(m_loss2.cpu().detach().numpy(),
                                 title='Target mix up loss',
                                 step=i)
            args.visualiser.plot(d_loss.cpu().detach().numpy(),
                                 title='Discriminator loss',
                                 step=i)
            t0 = time.time()
            save_models(models, i, args.model_path, args.evaluate)
            save_models(optims, i, args.model_path, args.evaluate)
# divide dataset into train and test sets in 7 : 3 ratio
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

# print(X_train[:, 2:])
# # print(len(y))
# print(X_train)

# derive the clusters
n_clusters = 3  # len(np.unique(y_train))
clu = SpectralClustering(n_clusters=n_clusters, n_jobs=-1)
clu.fit(X_train[:, 2:])
y_labels_train = clu.labels_
y_labels_test = clu.fit_predict(X_test[:, 2:])

predict = []
predict.append(new)
predict = np.array(predict)

# train the dataset using a classification algorithm, using the clusters derived above as the target class / output column
clf = XGBClassifier(n_jobs=-1)
clf.fit(X_train[:, 2:], y_labels_train)
print(clf)

# predict the target class / output of the new user's datapoint
prediction = clf.predict(predict[:, 2:])
predict_class = prediction[0]
s = new.size
prediction_np = np.array(prediction)
Example #39
class CIF_Dataset(Dataset):
    def __init__(self,
                 part_data=None,
                 norm_obj=None,
                 normalization=None,
                 max_num_nbr=12,
                 radius=8,
                 dmin=0,
                 step=0.2,
                 cls_num=3,
                 root_dir='DATA/CIF-DATA/'):
        self.root_dir = root_dir
        self.max_num_nbr, self.radius = max_num_nbr, radius
        self.normalizer = norm_obj
        self.normalization = normalization
        self.full_data = part_data
        self.ari = AtomCustomJSONInitializer(self.root_dir + 'atom_init.json')
        self.gdf = GaussianDistance(dmin=dmin, dmax=self.radius, step=step)
        self.clusterizer = SPCL(n_clusters=cls_num,
                                random_state=None,
                                assign_labels='discretize')
        self.clusterizer2 = KMeans(n_clusters=cls_num, random_state=None)
        self.encoder_elem = ELEM_Encoder()
        self.update_root = None

    def __len__(self):
        return len(self.full_data)

    @functools.lru_cache(maxsize=None)  # Cache loaded structures
    def __getitem__(self, idx):
        cif_id, target = self.full_data.iloc[idx]
        crystal = Structure.from_file(
            os.path.join(self.root_dir, cif_id + '.cif'))
        atom_fea = np.vstack([
            self.ari.get_atom_fea(crystal[i].specie.number)
            for i in range(len(crystal))
        ])
        atom_fea = torch.Tensor(atom_fea)
        all_nbrs = crystal.get_all_neighbors(self.radius, include_index=True)
        all_nbrs = [sorted(nbrs, key=lambda x: x[1]) for nbrs in all_nbrs]
        nbr_fea_idx, nbr_fea = [], []
        for nbr in all_nbrs:
            if len(nbr) < self.max_num_nbr:
                nbr_fea_idx.append(
                    list(map(lambda x: x[2], nbr)) + [0] *
                    (self.max_num_nbr - len(nbr)))
                nbr_fea.append(
                    list(map(lambda x: x[1], nbr)) + [self.radius + 1.] *
                    (self.max_num_nbr - len(nbr)))
            else:
                nbr_fea_idx.append(
                    list(map(lambda x: x[2], nbr[:self.max_num_nbr])))
                nbr_fea.append(
                    list(map(lambda x: x[1], nbr[:self.max_num_nbr])))
        nbr_fea_idx, nbr_fea = np.array(nbr_fea_idx), np.array(nbr_fea)

        nbr_fea = self.gdf.expand(nbr_fea)
        g_coords = crystal.cart_coords
        groups = [0] * len(g_coords)
        if len(g_coords) > 2:
            try:
                groups = self.clusterizer.fit_predict(g_coords)
            except:
                groups = self.clusterizer2.fit_predict(g_coords)
        groups = torch.tensor(groups).long()

        atom_fea = torch.Tensor(atom_fea)
        nbr_fea = torch.Tensor(nbr_fea)
        nbr_fea_idx = self.format_adj_matrix(torch.LongTensor(nbr_fea_idx))
        target = torch.Tensor([float(target)])

        coordinates = torch.tensor(g_coords)
        enc_compo = self.encoder_elem.encode(crystal.composition)
        return (atom_fea, nbr_fea,
                nbr_fea_idx), groups, enc_compo, coordinates, target, cif_id, [
                    crystal[i].specie for i in range(len(crystal))
                ]

    def format_adj_matrix(self, adj_matrix):
        size = len(adj_matrix)
        src_list = list(range(size))
        all_src_nodes = torch.tensor([[x] * adj_matrix.shape[1]
                                      for x in src_list
                                      ]).view(-1).long().unsqueeze(0)
        all_dst_nodes = adj_matrix.view(-1).unsqueeze(0)
        return torch.cat((all_src_nodes, all_dst_nodes), dim=0)
Example #40
File: test2.py  Project: tauovir/Nielit
"""Prepare an ML model using KMeans algorithm to cluster some sample input
generated using make_moon function. Plot the clusters. Also plot the same
points by clustering it with Spectral Clustering Model.
"""
from sklearn.datasets.samples_generator  import make_moons
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
import sklearn.metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

X,y_true = make_moons(n_samples = 300,noise = 0.05)
kmeans = KMeans(n_clusters = 4)
kmeans.fit(X)
y_means = kmeans.predict(X)
plt.scatter(X[ :,0], X[ :,1], s=50,c = y_means, cmap = 'viridis' )
#plt.show()

model = SpectralClustering(2,affinity = 'nearest_neighbors')
labels = model.fit_predict(X)
plt.scatter(X[ :,0], X[ :,1], s=50,c = labels, cmap = 'viridis' )
plt.show()
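
# The script imports sklearn.metrics but never uses it; a small follow-up sketch
# (an addition, not part of the original) scoring both clusterings against y_true:
from sklearn.metrics import adjusted_rand_score
print("KMeans ARI:", adjusted_rand_score(y_true, y_means))
print("Spectral ARI:", adjusted_rand_score(y_true, labels))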

Example #41
def mgm_floyd(X, K, num_graph, num_node):
    """
    :param K: affinity matrix, (num_graph, num_graph, num_node^2, num_node^2)
    :param num_graph: number of graph, int
    :param num_node: number of node, int
    :return: matching results, (num_graph, num_graph, num_node, num_node)
    """
    Lambda = 0
    affinity_matrix = cal_affinity_matrix(X, K, num_graph)
    max = np.max(affinity_matrix)
    min = np.min(affinity_matrix)
    affinity_matrix = (affinity_matrix - min) / (max - min)

    clu_number = 2
    cluster = SpectralClustering(n_clusters=clu_number, affinity='precomputed')
    labels_ = cluster.fit_predict(affinity_matrix)
    # print(labels_)
    clusters = [[] for i in range(clu_number)]
    for i in range(num_graph - 1):
        clusters[labels_[i]].append(i)
    index = [0]
    tmp = 0
    for i in range(len(clusters)):
        tmp += len(clusters[i])
        index.append(tmp)
    graph_rearrange = []
    for item in clusters:
        graph_rearrange.extend(item)
    # raise Exception('STOP!')

    for i in range(len(clusters)):
        for v in clusters[i]:
            begin = index[i]
            end = index[i + 1]
            rearrange = graph_rearrange[
                begin:end] + graph_rearrange[:begin] + graph_rearrange[end:]
            for x in rearrange:
                for y in rearrange:
                    # calculate S_org
                    J_xy_ori = single_affinity(X[x][y], K[x][y])
                    J_xy = (J_xy_ori - min) / (max - min)
                    S_org = J_xy

                    # calculate S_opt
                    X_xv_vy = np.matmul(X[x][v], X[v][y])
                    J_xv_vy = (single_affinity(X_xv_vy, K[x][y]) -
                               min) / (max - min)
                    S_opt = J_xv_vy

                    # compare and update
                    if S_org < S_opt:
                        X[x][y] = np.matmul(X[x][v], X[v][y])
                        X[y][x] = np.matmul(X[y][v], X[v][x])
                    if J_xy_ori < min:
                        min = J_xy_ori
                    elif J_xy_ori > max:
                        max = J_xy_ori

    # set lambda and repeat above process
    Lambda = 0.45
    affinity_matrix = cal_affinity_matrix(X, K, num_graph)
    max = np.max(affinity_matrix)
    min = np.min(affinity_matrix)
    # consistency_matrix = cal_pairwise_consistency_matrix(X, num_graph, num_node)
    # use unary consistency to speed up
    consistency_matrix = cal_unary_consistency_matrix(X, num_graph, num_node)

    flag = False  # use flag to check whether X is updated
    for i in range(len(clusters)):
        for v in clusters[i]:
            begin = index[i]
            end = index[i + 1]
            rearrange = graph_rearrange[
                begin:end] + graph_rearrange[:begin] + graph_rearrange[end:]
            if flag:
                # consistency_matrix = cal_pairwise_consistency_matrix(X, num_graph, num_node)
                # use unary consistency to speed up
                consistency_matrix = cal_unary_consistency_matrix(
                    X, num_graph, num_node)
                flag = False
            for x in rearrange:
                for y in rearrange:
                    J_xy_ori = single_affinity(X[x][y], K[x][y])
                    J_xy = (J_xy_ori - min) / (max - min)
                    Cp_xy = consistency_matrix[y]
                    S_org = (1 - Lambda) * J_xy + Lambda * Cp_xy

                    X_xv_vy = np.matmul(X[x][v], X[v][y])
                    J_xv_vy = (single_affinity(X_xv_vy, K[x][y]) -
                               min) / (max - min)
                    # if use unary consistency
                    C_xv_vy = consistency_matrix[v]
                    # if use pairwise consistency
                    # C_xv_vy = math.sqrt(consistency_matrix[x][v] * consistency_matrix[v][y])
                    S_opt = (1 - Lambda) * J_xv_vy + Lambda * C_xv_vy

                    if S_org < S_opt:
                        X[x][y] = np.matmul(X[x][v], X[v][y])
                        X[y][x] = np.matmul(X[y][v], X[v][x])
                        flag = True
                    if J_xy_ori < min:
                        min = J_xy_ori
                    elif J_xy_ori > max:
                        max = J_xy_ori
    return X
def TestCaltech():
    similarityMatrix, labels = DataLoad.LoadCaltech()
    spectralSimMatrix = sklearn.preprocessing.normalize(similarityMatrix)
    n = similarityMatrix.shape[0]
    nmiVals = np.zeros((21, 2))
    numConstraints = np.zeros((21))

    classRanges = DataGen.GenClassRanges(labels)
    normAssocAverages = []
    spectralAverages = []
    # The value inside range indicates the number of iterations to be averaged
    for j in range(1):
        for i in range(21):
            constraintMatrix = np.zeros((n, n))
            spectralConstraintMatrix = np.zeros((n, n))
            constraintMatrix = DataGen.GenerateConstraints(
                constraintMatrix, classRanges, i * 50, n, 4, False, False)
            spectralConstraintMatrix = DataGen.GenerateConstraints(
                spectralSimMatrix, classRanges, i * 50, n, 4, False, True)

            ssKernelKMeansAgent = SSKernelKMeans()
            spectralClusteringAgent = SpectralClustering(
                n_clusters=4, affinity='precomputed')

            spectralAffMatrix = spectralConstraintMatrix - csgraph.laplacian(
                spectralSimMatrix)
            spectralAffMatrix = (
                ssKernelKMeansAgent.findSigma(spectralAffMatrix) *
                np.identity(n)) + spectralAffMatrix

            ssClusterAssignments = ssKernelKMeansAgent.Cluster(
                similarityMatrix, constraintMatrix, 4)
            spectralClusteringAssignments = spectralClusteringAgent.fit_predict(
                spectralAffMatrix)

            nmiVals[i, 0] = max(
                0,
                sklearn.metrics.normalized_mutual_info_score(
                    DataGen.GetTestLabels(classRanges, n, labels.tolist()),
                    DataGen.GetTestLabels(classRanges, n,
                                          ssClusterAssignments)))
            nmiVals[i, 1] = sklearn.metrics.normalized_mutual_info_score(
                DataGen.GetTestLabels(classRanges, n, labels.tolist()),
                DataGen.GetTestLabels(classRanges, n,
                                      spectralClusteringAssignments.tolist()))

            numConstraints[i] = i * 50
            print('SS Kernel K Means NMI with ' + str(numConstraints[i]) +
                  ' constraints = ' + str(nmiVals[i, 0]))
            print('Spectral Clustering NMI with ' + str(numConstraints[i]) +
                  ' constraints = ' + str(nmiVals[i, 1]))

        normAssocAverages.append(nmiVals[:, 0])
        spectralAverages.append(nmiVals[:, 1])

    plt.plot(numConstraints, np.mean(normAssocAverages, axis=0), '--x')
    plt.plot(numConstraints, np.mean(spectralAverages, axis=0), ':s')

    plt.legend(['SS Kernel KMeans - Ratio Association', 'Spectral Clustering'],
               loc='upper left')
    plt.xlabel('Number of Constraints')
    plt.ylabel('NMI Value')
    plt.title('Caltech Data Set')

    plt.show()
예제 #43
0
# consider only 10000 data (spectralclustering memory complexity):
ind = np.array(10000 * [1] + (X.shape[0] - 10000) * [0]).astype(bool)
ind = shuffle(ind)
data_thr10 = pd.DataFrame(X[ind])
data_thr10.columns = data.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)

X = X[ind]

for n_clusters in range(2, 10):

    km = SpectralClustering(n_clusters=n_clusters)
    preds = km.fit_predict(X)

    print "components:", set(preds)
    print np.bincount(preds)

    data_thr10['preds'] = pd.Series(preds).astype("category")
    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"]

    title = str(np.bincount(preds))
    TOOLS = "wheel_zoom,box_zoom,reset,box_select,pan"
    plot_width = 900
    plot_height = 300
    x_name = 'rateCA'
    y_name = 'rate'
    xmin_p = np.percentile(data_thr10[x_name], 0.1)
def spectral(X, n):
    instance = SpectralClustering(n_clusters=n, affinity='linear')
    return instance.fit_predict(X)
예제 #45
0
def learn_mix_model_beta(category, K=4, kappa=5):

    with open(Dict['Dictionary'], 'rb') as fh:
        _, centers, _ = pickle.load(fh)

    sim_fname = os.path.join(Feat['cache_dir'], 'simmat',
                             'simmat_mthrh045_{}.pickle'.format(category))
    feat_fname = os.path.join(Feat['cache_dir'],
                              'feat_{}_train.pickle'.format(category))
    savename = os.path.join(
        root_dir, 'mix_model',
        'mmodel_{}_K{}_notrain_beta.pickle'.format(category, K))

    # Spectral clustering based on the similarity matrix
    with open(sim_fname, 'rb') as fh:
        mat_dis1, _ = pickle.load(fh)

    mat_dis = mat_dis1
    N = mat_dis.shape[0]
    print('total number of instances for obj {}: {}'.format(category, N))

    mat_full = mat_dis + mat_dis.T - np.ones((N, N))
    np.fill_diagonal(mat_full, 0)

    W_mat = 1. - mat_full
    print('W_mat stats: {}, {}'.format(np.mean(W_mat), np.std(W_mat)))

    K1 = 2
    cls_solver = SpectralClustering(n_clusters=K1,
                                    affinity='precomputed',
                                    random_state=666)
    lb = cls_solver.fit_predict(W_mat)

    K2 = 2
    idx2 = []
    W_mat2 = []
    lb2 = []
    for k in range(K1):
        idx2.append(np.where(lb == k)[0])
        W_mat2.append(W_mat[np.ix_(idx2[k], idx2[k])])
        print('W_mat_i stats: {}, {}'.format(np.mean(W_mat2[k]),
                                             np.std(W_mat2[k])))

        cls_solver = SpectralClustering(n_clusters=K2,
                                        affinity='precomputed',
                                        random_state=666)
        lb2.append(cls_solver.fit_predict(W_mat2[k]))

    rst_lbs1 = np.ones(len(idx2[0])) * -1
    rst_lbs1[np.where(lb2[0] == 0)[0]] = 0
    rst_lbs1[np.where(lb2[0] == 1)[0]] = 1
    rst_lbs2 = np.ones(len(idx2[1])) * -1
    rst_lbs2[np.where(lb2[1] == 0)[0]] = 2
    rst_lbs2[np.where(lb2[1] == 1)[0]] = 3
    rst_lbs = np.ones(N) * -1
    rst_lbs[idx2[0]] = rst_lbs1
    rst_lbs[idx2[1]] = rst_lbs2
    rst_lbs = rst_lbs.astype('int')

    del (mat_dis)

    for kk in range(K):
        print('cluster {} has {} samples'.format(kk, np.sum(rst_lbs == kk)))

    # Load the feature vector and compute VC encoding
    with open(feat_fname, 'rb') as fh:
        layer_feature = pickle.load(fh)

    assert (N == len(layer_feature))

    r_set = [None for nn in range(N)]
    for nn in range(N):
        iheight, iwidth = layer_feature[nn].shape[0:2]
        lff = layer_feature[nn].reshape(-1, featDim)
        lff_norm = lff / np.sqrt(np.sum(lff**2, 1)).reshape(-1, 1)
        r_set[nn] = cdist(lff_norm, centers,
                          'cosine').reshape(iheight, iwidth, -1)

    # transfer from distance space to firing rate space, center crop
    layer_feature_fr = [None for nn in range(N)]
    for nn in range(N):
        hnn, wnn = r_set[nn].shape[0:2]
        if hnn > 14:
            marg = (hnn - 14) // 2
            r_set[nn] = r_set[nn][marg:marg + 14, :, :]
        elif wnn > 14:
            marg = (wnn - 14) // 2
            r_set[nn] = r_set[nn][:, marg:marg + 14, :]

        layer_feature_fr[nn] = np.exp(-kappa * r_set[nn])

    del (layer_feature)
    del (r_set)

    all_train = [[] for kk in range(K)]
    for nn in range(N):
        if nn % 100 == 0:
            print(nn, end=' ', flush=True)

        all_train[rst_lbs[nn]].append(layer_feature_fr[nn].ravel())

    print('')

    all_alphas = [None for kk in range(K)]
    all_betas = [None for kk in range(K)]
    all_N = [0 for kk in range(K)]
    for kk in range(K):
        data_kk = np.array(all_train[kk])
        all_alphas[kk] = np.zeros(data_kk.shape[1])
        all_betas[kk] = np.zeros(data_kk.shape[1])
        for dd in range(data_kk.shape[1]):
            all_alphas[kk][dd], all_betas[kk][dd], _, _ = beta.fit(data_kk[:,
                                                                           dd])

        all_N[kk] = data_kk.shape[0]

    assert (N == np.sum(all_N))
    all_priors = np.array(all_N) / N

    with open(savename, 'wb') as fh:
        pickle.dump([all_alphas, all_betas, all_priors], fh)
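A hedged sketch of how the pickled mixture above might be consumed: reload [all_alphas, all_betas, all_priors] and assign a flattened firing-rate encoding to its most likely component using per-dimension beta log-likelihoods. This assumes loc=0 and scale=1 for the fitted beta parameters (the fit above discards loc/scale); mixture_component and enc are hypothetical names.

import pickle
import numpy as np
from scipy.stats import beta


def mixture_component(enc, all_alphas, all_betas, all_priors, eps=1e-6):
    """Return the index of the most likely mixture component for one flattened encoding."""
    enc = np.clip(enc, eps, 1.0 - eps)  # keep values strictly inside the beta support
    scores = []
    for a, b, prior in zip(all_alphas, all_betas, all_priors):
        # sum of per-dimension log-likelihoods plus the component's log prior
        scores.append(np.sum(beta.logpdf(enc, a, b)) + np.log(prior))
    return int(np.argmax(scores))


with open(savename, 'rb') as fh:  # savename as defined in the function above
    all_alphas, all_betas, all_priors = pickle.load(fh)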
def do_work(cluster_test, cluster_cnt):

    # Pass a list of tuples and a counter that increments each time we go
    # through the loop. The tuples are the data to be used by k-means,
    # and the PCA-derived features for graphing. We use k-means to fit a
    # model to the data, then store the predicted values and the two-feature
    # PCA solution in the data frame.
    for counter, data in enumerate([(X1, X_pca1),
                                    (X2, X_pca2),
                                    (X3, X_pca3),
                                    (X4, X_pca4)]):
    
        # Put the features into ypred.
        ypred['pca_f1' + '_sample' + str(counter)] = data[1][:, 0]
        ypred['pca_f2' + '_sample' + str(counter)] = data[1][:, 1]
        
        # Generate cluster predictions and store them for clusters 2 to 4.
        for nclust in range(2, 5):
            pred = KMeans(n_clusters=nclust, random_state=42).fit_predict(data[0])
            ypred['clust' + str(nclust) + '_sample' + str(counter)] = pred
            
 
    # Get predicted clusters.
    if cluster_test == KMEANS:
        print( "In test - >",cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        full_pred = KMeans(n_clusters=cluster_cnt, random_state=42).fit_predict(X_norm)
        # Create a list of pairs, where each pair is the ground truth group
        # and the assigned cluster.
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        c = list(itertools.product(y, full_pred))
# Count how often each type of pair (a, b, c, or d) appears.
        #c = np.array(c, dtype=np.float16)
        RIcounts = [[x, c.count(x)] for x in set(c)]
        #print(clusters, " clusters, RIcounts - >",RIcounts)
        # Create the same counts but without the label, for easier math below.
        RIcounts_nolabel = [c.count(x) for x in set(c)]
        # Calculate the Rand Index.
        RIscore = (RIcounts_nolabel[3] + RIcounts_nolabel[2]) / np.sum(RIcounts_nolabel)
        output[RISCORE] = RIscore
        output[ARS] = (metrics.adjusted_rand_score(y, full_pred))
#        print(clusters, " clusters, adjustd Rand Score->", metrics.adjusted_rand_score(y, full_pred))
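        # (assumption) cross-check the manual pair-count Rand Index above against
        # sklearn's built-in implementation (metrics.rand_score, scikit-learn >= 0.24)
        print("RI cross-check ->", metrics.rand_score(y, full_pred))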
        output[METS1] = metrics.silhouette_score(X_norm, full_pred, metric='sqeuclidean')

    if (cluster_test == MEANSHFT):
        #output.clear()
       # output=[0]*8
        print( "In test - >",cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        output[RISCORE] = 0#RIscore
        output[ARS] = 0#(metrics.adjusted_rand_score(y, full_pred))
        
        bandwidth = estimate_bandwidth(X_norm, quantile=0.2, n_samples=500)
# Declare and fit the model.
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        full_pred = ms.fit(X_norm)
# Extract cluster assignments for each data point.
        labels = ms.labels_
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        print(pd.crosstab(y, labels))
# Coordinates of the cluster centers.
        cluster_centers = ms.cluster_centers_
        output[RISCORE] = 999#RIscore
 ####       output[ARS] = (metrics.adjusted_rand_score(y, full_pred))
        
        output[METS1] = metrics.silhouette_score(X_norm, labels, metric='sqeuclidean')
    # Extract cluster assignments for each data point.
        labels = ms.labels_
    
    if (cluster_test == SPECTRAL):
        print( "In test - >",cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        # Declare and fit the model.
        sc = SpectralClustering(n_clusters=cluster_cnt)
        sc.fit(X_norm)
        
        #Predicted clusters.
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        full_pred =sc.fit_predict(X_norm)
        print("spectral->",pd.crosstab(y,full_pred))
        # Create a list of pairs, where each pair is the ground truth group
        # and the assigned cluster.
        c = list(itertools.product(y, full_pred))
        
        # Count how often each type of pair (a, b, c, or d) appears.
        RIcounts = [[x, c.count(x)] for x in set(c)]
        print("Kssspectral  RI Count ->",RIcounts)
        
        output[ARS] = (metrics.adjusted_rand_score(y, full_pred))
        
        # Create the same counts but without the label, for easier math below.
        RIcounts_nolabel = [c.count(x) for x in set(c)]
        # Calculate the Rand Index.
        RIscore = (RIcounts_nolabel[3] + RIcounts_nolabel[2]) / np.sum(RIcounts_nolabel)
        output[RISCORE] = RIscore
#        print("spectral Menas RIscore ->", RIscore)
        output[METS1] = metrics.silhouette_score(X_norm, full_pred, metric='sqeuclidean')

    if (cluster_test == AFFINITY):
        print( "In test - >",cluster_test)
        output[LABEL] = type_lbls[cluster_test]
        output[CLUSTERS] = cluster_cnt
        
        # Compute Affinity Propagation
        input_x = np.array(X_norm)#  X_norm.values
        af = AffinityPropagation().fit(input_x)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        print("labels = ",labels)
        n_clusters_ = len(cluster_centers_indices)
        y = df.iloc[:, 13]
        y = np.where(y > 0, 0, 1)
        labels_true = y
        print('Estimated number of clusters: %d' % n_clusters_)
        print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
        print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
        print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
        print("Adjusted Rand Index: %0.3f"  \
              % metrics.adjusted_rand_score(labels_true, labels))
        print("Adjusted Mutual Information: %0.3f" \
              % metrics.adjusted_mutual_info_score(labels_true, labels))
 #       print("Silhouette Coefficient: %0.3f" \
 #             % metrics.silhouette_score(X_norm, labels, metric='sqeuclidean'))
        
        output[RISCORE] = 0#RIscore
#        output[ARS] = (metrics.adjusted_rand_score(y, full_pred))
        output[ARS] = metrics.adjusted_rand_score(labels_true, labels)#0#(metrics.adjusted_rand_score(y, full_pred))
        output[METS1] = metrics.silhouette_score(X_norm, labels, metric='sqeuclidean')
예제 #47
0
        visualizer_inter.show()
        st.pyplot()
    except:
        st.write("Fill all parameters.")

########################################
# Spectral Clustering
########################################
if ML_option == "Spectral Clustering":
    try:
        # Spectral parameters
        Nk = st.number_input("Number of clusters: ", min_value=1, step=1)
        SpecClus = SpectralClustering(n_clusters=Nk,
                                      affinity='nearest_neighbors',
                                      assign_labels='kmeans')
        pred = SpecClus.fit_predict(data_feature)

        st.subheader("Classification Report")
        st.text(classification_report(data_target, pred))

        #Confusion matrix
        plot_confusion_matrix(data_target, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()

        # Elbow Method
        visualizer = KElbowVisualizer(SpecClus, k=(1, 10))
        visualizer.fit(data_feature)
        visualizer.show()
        st.pyplot()
예제 #48
0
    data1 = np.vstack((np.cos(t), np.sin(t))).T
    data2 = np.vstack((2*np.cos(t), 2*np.sin(t))).T
    data3 = np.vstack((3*np.cos(t), 3*np.sin(t))).T
    data = np.vstack((data1, data2, data3))

    n_clusters = 3
    m = euclidean_distances(data, squared=True)

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle('Spectral Clustering', fontsize=16)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))
    for i, s in enumerate(np.logspace(-2, 0, 6)):
        print(s)
        af = np.exp(-m ** 2 / (s ** 2)) + 1e-6
        model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', assign_labels='kmeans', random_state=1)
        y_hat = model.fit_predict(af)
        plt.subplot(2, 3, i+1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0], data[cur, 1], s=40, c=clr, edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(b=True, ls=':', color='#808080')
        plt.title(r'$\sigma$ = %.2f' % s, fontsize=13)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
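Both this snippet and the near-identical one under 예제 #58 below call an expand() helper that is not shown; a minimal sketch of a plausible implementation (an assumption: it simply pads the axis limits by a fraction of their span):

def expand(vmin, vmax, ratio=0.05):
    # hypothetical helper: widen [vmin, vmax] by `ratio` of the span on each side
    delta = (vmax - vmin) * ratio
    return vmin - delta, vmax + delta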
예제 #49
0
    players = {}
    data = []
    names = []
    data_file = open("kda_200.txt", "r")

    # Build data from file
    for line in data_file:
        fields = line.split(",")
        data.append([float(fields[1]), float(fields[3]), float(fields[4]), float(fields[4])])
        names.append(fields[0])

    # Create and fit model
    clus = SpectralClustering(n_clusters=5, eigen_solver='arpack',
                              affinity="nearest_neighbors")
    labels = clus.fit_predict(data)

    # Sort the fitted data into 5 boxes, one for each role
    boxes = [[],[],[],[],[]]
    for x in range(len(data)):
        pred = labels[x]
        name = names[x]
        # names like "Amazing (Maurice Stuckenschneider)" are too long, cut at first space
        if " " in name:
            name = name[0:name.find(" ")+1]
        
        boxes[pred].append(name.ljust(10))

    # Get size of largest cluster so you can pad the others
    sizes = [len(boxes[0]), len(boxes[1]), len(boxes[2]), len(boxes[3]), len(boxes[4])]
    biggest = max(sizes)
예제 #50
0
from sklearn import datasets
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn import cluster
import numpy as np

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names
print(target_names)

A = np.array([[0, 1, 1, 0, 0, 0, 0, 0, 1, 1], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
              [1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
              [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
              [1, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

sc = SpectralClustering(3,
                        affinity='precomputed',
                        n_init=100,
                        assign_labels='discretize')
sc.fit_predict(A)
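fit_predict returns the cluster labels, but the snippet discards them; a short follow-up showing how to inspect the partition of the 10-node toy graph:

labels = sc.labels_
print("cluster assignment per node:", labels)
# label ids are arbitrary; densely connected blocks of A should share a label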
예제 #51
0
                marker=marker[i],
                label='%s' % i)
plt.legend()
plt.show()

## Perform spectral clustering
sc = SpectralClustering(n_clusters=nClass,
                        n_init=10,
                        gamma=0.1,
                        affinity='rbf',
                        n_neighbors=3,
                        assign_labels='kmeans',
                        degree=3,
                        coef0=1,
                        kernel_params=None)
ypred = sc.fit_predict(train_x)
nmi_sc = metrics.adjusted_mutual_info_score(train_y, ypred)
ari_sc = metrics.adjusted_rand_score(train_y, ypred)
print >> sys.stderr, ('NMI for spectral clustering: %.2f' % (nmi_sc))
print >> sys.stderr, ('ARI for spectral clustering: %.2f' % (ari_sc))

## Perform KMeans
km = KMeans(n_clusters=nClass, init='k-means++', n_init=10)
ypred = km.fit_predict(train_x)
nmi_km = metrics.adjusted_mutual_info_score(train_y, ypred)
ari_km = metrics.adjusted_rand_score(train_y, ypred)
print >> sys.stderr, ('NMI for Kmeans: %.2f' % (nmi_km))
print >> sys.stderr, ('ARI for Kmeans: %.2f' % (ari_km))

train_set = train_x, train_y
dataset = [train_set, train_set, train_set]
def show_clustered_dataset(X, Y):
    fig, ax = plt.subplots(1, 1, figsize=(30, 25))

    ax.grid()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')

    for i in range(nb_samples):
        if Y[i] == 0:
            ax.scatter(X[i, 0], X[i, 1], marker='o', color='r')
        else:
            ax.scatter(X[i, 0], X[i, 1], marker='^', color='b')

    plt.show()


if __name__ == '__main__':
    warnings.simplefilter("ignore")

    # Create dataset
    X, Y = make_moons(n_samples=nb_samples, noise=0.05)

    # Show dataset
    show_dataset(X, Y)

    # Create and train Spectral Clustering
    sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors')
    Y = sc.fit_predict(X)

    # Show clustered dataset
    show_clustered_dataset(X, Y)
예제 #53
0
def cluster_faces(name, img_list = 'all-scores-faces-list-new'):
    root = root_all + 'face_recognition/'+ '@'.join(name.split('-'))
    cnn_root = root_all + 'face_recognition_CNN/'+name + '/'

    f = open(cnn_root + 'waldo_normalized_combined.cPickle','r')
    combined_matrix = cPickle.load(f)
    f.close()

    diag = np.diag(combined_matrix)
    diag = diag[:, np.newaxis]
    normalize_matrix = np.dot(diag, np.transpose(diag))
    normalize_matrix = np.sqrt(normalize_matrix)
    affinity_matrix = np.divide(combined_matrix, normalize_matrix)
    min_ = np.min(affinity_matrix); max_ = np.max(affinity_matrix)
    affinity_matrix =  (affinity_matrix - min_) / (max_ - min_)


    f = SpectralClustering(affinity='precomputed', n_clusters=min(8, affinity_matrix.shape[0] - 1), eigen_solver = 'arpack', n_neighbors=min(5, affinity_matrix.shape[0]))
    a = f.fit_predict(affinity_matrix)

    groups = {}
    temp = zip(a, xrange(len(a)))
    for i in temp:
        if i[0] not in groups:
            groups[i[0]] = [i[1]]
        else:
            groups[i[0]].append(i[1])
    unique_person_id = []
    for kk in groups:
        min_similarity = np.Inf
        max_similarity = -np.Inf
        mean_similarity = 0
        this_group_ids = groups[kk]
        for j in xrange(len(this_group_ids)):
            for i in xrange(j+1, len(this_group_ids)):
                temp = combined_matrix[this_group_ids[i],this_group_ids[j]]
                if temp < min_similarity:
                    min_similarity = temp
                if temp > max_similarity:
                    max_similarity = temp
                mean_similarity += temp
        mean_similarity /= max(1, len(this_group_ids)*(len(this_group_ids) - 1) / 2)
        print len(this_group_ids), mean_similarity, max_similarity, min_similarity
        print mean_similarity
        if mean_similarity > 0.4 and len(this_group_ids) > 1:
            unique_person_id.append(kk)
    important_person = []
    for i in unique_person_id:
        important_person.append([i, len(groups[i])])
    important_person.sort(key = lambda x:x[1], reverse=True)
    in_path = root + '-dir/' + img_list
    imgs_list = []
    with open(in_path, 'r') as data:
        for line in data:
            line = line[:-1]
            imgs_list.append(line.split('/')[-1])

    temp = zip(a, imgs_list)
    face_groups = {}
    for i in temp:
        if i[0] not in face_groups:
            face_groups[i[0]] = [i[1]]
        else:
            face_groups[i[0]].append(i[1])

    create_face_group_html(name, face_groups, important_person)

    f = open(cnn_root + 'waldo_group_combined.cPickle','w')
    cPickle.dump([face_groups, important_person], f)
    f.close()
def silhouette(X, n_clusters, algorithm, monthNo):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    X = np.array(X)
    if algorithm == KMeans:
        clusterer = algorithm(n_clusters=n_clusters, random_state=10)
    elif algorithm == AgglomerativeClustering:
        clusterer = algorithm(n_clusters=n_clusters, linkage='ward')
    elif algorithm == SpectralClustering:
        clusterer = SpectralClustering(n_clusters=n_clusters)
    elif algorithm == AffinityPropagation:
        clusterer = AffinityPropagation(preference=-5.0, damping=0.95)
    elif algorithm == MeanShift:
        clusterer = MeanShift(0.175, cluster_all=False)
    cluster_labels = clusterer.fit_predict(X)

    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For algorithm =", algorithm, "   n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    X = np.array(X)
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    ax2.scatter(X[:, 0],
                X[:, 1],
                marker='.',
                s=30,
                lw=0,
                alpha=0.7,
                c=colors,
                edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0],
                centers[:, 1],
                marker='o',
                c="white",
                alpha=1,
                s=200,
                edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter(c[0],
                    c[1],
                    marker='$%d$' % i,
                    alpha=1,
                    s=50,
                    edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    alg = ''
    if algorithm == KMeans:
        alg = ("Silhouette analysis for Preprocessing = real, algorithm = KMeans " +
               str(n_clusters) + ", month = " + str(monthNo) +
               ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/Kmean/real' + '_Kmeans' + str(n_clusters) + '_m' +
                    str(monthNo))
    elif algorithm == AgglomerativeClustering:
        alg = ("Silhouette analysis for Preprocessing = real, algorithm = AgglomerativeClustering " +
               str(n_clusters) + ", month = " + str(monthNo) +
               ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/AgglomerativeClustering/real' + '_Agg' +
                    str(n_clusters) + '_m' + str(monthNo))
    elif algorithm == AffinityPropagation:
        alg = ("Silhouette analysis for Preprocessing = real, algorithm = AffinityPropagation, month = " +
               str(monthNo) + ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/AffinityPropagation/real' + '_Aff_m' +
                    str(monthNo))
    elif algorithm == MeanShift:
        alg = ("Silhouette analysis for Preprocessing = real, algorithm = MeanShift, month = " +
               str(monthNo) + ", average silhouette_score = " + str(silhouette_avg))
        plt.suptitle(alg, fontsize=14, fontweight='bold')
        plt.savefig('result/MeanShift/real' + '_MeanShift_m' + str(monthNo))
예제 #55
0
        writer.writerow(rows[i] + [labels[i]])

# <p style="font-family:courier;">5. We apply spectral clustering with 66 clusters</p>

# In[5]:
"""
spectral = SpectralClustering(n_clusters = 66, eigen_solver = 'arpack', 
                              affinity='nearest_neighbors', n_neighbors = 10, 
                              kernel_params = {'radius':0.095, 'metric':'euclidean','mode':'distance'}, 
                              n_init = 20)
"""
spectral = SpectralClustering(n_clusters=66,
                              eigen_solver='arpack',
                              affinity='nearest_neighbors',
                              gamma=0.095)
labels = spectral.fit_predict(X)
unique_labels = set(labels)

# <p style="font-family:courier;">6. We plot the results of spectral clustering</p>

# In[6]:

#Plot
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k != -1:
        class_member_mask = (labels == k)
        xy = X[labels == k]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markersize=6)
plt.title('Estimated number of clusters: %d' % len(unique_labels))
plt.show()
onehotencoder = OneHotEncoder(categorical_features=[17])
x = onehotencoder.fit_transform(x).toarray()
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
x = min_max_scaler.fit_transform(x)

print(x)

from sklearn.cluster import SpectralClustering

spec = SpectralClustering(n_clusters=9,
                          assign_labels="discretize",
                          random_state=0)
spec_predict = spec.fit_predict(x)

print(spec_predict)
from sklearn import metrics

from sklearn.metrics import pairwise_distances
print("Silhouette Score: %0.3f" %
      metrics.silhouette_score(x, spec_predict, metric='euclidean'))
print("Calinski-Harabaz Index: %0.3f" %
      metrics.calinski_harabaz_score(x, spec_predict))
"""



# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
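The elbow-method code announced by the comment above is missing from the snippet; a minimal sketch of the usual pattern, assuming x is the scaled feature matrix prepared earlier:

import matplotlib.pyplot as plt

wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=0)
    km.fit(x)
    wcss.append(km.inertia_)  # within-cluster sum of squares

plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS (inertia)')
plt.title('Elbow Method')
plt.show()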
    def spectral_plot(x,
                      y,
                      ncl=3,
                      alabels="kmeans",
                      title="Spectral Clustering"):
        scluster = SpectralClustering(n_clusters=ncl,
                                      affinity='rbf',
                                      assign_labels=alabels,
                                      random_state=0)
        pred_y = scluster.fit_predict(y)
        '''
        indx_low = 3
        indx_high = ul_-1
        tru_list = []
        colbox = scluster.labels_
        while indx_high < cluster_data_:
            for indx in range(indx_low, indx_high):
                if colbox[indx] != colbox[indx+1]:
                    if colbox[indx] == colbox[indx-1] and colbox[indx]==colbox[indx-2]: 
                        tru_list.append(indx+1)
            indx_low += ul_
            indx_high += ul_
        fig = plt.figure(figsize=(20,8))
        plt.subplot(121)
        plt.scatter(y[:,0], y[:,1], c=scluster.labels_, cmap='coolwarm', label='Spectral, '+ str(alabels)+" "+str(ncl)+' clusters')
        plt.legend(bbox_to_anchor=(0, 1.06), loc='upper left', ncol=1)
        plt.xlabel("VAE1", fontsize=10)
        plt.ylabel("VAE2", fontsize=10)
        plt.rc('xtick', labelsize=10)
        plt.rc('ytick', labelsize=10)
        plt.subplot(122)
        plt.scatter(x[:,0], x[:,1], c=scluster.labels_, cmap='coolwarm', label='Spectral, '+ str(alabels)+" "+str(ncl)+' clusters')
        plt.legend(bbox_to_anchor=(0, 1.06), loc='upper left', ncol=1)
        plt.ylabel('Temperature('+u"\u2103"+")", fontsize = 10)
        plt.xlabel('Composition(%)', fontsize=10)
        plt.rc('xtick', labelsize=10)
        plt.rc('ytick', labelsize=10)
        for i, val in enumerate(tru_list):
            plt.annotate(str(int(x[:,0][val]))+'%', xy=(x[:,0][val], x[:,1][val]), xytext=(x[:,0][val], 1+x[:,1][val]), ha='center', arrowprops=dict(arrowstyle="->"),)
        fig.suptitle(title, fontsize = 16)
        plt.savefig(os.getcwd() + '/plots/' + str(n_latent)+"d_"+title[:8]+"_"+str(ncl)+"_"+str(alabels) +'.png')
        plt.close()
        '''

        label_data = [float(run_i)]
        relabel_data = [float(run_i)]
        relabelled_list = []
        uni_vals = []
        for k in scluster.labels_:
            if k not in uni_vals:
                uni_vals.append(k)
        for k in scluster.labels_:
            relabelled_list.append(uni_vals.index(k))
        label_data.extend(scluster.labels_)
        relabel_data.extend(relabelled_list)
        if ncl == 2:
            if alabels == "kmeans":
                two_spectral.append(label_data)
                two_spectral.append(relabel_data)
            if alabels == "discretize":
                twod_spectral.append(label_data)
                twod_spectral.append(relabel_data)
        if ncl == 3:
            if alabels == "kmeans":
                three_spectral.append(label_data)
                three_spectral.append(relabel_data)
            if alabels == "discretize":
                threed_spectral.append(label_data)
                threed_spectral.append(relabel_data)
        if ncl == 4:
            if alabels == "kmeans":
                four_spectral.append(label_data)
                four_spectral.append(relabel_data)
            if alabels == "discretize":
                fourd_spectral.append(label_data)
                fourd_spectral.append(relabel_data)
예제 #58
0
    data2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T
    data3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T
    data = np.vstack((data1, data2, data3))

    n_clusters = 3
    m = euclidean_distances(data, data, squared=True)  # pairwise squared distances between points in the data

    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle('Spectral Clustering', fontsize=16)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))
    for i, s in enumerate(np.logspace(-2, 0, 6)):
        print(s)
        af = np.exp(-m ** 2 / (s ** 2)) + 1e-6
        model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', assign_labels='kmeans',
                                   random_state=1)  # 3 clusters
        y_hat = model.fit_predict(af)
        plt.subplot(2, 3, i + 1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0], data[cur, 1], s=40, c=clr, edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(b=True, ls=':', color='#808080')
        plt.title(r'$\sigma$ = %.2f' % s, fontsize=13)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
예제 #59
0
for mc in range(MC1):
    
    print("\n\n\033[91mIteration",mc+1,"\033[0m")
    
    trainX,testX,trainy,testy = train_test_split(X,y,train_size=0.8,random_state=np.random.randint(1000),stratify=y)
    NtrainX = normalize(trainX)
    NtestX = normalize(testX)
    train = pd.concat([trainX,trainy],axis=1)
    test = pd.concat([testX,testy],axis=1)
    Ntrain = pd.concat([pd.DataFrame(NtrainX,index=trainX.index),trainy],axis=1)
    Ntest = pd.concat([pd.DataFrame(NtestX,index=testX.index),testy],axis=1)
    
    sc = SpectralClustering(n_clusters=2, gamma=1.0, affinity="rbf",random_state=np.random.randint(1000)).fit(NtrainX)
    cls = sc.labels_
    clste = sc.fit_predict(NtestX)
    temp = trainy.copy()
    temp['cluster'] = cls
    
    # B
    c = []
    for cluster in range(2):
        # assumption: trainy has a single target column; take it for the samples in this cluster
        classlist = temp.loc[temp['cluster'] == cluster].iloc[:, 0].tolist()
        c.append(max(classlist,key=classlist.count))    
    print("Clusters:",c)
    
    bivpredtr = []
    bivdftr = []
    tempdf = decfunc(NtrainX,cls)
    j = 0
    for i in cls:
예제 #60
0
    gt_label = []
    for tv in video_index:
        gt_label.append(video_2_action[tv])
    #     non_over_label = []
    #     for i in range(label_pred.shape[0]):
    #         non_over_label.append(int(label_pred[i]))
    gt_label = np.array(gt_label)
    gt_label = np.squeeze(gt_label)
    # print(gt_label.shape)
    from sklearn import metrics
    print("Adjusted rand score %.4f" % metrics.adjusted_rand_score(gt_label, label_pred))
    print("NMI %.4f" % metrics.normalized_mutual_info_score(gt_label, label_pred))

    return cluser_2_action, soft_cluster_2_action


num_subset_class = 100
subset_class, subset_index, subset_atten_fea = get_subset(num_subset_class, action_2_video, training_index, att_fea_v1)

affinity_matrix, sorted_video_index, sorted_video_fea = get_affinity(subset_index, subset_atten_fea, action_2_video)
subset_atten_fea = sorted_video_fea
subset_index = sorted_video_index

# cluster
num_of_cluster = num_subset_class
estimator = SpectralClustering(n_clusters=num_of_cluster, random_state=0, affinity='precomputed')
estimator.fit_predict(affinity_matrix)
label_pred = estimator.labels_
cluser_2_action, soft_cluster_2_action = get_cluster_performance(num_of_cluster, label_pred, subset_index,
                                                                 action_2_video, video_2_action)