def __hieclu(self):
    # use Hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=self.k)
    ac.fit(self.data_matrix)
    result = ac.fit_predict(self.data_matrix)
    return result
def hieclu(data_matrix, k):
    # use Hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=k)
    ac.fit(data_matrix)
    result = ac.fit_predict(data_matrix)
    return result
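A minimal usage sketch for the standalone hieclu helper above; the data and the value of k are made up, and it assumes Ward is importable from sklearn.cluster (old scikit-learn API; newer releases replace it with AgglomerativeClustering using linkage='ward').

# Hypothetical usage of hieclu (illustrative sketch, not from the original source)
import numpy as np
from sklearn.cluster import Ward  # old scikit-learn API

data_matrix = np.random.RandomState(0).randn(20, 5)  # 20 samples, 5 features
labels = hieclu(data_matrix, 3)                      # one cluster label per sample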
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array([
        (0.014, 0.120), (0.014, 0.099), (0.014, 0.097),
        (0.017, 0.153), (0.017, 0.153), (0.018, 0.153),
        (0.018, 0.153), (0.018, 0.153), (0.018, 0.153),
        (0.018, 0.153), (0.018, 0.153), (0.018, 0.153),
        (0.018, 0.152), (0.018, 0.149), (0.018, 0.144),
    ])
    nn = NearestNeighbors(n_neighbors=10).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an IndexError
    ward.fit(X)
def compute_clusters(dataset, features_vector):
    """ Apply clustering method """
    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    # Run clustering method
    print "Performing clustering with method ", cmd_options.clust_method.upper()
    print

    if cmd_options.clust_method == "hclust":
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)
        return ward

    if cmd_options.clust_method == "kmeans":
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000,
                    verbose=1)
        km.fit(features_vector)
        return km
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import kneighbors_graph

    X = np.array([
        (.014, .120), (.014, .099), (.014, .097),
        (.017, .153), (.017, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .152), (.018, .149), (.018, .144),
    ])
    connectivity = kneighbors_graph(X, 10)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an IndexError
    ward.fit(X)
def __hieclu(self):
    # use Hierarchical clustering
    print 'using hierarchical clustering......'
    ac = Ward(n_clusters=self.k)
    ac.fit(self.data_matrix)
    result = ac.fit_predict(self.data_matrix)
    return result
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array([
        (.014, .120), (.014, .099), (.014, .097),
        (.017, .153), (.017, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .152), (.018, .149), (.018, .144),
    ])
    nn = NearestNeighbors(n_neighbors=10, warn_on_equidistant=False).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an IndexError
    ward.fit(X)
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    np.random.seed(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = np.random.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
def test_connectivity_fixing_non_lil():
    """
    Check non regression of a bug if a non item assignable connectivity is
    provided with more than one component.
    """
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = Ward(connectivity=c)
    w.fit(x)
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    clustering = Ward(n_clusters=10, connectivity=connectivity,
                      memory=mkdtemp())
    clustering.fit(X)
    labels = clustering.labels_
    assert_true(np.size(np.unique(labels)) == 10)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10, connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
def spectral_cluster(data, n_clusters, method='sl'):
    # Get the graph Laplacian matrix
    if method == 'NJW':
        lap_matrix = get_lap_matrix_njw(data, 0.1)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
    elif method == 'self-tuning':
        lap_matrix = get_lap_matrix_self_tuning(data)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
    else:
        lap_matrix = get_lap_matrix_sl(data, 0.1)
        eigenvalues, eigenvectors = np.linalg.eig(lap_matrix)
        idx = eigenvalues.argsort()
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
    # print(eigenvalues)

    # Take the first n_clusters eigenvectors
    x_matrix = eigenvectors[:, 0:n_clusters]
    # Row-normalize the eigenvector matrix
    y_matrix = normal_eigen(x_matrix)

    # Call the hand-written k_means function
    """
    k_dist_dic, k_centers_dic, cluster_group = kmeans.k_means(y_matrix, n_clusters)
    mat_plot_cluster_sample(data, cluster_group, method)
    """

    # Call the hand-written bi_k_means function
    """
    center_list, cluster_assign = bikmeans.exe_bi_k_means(y_matrix, n_clusters)
    labels = cluster_assign[:, 0]
    mat_plot_cluster_sample(data, labels, method)

    # Call sklearn's KMeans; it works much better than the hand-written one
    k_means = KMeans(n_clusters)
    k_means.fit(y_matrix)
    # k_centers = k_means.cluster_centers_
    # mat_plot_cluster_sample(data, k_means.labels_, method)
    """

    # Cluster with sklearn's hierarchical (Ward) clustering
    hie_cluster = Ward(n_clusters)
    hie_cluster.fit(y_matrix)
    mat_plot_cluster_sample(data, hie_cluster.labels_, method)
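The spectral_cluster routine above calls a helper normal_eigen that is not shown. Below is a plausible sketch of that step, assuming it performs the standard Ng-Jordan-Weiss row normalization (each row of the eigenvector matrix scaled to unit length); this is a guess at the missing helper, not the original implementation.

# Hypothetical normal_eigen: row-normalize the eigenvector matrix (NJW-style).
import numpy as np

def normal_eigen(x_matrix):
    norms = np.sqrt((x_matrix ** 2).sum(axis=1, keepdims=True))
    norms[norms == 0] = 1.0  # guard against all-zero rows
    return x_matrix / norms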
def ward(self, X, n_clusters, plot=True):
    k_means = Ward(n_clusters=n_clusters, copy=False,
                   compute_full_tree=True, memory="cache")
    k_means.fit(X)
    labels = k_means.labels_
    pl.close('all')
    pl.figure(1)
    pl.clf()
    if plot:
        colors = "rbgcmybgrcmybgrcmybgrcm" * 10
        X2d = RandomizedPCA(n_components=2).fit_transform(X)
        for i in xrange(len(X2d)):
            x = X2d[i]
            pl.plot(x[0], x[1], "o",
                    markerfacecolor=colors[labels[i]],
                    markeredgecolor=colors[labels[i]],
                    alpha=0.035)
        pl.show()
    return k_means.labels_
def cluster_ward(self, calpha=True):
    '''
    Cluster the positively predicted residues using the Ward method.
    Returns a dict mapping a cluster index to the residue numbers in that
    cluster, ordered from the largest cluster to the smallest.
    '''
    if calpha:
        data_atoms = self.positive_surface_residues.ca
    #else:
    #    data_atoms = self.positive_surface_residues.select('ca or sidechain').copy()
    if data_atoms.getCoords().shape[0] < 4:
        print self.pdbid, data_atoms.getCoords().shape
        return {}
    connectivity = kneighbors_graph(data_atoms.getCoords(), 5)
    ward = Ward(n_clusters=self.WARD_N_CLUSTERS, connectivity=connectivity)
    ward.fit(data_atoms.getCoords())
    resnums = data_atoms.getResnums()
    reslabels = ward.labels_
    clusters = sorted([resnums[reslabels == i] for i in set(reslabels)],
                      key=len, reverse=True)
    return dict(enumerate(clusters))
def compute_clusters(dataset, features_vector):
    """ Apply clustering method """
    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    # Run clustering method
    print "Performing clustering with method ", cmd_options.clust_method.upper()
    print

    if cmd_options.clust_method == "hclust":
        result = features_vector.toarray()
        ward = Ward(n_clusters=true_k)
        ward.fit(result)
        return ward

    if cmd_options.clust_method == "kmeans":
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000,
                    verbose=1)
        km.fit(features_vector)
        return km
print("Homogeneity k-means: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness k-means: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure k-means: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Silhouette Coefficient k-means: %0.3f" % metrics.silhouette_score(clustering, km.labels_, sample_size = 8000)) # DBSCAN # Structured hierarchical clustering db = DBSCAN() db.fit(clustering) print 'DBSCAN clusters created..' print("Homogeneity DBSCAN: %0.3f" % metrics.homogeneity_score(labels, db.labels_)) print("Completeness DBSCAN: %0.3f" % metrics.completeness_score(labels, db.labels_)) print("V-measure DBSCAN: %0.3f" % metrics.v_measure_score(labels, db.labels_)) print("Silhouette Coefficient DBSCAN: %0.3f" % metrics.silhouette_score(clustering, db.labels_, sample_size = 5000)) # Structured hierarchical clustering ward = Ward(n_clusters = 9) ward.fit(clustering) print 'Hierarchical clusters created..' print("Homogeneity hierarchical: %0.3f" % metrics.homogeneity_score(labels, ward.labels_)) print("Completeness hierarchical: %0.3f" % metrics.completeness_score(labels, ward.labels_)) print("V-measure hierarchical: %0.3f" % metrics.v_measure_score(labels, ward.labels_)) print("Silhouette Coefficient hierarchical: %0.3f" % metrics.silhouette_score(clustering, ward.labels_, sample_size = 5000))
    # (fragment: these first two lines appear to be the tail of a loop over
    # the columns i of raw_train)
    print i
    train = pd.concat([train, pd.get_dummies(raw_train[i])], axis=1)

freq = train.groupby('Report ID').sum()
freq = freq.drop('Has Combined Queries', 1)

# Train Model #############################
num_cluster = 12
kmean = KMeans(n_clusters=num_cluster, max_iter=400, verbose=0, n_jobs=2,
               n_init=20, tol=1e-6)
model_kmean = kmean.fit(freq)

ward = Ward(n_clusters=num_cluster)
model_ward = ward.fit(freq)

from sklearn.neighbors import kneighbors_graph
connectivity = kneighbors_graph(freq, n_neighbors=4)
#ward = Ward(n_clusters=num_cluster, connectivity=connectivity)
#model_ward = ward.fit(freq)

# Visualization #####################################################
import mpl_toolkits.mplot3d.axes3d as p3
import pylab as pl
from sklearn.datasets.samples_generator import make_friedman3


def plot(model, data, name):
def encode(self, interm_rep, neighborhood_size=26, clust_ratio=10,
           encoding='geometrical', similarity_measure='pearson',
           threshold=0.3, n_jobs=1, **kwds):
    """
    Parameters
    ----------
    interm_rep: IntermRep
        IntermRep object containing the arr_xyz and arr_voxel matrixes.
    neighborhood_size: int
        Number of neighbors each voxel will be connected to.
    clust_ratio: int
        The number of clusters will be equal to n/clust_ratio, where n is
        the number of voxels.
    encoding: string
        Type of encoding. 'geometrical' and 'functional' are allowed.
    similarity_measure: string
        Similarity measure used to compare the representative value of each
        parcel (cluster). 'pearson' or the measures available in
        scikit-learn are allowed.
    threshold: float
        Threshold applied to the similarity values in order to define the
        edges in the graph.

    Returns
    -------
    g: Graph
        Networkx graph representing the graph encoding of the data.
    """
    # Computing the connectivity matrix: each voxel is connected to
    # "neighborhood_size" neighbors.
    conn = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
    # conn_n = kneighbors_graph(interm_rep.arr_xyz, n_neighbors=neighborhood_size)
    # conn_r = radius_neighbors_graph(interm_rep.arr_xyz, radius=10)
    # conn = conn_n * conn_r

    # Hierarchical clustering algorithm. The number of clusters is defined
    # according to the parameter "clust_ratio".
    ward = Ward(n_clusters=len(interm_rep.arr_xyz) / clust_ratio,
                connectivity=conn)
    # ward = Ward(n_clusters=60, connectivity=conn)

    # Type of encoding: geometrical (only xyz data is used) or
    # functional (voxel time series is used).
    if encoding == 'geometrical':
        ward.fit(interm_rep.arr_xyz)
    elif encoding == 'functional':
        ward.fit(interm_rep.arr_voxels)
    labels = ward.labels_

    # Plotting the voxels with the cluster labels.
    # pp.plot_clustering_intermediate_representation(interm_rep, labels*10)

    # Computing the unique cluster identifiers
    l_unique = np.unique(labels)

    mean_voxels = np.zeros((len(l_unique), interm_rep.arr_voxels.shape[1]))
    mean_xyz = np.zeros((len(l_unique), interm_rep.arr_xyz.shape[1]))

    cont = 0
    for i in l_unique:
        # Taking the positions corresponding to the same cluster.
        pos = np.where(labels == i)[0]
        # Taking data from these positions and computing the mean time series
        m_voxel = interm_rep.arr_voxels[pos].mean(0)
        # Taking the xyz from these positions and computing the mean value
        m_xyz = interm_rep.arr_xyz[pos].mean(0)
        mean_voxels[cont] = m_voxel
        mean_xyz[cont] = m_xyz
        cont += 1

    # Plotting the voxel time series for each cluster
    # pp.plot_interm_representation_time_series(ir.IntermRep(mean_voxels, mean_xyz))

    # The new intermediate representation is given by mean_voxels and mean_xyz.
    # Computing the similarity matrix and applying the threshold
    adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)), dtype=np.byte)
    for j in range(len(mean_voxels) - 1):
        for k in range(j + 1, len(mean_voxels)):
            if similarity_measure == 'pearson':
                aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
            else:
                aux = skpw.pairwise_kernel(mean_voxels[j], mean_voxels[k],
                                           metric=similarity_measure,
                                           n_jobs=n_jobs)
            if aux >= threshold:
                adj_mat[j, k] = 1
                adj_mat[k, j] = 1

    # Weighted encoding (for graph kernels that work with weighted graphs)
    # ------------------------------------
    # adj_mat = np.zeros((len(mean_voxels), len(mean_voxels)),
    #                    dtype=np.float)
    # for j in range(len(mean_voxels) - 1):
    #     for k in range(j + 1, len(mean_voxels)):
    #         if similarity_measure == 'pearson':
    #             aux = st.pearsonr(mean_voxels[j], mean_voxels[k])[0]
    #         else:
    #             aux = skpw.pairwise_kernel(mean_voxels[j], mean_voxels[k],
    #                                        metric=similarity_measure,
    #                                        n_jobs=n_jobs)
    #         # if aux >= threshold:
    #         #     adj_mat[j, k] = aux
    #         #     adj_mat[k, j] = aux
    #         adj_mat[j, k] = adj_mat[k, j] = aux
    # adj_mat = (adj_mat - np.mean(adj_mat)) / np.std(adj_mat)
    # adj_mat = (adj_mat - np.min(adj_mat)) / (np.max(adj_mat) - np.min(adj_mat))
    # adj_mat = np.where(adj_mat >= threshold, 1, 0)
    # ------------------------------------

    # Building the graph from the adjacency matrix
    g = nx.from_numpy_matrix(adj_mat)

    # Splitting the node degrees into some categories and using them as node labels.
    # num_lab = 5
    deg = g.degree()
    # for k in deg:
    #     deg[k] /= num_lab
    nx.set_node_attributes(g, 'node_label', deg)

    # Storing the mean time-series of each parcel as a node attribute
    ts_att = {}
    mv = mean_voxels.tolist()
    for pos in range(len(mv)):
        ts_att[pos] = mv[pos]
    nx.set_node_attributes(g, 'time_series', ts_att)

    # Saving the graphs for the CLFR subject (the one for which I have the
    # structural data)
    # if interm_rep.subj_name == 'CLFR':
    #     nx.write_gexf(g, 'graph_gephi_format.gexf')
    #     np.savetxt('CLFR_clusters_xyz.txt', mean_xyz, fmt='%1d', delimiter=' ')
    #     edges = np.array(np.where(adj_mat == 1)).T
    #     np.savetxt('CLFR_clusters_timeseries_cond%s.txt' % (interm_rep.cls),
    #                edges, fmt='%1d', delimiter=' ')

    # Plot Graphs
    # pp.plot_graph(mean_xyz, g)

    return g
def ward(X, n_clust):
    "Fit Ward hierarchical clustering on X with n_clust clusters."
    ward = Ward(n_clusters=n_clust)
    ward.fit(X)
    return ward
def HierachicalClustering(X, Expect_ext):
    from sklearn.cluster import Ward
    HC = Ward(n_clusters=Expect_ext)
    HC.fit(X)
    return HC.labels_
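For comparison, a hedged sketch of the same wrapper against the current scikit-learn API, where the Ward class has been removed and AgglomerativeClustering with linkage='ward' is the equivalent; this snippet is not part of the original source.

# Equivalent wrapper on the modern scikit-learn API (assumes a release where
# AgglomerativeClustering has replaced the Ward class).
def HierachicalClusteringModern(X, Expect_ext):
    from sklearn.cluster import AgglomerativeClustering
    HC = AgglomerativeClustering(n_clusters=Expect_ext, linkage='ward')
    HC.fit(X)
    return HC.labels_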
class VisualVocabulary:
    """ Creates a visual vocabulary and quantises visual features """

    def __init__(self, pathFile=None, flagVerbose=False):
        self.mbk = None
        self.ward = None
        # If a path file is provided...
        if pathFile != None:
            # ...read from disk
            self.loadFromDisk(pathFile)
        if flagVerbose == True:
            self.flagVerbose = 1
        else:
            self.flagVerbose = 0

    def readImageIdsFromTxtFile(self, pathTxtFile):
        """ Read the image IDs contained in a text file """
        print pathTxtFile
        if not os.path.exists(pathTxtFile):
            print 'File not found ' + pathTxtFile
            return []
        # Read the file containing the image IDs
        fileDataset = open(pathTxtFile, 'r')
        # Read lines from the text file, stripping the end of line character
        imageIds = [line.strip() for line in fileDataset]
        # Close file
        fileDataset.close()
        return imageIds

    def buildFromImageCollection(self, pathTxtFile, pathDirImages,
                                 fileImageExtension, vocabularySize=4096,
                                 maxNumImages=sys.maxint):
        # Read the image IDs
        imageIds = self.readImageIdsFromTxtFile(pathTxtFile)
        # If there are more images than the considered ones...
        if len(imageIds) > maxNumImages:
            imageIds = random.sample(imageIds, maxNumImages)
        # Extract the SURF descriptors from a collection of images and save
        # them in a dictionary
        surfExtractor = SurfExtractor(True, True)
        surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                                  fileImageExtension)
        # Create a numpy array from the descriptors
        descriptors = surfExtractor.getDescriptors()
        arr_descriptor = np.vstack(tuple(descriptors))
        # if self.flagRunOnServer == True:
        #     # K-means: The amount of clusters is specified with 'k' in the
        #     # sci-kit version in the GPI computation service
        #     self.mbk = MiniBatchKMeans(init='k-means++',
        #                                k=vocabularySize,
        #                                init_size=3 * vocabularySize,
        #                                max_no_improvement=10,
        #                                verbose=1)
        # else:
        # K-means: The amount of clusters is specified in 'n_clusters' in the
        # latest scikit-learn version
        self.mbk = MiniBatchKMeans(init='k-means++',
                                   n_clusters=vocabularySize,
                                   init_size=3 * vocabularySize,
                                   max_no_improvement=10,
                                   verbose=self.flagVerbose)
        self.mbk.fit(arr_descriptor)

    def buildFromImageCollectionWard(self, pathTxtFile, pathDirImages,
                                     fileImageExtension, vocabularySize,
                                     maxNumImages=sys.maxint):
        # vocabularySize could be 4096
        # Read the image IDs
        imageIds = self.readImageIdsFromTxtFile(pathTxtFile)
        # If there are more images than the considered ones...
        if len(imageIds) > maxNumImages:
            imageIds = random.sample(imageIds, maxNumImages)
        # Extract the SURF descriptors from a collection of images and save
        # them in a dictionary
        surfExtractor = SurfExtractor(True)
        surfExtractor.processCollectionFilesImage(imageIds, pathDirImages,
                                                  fileImageExtension)
        # Create a numpy array from the descriptors
        descriptors = surfExtractor.getDescriptors()
        arr_descriptor = np.vstack(tuple(descriptors))
        # self.mbk = MiniBatchKMeans(init='k-means++',
        #                            k=vocabularySize,
        #                            n_init=10,
        #                            max_no_improvement=10,
        #                            verbose=0)
        self.ward = Ward(n_clusters=vocabularySize)
        self.ward.fit(arr_descriptor)

    def loadFromDisk(self, pathFile):
        if not os.path.exists(pathFile):
            print "File not found " + pathFile
            return
        self.mbk = pickle.load(open(pathFile, "rb"))

    def saveToDisk(self, pathFile):
        # Save mini batch K-Means to disk using Pickle
        pickle.dump(self.mbk, open(pathFile, "wb"))

    def quantizeVector(self, descriptors):
        # if len(descriptors) < 128:
        #     descriptors
        # Vector quantization with the visual vocabulary
        quant = self.mbk.predict(descriptors)
        # Build histogram
        histogram = np.histogram(quant, bins=self.mbk.n_clusters)
        # histogram = np.histogram(quant, bins=self.mbk.k)
        return histogram
from sklearn.cluster import Ward

ward = Ward(n_clusters=15)
n_samples = np.logspace(.5, 3, 9)
n_features = np.logspace(1, 3.5, 7)
N_samples, N_features = np.meshgrid(n_samples, n_features)
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(n, p))
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0

ratio = scikits_time / scipy_time

pl.clf()
pl.imshow(np.log(ratio), aspect='auto', origin="lower")
pl.colorbar()
pl.contour(ratio, levels=[1, ], colors='k')
pl.yticks(range(len(n_features)), n_features.astype(np.int))
pl.ylabel('N features')
pl.xticks(range(len(n_samples)), n_samples.astype(np.int))
pl.xlabel('N samples')
from sklearn.cluster import Ward

ward = Ward(n_clusters=3)
n_samples = np.logspace(.5, 3, 9)
n_features = np.logspace(1, 3.5, 7)
N_samples, N_features = np.meshgrid(n_samples, n_features)
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(n, p))
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0

ratio = scikits_time / scipy_time

pl.figure("scikit-learn Ward's method benchmark results")
pl.imshow(np.log(ratio), aspect='auto', origin="lower")
pl.colorbar()
pl.contour(ratio, levels=[1, ], colors='k')
pl.yticks(range(len(n_features)), n_features.astype(np.int))
pl.ylabel('N features')
pl.xticks(range(len(n_samples)), n_samples.astype(np.int))
pl.xlabel('N samples')
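In the benchmarks above, hierarchy.ward refers to scipy.cluster.hierarchy.ward, which returns a linkage matrix rather than flat labels. A small sketch (not part of the original benchmark) of how the scipy result can be turned into cluster labels comparable to Ward.labels_:

# Illustrative only: flat labels from scipy's linkage matrix; n_clusters=3 is arbitrary.
import numpy as np
from scipy.cluster import hierarchy

X = np.random.normal(size=(100, 10))
Z = hierarchy.ward(X)                                      # linkage matrix
labels = hierarchy.fcluster(Z, t=3, criterion='maxclust')  # flat cluster labels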