def split_highest_sse_node(self):
    highest_sse_node = self._find_highest_sse_node(self.split_dataset_loader_gen())
    leaf_id = highest_sse_node.node_id
    node_data = None
    for batch_data in self.split_dataset_loader_gen():
        labels_np, _, node_id_label_map = self.leaf_prediction_np(batch_data)
        node_label_id = node_id_label_map[leaf_id]
        node_data_batch = batch_data.data.cpu().numpy()[labels_np == node_label_id]
        if node_data is None:
            node_data = node_data_batch
        else:
            node_data = np.concatenate([node_data, node_data_batch], 0)
    init_centers = k_means(node_data, 2, n_init=20)[0]
    new_left_leaf = ECTnode.new_leaf_node(self.next_free_node_id, self, init_centers[0, :])
    new_right_leaf = ECTnode.new_leaf_node(self.next_free_node_id + 1, self, init_centers[1, :])
    highest_sse_node.split_node(self.n_splits + 1, new_left_leaf, new_right_leaf)
    self.next_free_node_id += 2
    self.n_splits += 1
    self.add_module(f"node_{new_left_leaf.node_id}", new_left_leaf)
    self.add_module(f"node_{new_right_leaf.node_id}", new_right_leaf)
    self.leaf_nodes.remove(highest_sse_node)
    self.leaf_nodes.append(new_left_leaf)
    self.leaf_nodes.append(new_right_leaf)
    self._update_leaf_node_mappings()
    self.optimizer.add_param_group({'params': new_left_leaf.parameters()})
    self.optimizer.add_param_group({'params': new_right_leaf.parameters()})
    logger.info(f"new split, now we have {self.n_leaf_nodes} leaves")
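
# A minimal, framework-free sketch of the same "split the worst cluster" idea used above,
# outside the ECT/PyTorch machinery: find the cluster with the highest sum of squared
# errors and bisect it with 2-means. The function and variable names here are
# illustrative only and are not part of the project above.
import numpy as np
from sklearn.cluster import k_means  # public import path in current scikit-learn

def bisect_highest_sse(data, centers, labels):
    # SSE of each current cluster
    sse = [np.sum((data[labels == c] - centers[c]) ** 2) for c in range(len(centers))]
    worst = int(np.argmax(sse))
    members = data[labels == worst]
    # Replace the worst center by the two centers of a 2-means split of its members
    new_centers, _, _ = k_means(members, 2, n_init=20)
    return np.vstack([np.delete(centers, worst, axis=0), new_centers])

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    data = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 5])
    centers, labels, _ = k_means(data, 2, n_init=10)
    centers = bisect_highest_sse(data, centers, labels)
    print(centers.shape)  # (3, 2)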
def runSpectralEmbedding(self, X, n_components=2, n_clusters=2, k_means_=False):
    # Create distance matrix
    self.create_distance_matrix(X)
    # Run spectral embedding for n_components
    embedding = SpectralEmbedding(n_components=n_components, affinity='precomputed',
                                  random_state=42, n_jobs=-1).fit(self.adjacencyMatrix)
    # Alternative way
    # embedding_otherapp = spectral_embedding(self.adjacencyMatrix,
    #     n_components=n_components, norm_laplacian=True, random_state=42,
    #     drop_first=True)
    # Run k-means if set to True
    if k_means_:
        _, kmeans_labels, _ = k_means(X=embedding.embedding_, n_clusters=n_clusters,
                                      random_state=42, n_init=10)
        # Alternative embedding - more freedom, but slower
        # _, kmeans_labels2, _ = k_means(X=embedding_otherapp, n_clusters=n_clusters,
        #     random_state=42, n_init=10)
        return kmeans_labels, embedding.embedding_
    else:
        return embedding.embedding_
def my_uniteigenvector_zeroeigenvalue_cluster(k):
    G = nx.read_gpickle('data/undirected(fortest).gpickle')
    A = nx.adjacency_matrix(G, nodelist=G.nodes()[:-1], weight='weight')
    # A = A.toarray()
    # np.fill_diagonal(A, 0.01)  # add node with its own weight to itself
    # Tri = np.diag(np.sum(A, axis=1))
    # L = Tri - A
    # Tri_1 = np.diag(np.reciprocal(np.sqrt(Tri).diagonal()))
    # Ls = Tri_1.dot(L).dot(Tri_1)
    Ls, dd = graph_laplacian(A, normed=True, return_diag=True)
    eigenvalue_n, eigenvector_n = eigsh(Ls * (-1), k=k, sigma=1.0, which='LM', tol=0.0)
    # for ic, vl in enumerate(eigenvalue_n):
    #     if abs(vl - 0) <= 1e-10:
    #         eigenvector_n[:, ic] = np.full(len(G.nodes()[:-1]), 1.0 / math.sqrt(len(G.nodes()[:-1])))  # zero eigenvalue
    eigenvector_n[:, -1] = np.full(len(G.nodes()[:-1]), 1.0 / math.sqrt(len(G.nodes()[:-1])))  # zero eigenvalue
    for ir, n in enumerate(eigenvector_n):
        eigenvector_n[ir] = n / float(np.linalg.norm(n))  # normalize to unit vector
    _, labels, _ = k_means(eigenvector_n, k, random_state=None, n_init=100)
    return labels
def spectral_clustering_sg(self, affinity, max_clusters=8, eigen_solver=None,
                           random_state=None, n_init=10, eigen_tol=0.0,
                           assign_labels='kmeans'):
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)
    random_state = check_random_state(random_state)
    n_components = max_clusters
    maps, lambdas = self.spectral_embedding(affinity, n_components=n_components,
                                            eigen_solver=eigen_solver,
                                            random_state=random_state,
                                            eigen_tol=eigen_tol, drop_first=False)
    # Determine n_clusters by the spectral gap HERE!!
    n_clusters = self.estimate_num_of_clusters(lambdas)
    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters, random_state=0, n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)
    return labels
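
# A hedged sketch of the eigen-gap ("spectral gap") heuristic hinted at by
# estimate_num_of_clusters above: pick k at the largest gap between consecutive
# eigenvalues. This is an illustrative stand-in, not the project's own estimator,
# and assumes eigenvalues sorted in ascending order (one common variant).
import numpy as np

def estimate_k_by_eigengap(lambdas):
    lambdas = np.sort(np.asarray(lambdas))
    gaps = np.diff(lambdas)
    return int(np.argmax(gaps)) + 1

print(estimate_k_by_eigengap([0.0, 0.01, 0.02, 0.9, 0.95]))  # -> 3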
def k_means_label(pointcloud, n_clusters, init, precompute_distances, n_init=10,
                  max_iter=300, tol=1e-4, random_state=None, n_jobs=1,
                  algorithm="auto"):
    """
    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster label of each point. (k_means itself returns centroid,
        labels and inertia; only the labels are kept here.)
    """
    res = k_means(pointcloud, n_clusters, init=init,
                  precompute_distances=precompute_distances, n_init=n_init,
                  max_iter=max_iter, verbose=False, tol=tol,
                  random_state=random_state, copy_x=True, n_jobs=n_jobs,
                  algorithm=algorithm, return_n_iter=False)
    return res[1]
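
# For reference, a standalone call using the modern sklearn.cluster.k_means signature
# (the precompute_distances and n_jobs parameters used above were removed in
# scikit-learn 1.0+). The toy point cloud below is made up for illustration.
import numpy as np
from sklearn.cluster import k_means

points = np.random.RandomState(0).rand(100, 3)
centroids, labels, inertia = k_means(points, 4, init='k-means++', n_init=10,
                                     max_iter=300, tol=1e-4, random_state=0)
print(labels.shape)  # (100,)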
def _create_root_node_centers(self):
    node_data = None
    for batch_data in self.split_dataset_loader_gen():
        if node_data is None:
            node_data = batch_data.detach().cpu().numpy()
        else:
            node_data = np.concatenate([node_data, batch_data.detach().cpu().numpy()], 0)
    return k_means(node_data, 2, n_init=20)[0]
def run_experiment(ae_model_path):
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None, optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data, batch_size=256, shuffle=False):
        embedded_batch_np = ae_module.forward(batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np], 0)
    del ae_module

    # Perform k-means
    k_means_labels = k_means(embedded_data, n_clusters, n_init=20)[1]
    k_means_nmi_value = nmi(gold_labels, k_means_labels, average_method='arithmetic')
    k_means_acc_value = cluster_acc(gold_labels, k_means_labels)[0]

    result_file = Path(f"{result_dir}/results_ae_kmeans_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    f = open(result_file, "a+")
    if not result_file_exists:
        f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\n")
    f.write(f"{ae_model_path.name}\t{k_means_nmi_value}\t{k_means_acc_value}\n")
    f.close()
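
# A small, self-contained illustration of the evaluation step above: run k-means on
# some features and score the labels against ground truth with NMI (arithmetic mean).
# load_digits is only a stand-in dataset, not the data used by the project above.
import numpy as np
from sklearn.cluster import k_means
from sklearn.datasets import load_digits
from sklearn.metrics import normalized_mutual_info_score as nmi

X, y = load_digits(return_X_y=True)
pred = k_means(X, 10, n_init=20)[1]
print("NMI:", nmi(y, pred, average_method='arithmetic'))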
def Final_Result(self):
    self.Scablekmeans_ProcessingCenter()
    try:
        return kmean.k_means(self.matrix, self.k,
                             init=numpy.array(self.process_center), n_init=1)
    except:
        print("The number of finally computed centers is smaller than K...",
              "\n K value: ", self.k, " / number of centers: ", len(self.process_center))
def fit(self, X, y=None):
    """Creates an affinity matrix for X using the selected affinity,
    then applies spectral clustering to this affinity matrix.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        OR, if affinity=='precomputed', a precomputed affinity
        matrix of shape (n_samples, n_samples)
    """
    # this class is not tested with sparse matrix.
    # any contribution (report, coding) is welcome!
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)

    ell = self.n_clusters + 1  # +1 for drop_first, x2 for zero suppression in frequent_direction.
    k = self.n_buffer_rows
    if self.affinity == 'rbf':
        self.affinity_matrix_, dd = laplacian_sketch_rbf_kernel(
            X, ell, k, normed=self.normed, gamma=self.gamma)
    elif self.affinity == 'cosine':
        self.affinity_matrix_, dd = laplacian_sketch_cosine_similarity(
            X, ell, k, normed=self.normed)
    else:
        params = self.kernel_params
        if params is None:
            params = {}
        if callable(self.affinity):
            self.affinity_matrix_, dd = laplacian_sketch(
                X, ell, k, False, self.normed, self.affinity, params)
        else:
            warnings.warn("%s is unknown kernel" % self.affinity)

    random_state = check_random_state(self.random_state)
    # spectral embedding post process.
    maps = spectral_embedding_imitation(self.affinity_matrix_, dd,
                                        n_components=self.n_clusters,
                                        random_state=random_state,
                                        drop_first=False)
    if self.assign_labels == 'kmeans':
        _, self.labels_, _ = k_means(maps, self.n_clusters,
                                     random_state=random_state,
                                     n_init=self.n_init)
    else:
        self.labels_ = discretize(maps, random_state=random_state)
def cluster(self, affinities):
    laplacian, diagonal = graphutil.graph_laplacian(affinities, normed=True,
                                                    return_diag=True)
    self.embedding = self.embed(laplacian, diagonal, self.k, self.tol)
    centroid_vals, self.labels, _ = k_means(self.embedding, self.k,
                                            random_state=self.rand,
                                            n_init=self.n_init,
                                            init=self.init_centroids)
    self.centroids = []
    for c in centroid_vals:
        self.centroids.append(
            np.argmin([np.sum((c - e) ** 2) for e in self.embedding]))
    return self.labels
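
# Small illustration of the centroid-to-"medoid" step above: after k-means, each
# centroid is mapped to the index of the nearest embedded point. pairwise_distances
# vectorizes what the loop above does explicitly; all names here are illustrative.
import numpy as np
from sklearn.cluster import k_means
from sklearn.metrics import pairwise_distances

emb = np.random.RandomState(3).rand(50, 2)
centers, labels, _ = k_means(emb, 4, n_init=10, random_state=0)
medoid_idx = pairwise_distances(centers, emb).argmin(axis=1)
print(medoid_idx)  # index of the embedded point closest to each centroid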
def run_experiment(ae_model_path):
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logger.info(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train, batch_size=256, shuffle=True)
    n_features = pt_data.shape[1]

    # Same loss as in the DEC implementation
    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y) ** 2)
    ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None, optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    node_data = None
    for batch_data in torch.utils.data.DataLoader(pt_init_sample, batch_size=256, shuffle=True):
        embedded_batch_np = ae_module.forward(batch_data.cuda())[0].detach().cpu().numpy()
        if node_data is None:
            node_data = embedded_batch_np
        else:
            node_data = np.concatenate([node_data, embedded_batch_np], 0)
    init_centers = k_means(node_data, n_clusters, n_init=20)[0]

    # Initialize cluster centers based on a smaller sample
    cluster_module = DEC(init_centers).cuda()
    optimizer = torch.optim.Adam(list(ae_module.parameters()) + list(cluster_module.parameters()),
                                 lr=0.001)

    def evaluate(train_round_idx, ae_module, cluster_module):
        test_loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(pt_data), batch_size=256)
        pred_labels = np.zeros(pt_data.shape[0], dtype=np.int)
        index = 0
        n_batches = 0
        for batch_data in test_loader:
            batch_data = batch_data[0].cuda()
            n_batches += 1
            batch_size = batch_data.shape[0]
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            labels = cluster_module.prediction_hard_np(embedded_data)
            pred_labels[index:index + batch_size] = labels
            index = index + batch_size
        pred_tree = dendrogram_purity_tree_from_clusters(cluster_module, pred_labels, 'single')
        pred_tree2 = dendrogram_purity_tree_from_clusters(cluster_module, pred_labels, 'complete')
        lp = leaf_purity(pred_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"
        dp_value_single = dendrogram_purity(pred_tree, gold_labels)
        dp_value_complete = dendrogram_purity(pred_tree2, gold_labels)
        logger.info(
            f"{train_round_idx} Evaluation: leaf_purity: {leaf_purity_value}, "
            f"purity_single: {dp_value_single}, purity_complete: {dp_value_complete}")
        return dp_value_single, dp_value_complete, leaf_purity_value

    evaluate("init", ae_module, cluster_module)

    n_rounds = 40000
    train_round_idx = 0
    while True:  # each iteration is equal to an epoch
        for batch_data in train_loader:
            train_round_idx += 1
            if train_round_idx > n_rounds:
                break
            batch_data = batch_data[0].cuda()

            embedded_data, reconstruced_data = ae_module.forward(batch_data)
            ae_loss = ae_reconstruction_loss_fn(batch_data, reconstruced_data)
            cluster_loss = cluster_module.loss_dec_compression(embedded_data)
            loss = cluster_loss + 0.1 * ae_loss

            if train_round_idx == 1 or train_round_idx % 100 == 0:
                logger.info(
                    f"{train_round_idx} - loss in this batch: cluster_loss:{cluster_loss.item()} "
                    f"ae_loss:{ae_loss.item()} total_loss: {ae_loss.item() + cluster_loss.item()}")

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if train_round_idx % 2000 == 0:
                evaluate(train_round_idx, ae_module, cluster_module)
        else:  # for-else: runs only if no break occurred, so we continue the while-True loop; otherwise we break it too
            continue
        break  # Break while loop here

    # Write last evaluation
    dp_value_single, dp_value_complete, leaf_purity_value = evaluate("", ae_module, cluster_module)
    result_file = Path(result_dir, f"results_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    f = open(result_file, "a+")
    if not result_file_exists:
        f.write("#\"ae_model_name\"\t\"Dendrogram_Purity Single\"\t\"Dendrogram_Purity Complete\"\t\"Leaf_Purity\t(Std)\"\n")
    f.write(f"{ae_model_path.name}\t{dp_value_single}\t{dp_value_complete}\t{leaf_purity_value}\n")
    f.close()
    dist = numpy.sqrt(numpy.sum(numpy.square(vec1 - vec2)))
    return dist


K = 65536
print(K)
with open('ox5kdelf-full', 'rb') as file:
    b = pickle.load(file)
with open('ox5kdelfquery-full', 'rb') as file:
    a = pickle.load(file)
print(a[0]['filename'], b[30]['filename'], b[69]['filename'], b[75]['filename'])

c = []
print(type(b[0]['descriptor_np_list'][0]))
for i in range(len(b)):
    for j in range(len(b[i]['descriptor_np_list'])):
        c.append(b[i]['descriptor_np_list'][j])
c = np.array(c)
codewords, _, _, _ = k_means(c, K, max_iter=20, return_n_iter=True)

code = []
query = []
'''
i=0
gd=np.zeros((K,40), dtype=np.float32)
for j in range(len(a[i]['descriptor_np_list'])):
    x=a[i]['descriptor_np_list'][j].reshape(1,40)
    tmp,_=vq(x,codewords)
    gd[tmp]+=x-codewords[tmp]
gd0=gd.reshape(1,-1)
print(gd0)
i=30
gd=np.zeros((K,40), dtype=np.float32)
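
# A smaller, self-contained sketch of the quantization step in the commented block
# above: build a codebook with k_means, then assign descriptors to codewords with
# scipy's vq. Dimensions and cluster counts are illustrative, not the 40-d DELF
# descriptors or K=65536 used above.
import numpy as np
from scipy.cluster.vq import vq
from sklearn.cluster import k_means

descs = np.random.RandomState(1).rand(500, 8).astype(np.float32)
codebook, _, _ = k_means(descs, 16, n_init=3, max_iter=20)
codes, dists = vq(descs, codebook.astype(descs.dtype))
print(codes.shape, codes.min(), codes.max())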
import torch.utils.data
from sklearn.cluster.k_means_ import k_means

from ect.methods.DEC import DEC
from scripts.Config import *
from scripts.projection_problem.common_stuff import *

ae_module, pt_data, gold_labels, _, train_loader, ae_reconstruction_loss_fn = init_data_and_ae()

embedded_data_np = ae_module.encode(pt_data).detach().cpu().numpy()

dec_module = DEC(k_means(embedded_data_np, 2)[0]).cuda()

optimizer = torch.optim.Adam(list(ae_module.parameters()) + list(dec_module.parameters()),
                             lr=0.001)

gamma = 0.1  # Put 0.0 here for pure DEC

n_rounds = 2000
train_round_idx = 0
while True:  # each iteration is equal to an epoch
    for batch_data in train_loader:
        train_round_idx += 1
        if train_round_idx > n_rounds:
            break
        batch_data = batch_data[0]

        embedded_data, reconstruced_data = ae_module.forward(batch_data)
        ae_loss = ae_reconstruction_loss_fn(batch_data, reconstruced_data)
    # print descrs_for_vocab.shape
    # result = []
    # for x in xrange(0,30):
    #     print x
    #     num_of_cluster = 1+20*x
    #     _, _, inertia_ = k_means(descrs_for_vocab, num_of_cluster)
    #     result.append([num_of_cluster, inertia_])
    # import matplotlib.pyplot as plt
    # plt.plot(*zip(*result))
    # plt.show()
    print "clustering sift features to form vocabulary"
    print datetime.now()
    vocab, _, _ = k_means(descrs_for_vocab, NUM_OF_WORD_FOR_VOCAB, verbose=True)
    savemat(join(result_dir, "vocab.mat"), {"vocab": vocab})
else:
    vocab = loadmat(join(result_dir, "vocab.mat"))['vocab']

print vocab.shape

# extract sift features, downsample if needed, convert to BOW
# 1000 sift features * 128dim * 2byte -> 4 images per MB -> 4000 images per GB
if not isfile(join(result_dir, "train_bow.mat")):
    print "computing bag of word representation for train images. This may take a while, but the result will be saved for future usage"
    train_image_path = []
    train_image_classes = []
    query_image_path = []
def fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)
    """
    if self.n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % self.n_init)
    random_state = check_random_state(self.random_state)

    if self.max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % self.max_iter)

    if self.precompute_distances == 'auto':
        precompute_distances = False
    elif isinstance(self.precompute_distances, bool):
        precompute_distances = self.precompute_distances
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" % self.precompute_distances)

    # avoid forcing order when copy_x=False
    order = "C" if self.copy_x else None
    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
                    order=order, copy=self.copy_x)

    daal_ready = not sp.issparse(X) and not precompute_distances
    daal_ready = daal_ready and hasattr(X, '__array__')
    if daal_ready:
        X_len = _num_samples(X)
        daal_ready = (self.n_clusters <= X_len)
        if daal_ready and sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            daal_ready = (sample_weight.shape[0] == X_len) and \
                         (np.allclose(sample_weight, np.ones_like(sample_weight)))
    if not daal_ready:
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters, sample_weight=sample_weight,
                init=self.init, n_init=self.n_init, max_iter=self.max_iter,
                verbose=self.verbose, precompute_distances=precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, algorithm=self.algorithm, return_n_iter=True)
    else:
        X = check_array(X, dtype=[np.float64, np.float32])
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_dense(
                X, self.n_clusters, self.max_iter, self.tol, self.init,
                self.n_init, random_state)
    return self
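
# For comparison, the stock scikit-learn estimator path that this patched fit mirrors
# when the daal4py fast path is not taken; attribute names are the standard sklearn
# ones, and the data below is made up for illustration.
import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(42).rand(200, 5)
km = KMeans(n_clusters=3, n_init=10, max_iter=300, random_state=42).fit(X)
print(km.cluster_centers_.shape, km.inertia_, km.n_iter_)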
    # print descrs_for_vocab.shape
    # result = []
    # for x in xrange(0,30):
    #     print x
    #     num_of_cluster = 1+20*x
    #     _, _, inertia_ = k_means(descrs_for_vocab, num_of_cluster)
    #     result.append([num_of_cluster, inertia_])
    # import matplotlib.pyplot as plt
    # plt.plot(*zip(*result))
    # plt.show()
    print "clustering sift features to form vocabulary"
    print datetime.now()
    vocab, _, _ = k_means(descrs_for_vocab, NUM_OF_WORD_FOR_VOCAB)
    savemat(join(result_dir, "vocab.mat"), {"vocab": vocab})
else:
    vocab = loadmat(join(result_dir, "vocab.mat"))['vocab']

print vocab.shape

# extract sift features, downsample if needed, convert to BOW
# 1000 sift features * 128dim * 2byte -> 4 images per MB -> 4000 images per GB
if not isfile(join(result_dir, "train_bow.mat")):
    print "computing bag of word representation for train images. This may take a while, but the result will be saved for future usage"
    train_image_path = []
    train_image_classes = []
    class_mapping = []
def cloudstering(dendrogram, catalog, criteria, user_k, user_ams, user_scalpars,
                 user_iter, save_isol_leaves, save_clust_leaves, save_branches,
                 blind, rms, s2nlim, locscal):
    """
    SCIMES main function. It collects parents/children of all structures
    within the dendrogram, and their properties. It calls the affinity
    matrix-related functions (for creation, rescaling, cluster counting),
    and it runs the actual spectral clustering routine several times,
    calculating each time the silhouette of the current configuration.
    Input parameters are passed by the SpectralCloudstering class.

    Parameters
    -----------
    dendrogram: 'astrodendro.dendrogram.Dendrogram' instance
        The dendrogram to clusterize.

    catalog: 'astropy.table.table.Table' instance
        A catalog containing all properties of the dendrogram structures.
        Generally generated with the ppv_catalog module.

    header: 'astropy.io.fits.header.Header' instance
        The header of the fits data the dendrogram was generated from.
        Necessary to obtain the assignment cubes.

    criteria: list of strings
        Clustering criteria referred to the structure properties in the
        catalog (default ['volume', 'luminosity']).

    user_k: int
        The expected number of clusters; if not provided it will be guessed
        automatically through the eigenvalues of the unsmoothed affinity matrix.

    user_ams: numpy array
        User-provided affinity matrix. If this is not furnished it is
        automatically generated through the volume and/or luminosity criteria.

    user_scalpars: list of floats
        User-provided scaling parameters to smooth the affinity matrices.

    user_iter: int
        User-provided number of k-means iterations.

    save_isol_leaves: bool
        Consider the isolated leaves (without parent) as individual 'clusters'.
        Useful for low resolution data where the beam size corresponds to the
        size of a Giant Molecular Cloud.

    save_clust_leaves: bool
        Consider unclustered leaves as individual 'clusters'. This keyword
        will not include the isolated leaves without parents.

    save_all_leaves: bool
        Trigger both save_isol_leaves and save_clust_leaves.

    save_branches: bool
        Retain all isolated branches usually discarded by the cluster analysis.

    save_all: bool
        Trigger all save_isol_leaves, save_clust_leaves, and save_branches.

    rms: int or float
        Noise level of the observation. Necessary to calculate the scaling
        parameter above a certain signal-to-noise ratio.

    s2nlim: int or float
        Signal-to-noise limit above which the scaling parameter is calculated.
        Needed only if rms is not np.nan.

    blind: bool
        Show the affinity matrices. Matplotlib required.

    locscaling: bool
        Smooth the affinity matrices using a local scaling technique.

    Return
    -------
    clusts: list
        The dendrogram branch indexes corresponding to the identified clusters.

    catalog: 'astropy.table.table.Table' instance
        The input catalog updated with dendrogram structure parent, ancestor,
        number of leaves, and type ('T', trunks or branches without parent;
        'B', branches with parent; 'L', leaves).

    AMs: numpy array
        The affinity matrices calculated by the algorithm.

    escalpars: list
        Estimated scaling parameters for the different affinity matrices.

    silhouette: float
        Silhouette of the best cluster configuration.
    """

    # Collecting all connectivity and other information into more handy lists
    all_structures_idx = np.arange(len(catalog[criteria[0]].data), dtype='int')

    all_levels = []
    brc_levels = []
    all_leav_names = []
    all_leav_idx = []
    all_brc_names = []
    all_brc_idx = []
    all_parents = []
    all_children = []
    all_struct_names = []
    all_ancestors = []
    all_struct_ancestors = []
    all_struct_parents = []
    all_struct_types = []
    nleaves = []
    trunk_brs_idx = []
    two_clust_idx = []
    mul_leav_idx = []
    s2ns = []

    for structure_idx in all_structures_idx:
        s = dendrogram[structure_idx]
        all_levels.append(s.level)
        s2ns.append(dendrogram[structure_idx].height / rms)
        all_struct_names.append(str(s.idx))
        all_struct_ancestors.append(s.ancestor.idx)
        if s.parent:
            all_struct_parents.append(s.parent.idx)
        else:
            all_struct_parents.append(-1)
        nleaves.append(len(s.sorted_leaves()))

        ancestors = []
        anc = s.parent
        while anc != None:
            ancestors.append(anc.idx)
            anc = anc.parent
        ancestors.append(s.idx)
        all_ancestors.append(ancestors)

        # If structure is a leaf find all the parents
        if s.is_leaf and s.parent != None:
            par = s.parent
            all_leav_names.append(str(s.idx))
            parents = []
            while par != None:
                parents.append(par.idx)
                par = par.parent
            parents.append(len(catalog[criteria[0]].data))  # This is the trunk!
            all_parents.append(parents)

        # If structure is a branch find all its leaves
        if s.is_branch:
            brc_levels.append(s.level)
            all_brc_idx.append(s.idx)
            all_brc_names.append(str(s.idx))
            children = []
            for leaf in s.sorted_leaves():
                children.append(leaf.idx)
            all_children.append(children)

            # Trunk branches
            if s.parent == None:
                trunk_brs_idx.append(s.idx)
                all_leav_idx = all_leav_idx + children
                if s.children[0].is_branch or s.children[1].is_branch:
                    mul_leav_idx = mul_leav_idx + children
                else:
                    two_clust_idx.append(s.idx)
                all_struct_types.append('T')
            else:
                all_struct_types.append('B')
        else:
            all_struct_types.append('L')

    two_clust_idx = np.unique(two_clust_idx).tolist()

    dict_parents = dict(zip(all_leav_names, all_parents))
    dict_children = dict(zip(all_brc_names, all_children))
    dict_ancestors = dict(zip(all_struct_names, all_ancestors))

    all_levels.append(-1)
    all_levels = np.asarray(all_levels)

    # Retrieving needed properties from the catalog
    # and adding fake "trunk" properties
    props = []
    for crit in criteria:
        prop = catalog[crit].data.tolist()
        tprop = sum(catalog[crit].data[trunk_brs_idx])
        prop.append(tprop)
        props.append(prop)

    s2ns.append(1)
    props.append(s2ns)

    # Generating affinity matrices if not provided
    if user_ams is None:
        AMs = aff_matrix(len(all_leav_idx), len(catalog[criteria[0]].data),
                         all_leav_idx, all_brc_idx, brc_levels, dict_children, props)
        if blind == False:
            # Showing all affinity matrices
            for i, crit in enumerate(criteria):
                plt.matshow(AMs[i, :, :])
                plt.title('"' + crit + '" affinity matrix', fontsize='medium')
                plt.xlabel('leaf index')
                plt.ylabel('leaf index')
                plt.colorbar()
    else:
        AMs = user_ams

    S2Nmat = AMs[-1, :, :]
    AMs = AMs[:-1, :, :]

    # Check if the affinity matrix has more than 2 elements
    # otherwise return everything as clusters ("save_all").
    if AMs.shape[1] <= 2:
        print("--- Not necessary to cluster. 'save_all' keyword triggered")
        all_leaves = []
        for leaf in dendrogram.leaves:
            all_leaves.append(leaf.idx)
        clusts = all_leaves
        return clusts, AMs

    # Check whether the affinity matrix scaling parameters
    # are provided by the user, if so use them, otherwise
    # calculate them
    """
    scpars = np.zeros(len(criteria))
    if user_scalpars is not None:
        print("- Using user-provided scaling parameters")
        user_scalpars = np.asarray(user_scalpars)
        scpars[0:len(user_scalpars)] = user_scalpars
    """
    scpars = np.array(user_scalpars)

    print("- Start spectral clustering")

    # Selecting the criteria and merging the matrices
    escalpars = []
    AM = np.ones(AMs[0, :, :].shape)
    for i, crit in enumerate(criteria):
        print("-- Rescaling %s matrix" % crit)
        AMc, sigma = mat_smooth(AMs[i, :, :], S2Nmat, s2nlim=s2nlim,
                                scalpar=scpars[i], lscal=locscal)
        AM = AM * AMc
        escalpars.append(sigma)

    # Making the reduced affinity matrices
    mul_leav_mat = []
    for mli in mul_leav_idx:
        mul_leav_mat.append(all_leav_idx.index(mli))
    mul_leav_mat = np.asarray(mul_leav_mat)
    rAM = AM[mul_leav_mat, :]
    rAM = rAM[:, mul_leav_mat]

    if blind == False:
        # Showing the final affinity matrix
        plt.matshow(AM)
        plt.colorbar()
        plt.title('Final Affinity Matrix')
        plt.xlabel('leaf index')
        plt.ylabel('leaf index')

    # Guessing the number of clusters if not provided
    if user_k == 0:
        kg = guessk(rAM)
    else:
        kg = user_k - len(two_clust_idx)

    print("-- Guessed number of clusters = %i" % (kg + len(two_clust_idx)))

    if kg > 1:
        print("-- Number of k-means iteration: %i" % user_iter)

        # Find the best cluster number
        sils = []
        min_ks = max(2, kg - 15)
        max_ks = min(kg + 15, rAM.shape[0] - 1)
        clust_configs = []

        for ks in range(min_ks, max_ks):
            try:
                evecs = spectral_embedding(rAM, n_components=ks,
                                           eigen_solver='arpack',
                                           random_state=222,
                                           eigen_tol=0.0, drop_first=False)
                _, all_clusters, _ = k_means(evecs, ks, random_state=222,
                                             n_init=user_iter)
                sil = silhouette_score(evecs, np.asarray(all_clusters),
                                       metric='euclidean')
                clust_configs.append(all_clusters)
            except np.linalg.LinAlgError:
                sil = 0
            sils.append(sil)

        # Use the best cluster number to generate clusters
        best_ks = sils.index(max(sils)) + min_ks
        print("-- Best cluster number found through SILHOUETTE (%f)= %i"
              % (max(sils), best_ks + len(two_clust_idx)))
        silhoutte = max(sils)
        all_clusters = clust_configs[np.argmax(sils)]
    else:
        print("-- Not necessary to cluster")
        all_clusters = np.zeros(len(all_leaves_idx), dtype=np.int32)

    clust_branches = clust_cleaning(mul_leav_idx, all_clusters, dict_parents,
                                    dict_children, dict_ancestors,
                                    savebranches=save_branches)
    clusts = clust_branches + two_clust_idx

    print("-- Final cluster number (after cleaning) %i" % len(clusts))

    # Calculate the silhouette after cluster cleaning
    # fclusts_idx = np.ones(len(mul_leav_idx))
    fclusts_idx = -1 * all_clusters

    i = 1
    for clust in clusts:
        i += 1
        fleavs = dendrogram[clust].sorted_leaves()
        fleavs_idx = []
        for fleav in fleavs:
            fleavs_idx.append(fleav.idx)
        fleavs_idx = np.asarray(fleavs_idx)

        # Find the position of the cluster leaves
        pos = np.where(np.in1d(mul_leav_idx, fleavs_idx))[0]
        fclusts_idx[pos] = i

    oldclusts = np.unique(fclusts_idx[fclusts_idx < 0])
    for oldclust in oldclusts:
        fclusts_idx[fclusts_idx == oldclust] = np.max(fclusts_idx) + 1

    evecs = spectral_embedding(rAM, n_components=ks,
                               eigen_solver='arpack',
                               random_state=222,
                               eigen_tol=0.0, drop_first=False)
    sil = silhouette_score(evecs, fclusts_idx, metric='euclidean')
    print("-- Final clustering configuration silhouette %f" % sil)

    all_struct_types = np.asarray(all_struct_types)
    all_struct_parents = np.asarray(all_struct_parents)

    # Add the isolated leaves to the cluster list, if required
    if save_isol_leaves:
        isol_leaves = all_structures_idx[(all_struct_parents == -1) & (all_struct_types == 'L')]
        clusts = clusts + list(isol_leaves)
        print("SAVE_ISOL_LEAVES triggered. Isolated leaves added.")
        print("-- Total cluster number %i" % len(clusts))

    # Add the unclustered leaves within clusters to the cluster list, if required
    if save_clust_leaves:
        isol_leaves = all_structures_idx[(all_struct_parents == -1) & (all_struct_types == 'L')]
        all_leaves = []
        for leaf in dendrogram.leaves:
            all_leaves.append(leaf.idx)
        clust_leaves = []
        for clust in clusts:
            for leaf in dendrogram[clust].sorted_leaves():
                clust_leaves.append(leaf.idx)
        unclust_leaves = list(set(all_leaves) - set(clust_leaves + list(isol_leaves)))
        clusts = clusts + unclust_leaves
        print("SAVE_CLUST_LEAVES triggered. Unclustered leaves added.")
        print("-- Total cluster number %i" % len(clusts))

    # Update the catalog with new information
    catalog['parent'] = all_struct_parents
    catalog['ancestor'] = all_struct_ancestors
    catalog['n_leaves'] = nleaves
    catalog['structure_type'] = all_struct_types

    return clusts, catalog, AMs, escalpars, silhoutte
def train(self):
    print(f"{datetime.now()} Pre-training evaluation:")
    loss, nmi, acc = self._evaluation()
    print(f"loss: {loss}, acc: {acc}, nmi: {nmi}")
    for e in range(self.current_epoch, self.config.epochs):
        print(f"\n{datetime.now()} epoch {e}/{self.config.epochs}")
        end = time.time()
        if self.config.refine_epoch == e:
            print(f"{datetime.now()} starting refinement stage, targets will be reassigned using k-means")
            with open(os.path.join(self.out_dir, "no_refine_run_stats.pickle"), "wb") as handle:
                pickle.dump(self.run_stats, handle)
        if self.config.refine_epoch <= e:
            # we are in refinement stage, reassign targets with k-means
            preds = []
            self.model.eval()
            for batch in self.eval_dataloader:
                images, _ = batch
                preds.append(self.model(images.cuda()).data.cpu().numpy())
            preds = np.concatenate(preds)
            _, labels, _ = k_means.k_means(preds, self.config.k)
            # find permutation of labels that is closest to previous
            num_correct = np.zeros((self.config.k, self.config.k))
            prev_labels = np.argmax(self.targets, axis=1)
            for c_1 in range(self.config.k):
                for c_2 in range(self.config.k):
                    num_correct[c_1, c_2] = int(((labels == c_1) * (prev_labels == c_2)).sum())
            _, assignments, _ = lap.lapjv(self.n_data - num_correct)
            reordered = np.zeros(self.n_data, dtype=np.int)
            for c in range(self.config.k):
                reordered[labels == c] = assignments[c]
            self.targets = np.eye(self.config.k)[reordered]
        if self.config.rotnet:
            # train an epoch on rotation auxiliary task
            for batch in self.rot_dataloader:
                images, labels = batch
                unpack_images = []
                for i in range(len(images[0])):
                    for r in range(4):
                        unpack_images.append(images[r][i])
                unpack_images = np.stack(unpack_images, axis=0)
                labels = np.reshape(labels, newshape=-1)
                self.model.train()
                images = torch.tensor(unpack_images, dtype=torch.float, device="cuda")
                labels = labels.cuda()
                out = self.model(images, rot_head=True)
                loss = self.rot_crit(out, labels)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        # train an epoch on main clustering task
        for batch in self.train_dataloader:
            images1, images2, indices = batch
            if self.config.refine_epoch > e:
                # optimize and update targets
                self.model.eval()
                pred = self.model(images1.cuda()).data.cpu().numpy()
                batch_targets = self.targets[indices]
                cost = euclidean_distances(pred, batch_targets)
                _, assignments, _ = lap.lapjv(cost)
                for i, idx in enumerate(indices):
                    self.targets[idx] = batch_targets[assignments[i]]
            images = images2.cuda()
            batch_targets = torch.tensor(self.targets[indices], dtype=torch.float, device="cuda")
            self.model.train()
            pred = self.model(images)
            loss = self.clustering_crit(pred, batch_targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        self.lr_scheduler.step()
        loss, nmi, acc = self._evaluation()
        self.run_stats["loss"].append(loss)
        self.run_stats["acc"].append(acc)
        self.run_stats["nmi"].append(nmi)
        print(f"{datetime.now()} train epoch took: {int(time.time() - end)}s")
        print(f"{datetime.now()} loss: {loss}, acc: {acc}, nmi: {nmi}")
        self.current_epoch = e
        if e % self.config.plot_rate == 0:
            fig, ax = plt.subplots(len(self.run_stats), figsize=(10, 30))
            for i, run_stat_name in enumerate(self.run_stats.keys()):
                ax[i].plot(range(e + 1), self.run_stats[run_stat_name])
                title = run_stat_name + ' (' + str(format(self.run_stats[run_stat_name][-1], '.4f')) + ')'
                ax[i].set_title(title)
            plt.savefig(os.path.join(self.out_dir, "plots"))
            plt.close()
        self.save_checkpoint(self.out_dir)
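
# A hedged sketch of the label-permutation matching performed above with lap.lapjv,
# here using SciPy's Hungarian solver (linear_sum_assignment) instead: find the
# mapping from new k-means labels to previous labels that maximizes agreement.
# The helper name match_labels is illustrative, not part of the project above.
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_labels(new_labels, prev_labels, k):
    agreement = np.zeros((k, k))
    for c1 in range(k):
        for c2 in range(k):
            agreement[c1, c2] = np.sum((new_labels == c1) & (prev_labels == c2))
    row, col = linear_sum_assignment(-agreement)  # negate to maximize agreement
    mapping = dict(zip(row, col))
    return np.array([mapping[c] for c in new_labels])

new = np.array([0, 0, 1, 1, 2, 2])
prev = np.array([2, 2, 0, 0, 1, 1])
print(match_labels(new, prev, 3))  # -> [2 2 0 0 1 1]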
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        k=None, eigen_tol=0.0, assign_labels='kmeans',
                        mode=None):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    -----------
    affinity: array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
            - adjacency matrix of a graph,
            - heat kernel of the pairwise distance matrix of the samples,
            - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters: integer, optional
        Number of clusters to extract.

    n_components: integer, optional, default is n_clusters
        Number of eigen vectors to use for the spectral embedding.

    eigen_solver: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by
        the K-Means initialization.

    n_init: int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding space. There
        are two ways to assign labels after the laplacian embedding. k-means
        can be applied and is a popular choice. But it can also be sensitive
        to initialization. Discretization is another approach which is less
        sensitive to random initialization. See the 'Multiclass spectral
        clustering' paper referenced below for more details on the
        discretization approach.

    Returns
    -------
    labels: array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------
    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component, elsewhere
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)
    if k is not None:
        warnings.warn("'k' was renamed to n_clusters and will "
                      "be removed in 0.15.", DeprecationWarning)
        n_clusters = k
    if mode is not None:
        warnings.warn("'mode' was renamed to eigen_solver "
                      "and will be removed in 0.15.", DeprecationWarning)
        eigen_solver = mode
    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters, random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)

    return labels, maps
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        k=None, eigen_tol=0.0, assign_labels='kmeans',
                        mode=None):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    -----------
    affinity: array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
            - adjacency matrix of a graph,
            - heat kernel of the pairwise distance matrix of the samples,
            - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters: integer, optional
        Number of clusters to extract.

    n_components: integer, optional, default is n_clusters
        Number of eigen vectors to use for the spectral embedding.

    eigen_solver: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of the
        lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by
        the K-Means initialization.

    n_init: int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of n_init
        consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding space. There
        are two ways to assign labels after the laplacian embedding. k-means
        can be applied and is a popular choice. But it can also be sensitive
        to initialization. Discretization is another approach which is less
        sensitive to random initialization. See the 'Multiclass spectral
        clustering' paper referenced below for more details on the
        discretization approach.

    Returns
    -------
    labels: array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------
    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component, elsewhere
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)
    if k is not None:
        warnings.warn("'k' was renamed to n_clusters and will "
                      "be removed in 0.15.", DeprecationWarning)
        n_clusters = k
    if mode is not None:
        warnings.warn("'mode' was renamed to eigen_solver "
                      "and will be removed in 0.15.", DeprecationWarning)
        eigen_solver = mode
    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters, random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)

    return labels