def get_constant_height_labels(clustering, n_clusters=None):
    """Use silhouette analysis to select the best height to cut a linkage matrix.

    :clustering: dict with 'linkage', 'distance_df', 'reorder_vec' and 'labels' keys
    :n_clusters: int (optional). If defined, cut the dendrogram at this number
        of clusters instead of searching for the best cut.
    """
    N_variables = len(clustering['reorder_vec'])
    scores = []
    if n_clusters is None:
        for k_clusters in range(2, N_variables // 3):
            labels = cut_tree(clustering['linkage'], n_clusters=k_clusters)
            try:
                score = silhouette_score(clustering['distance_df'],
                                         labels.ravel(), metric='precomputed')
            except ValueError:
                continue
            scores.append((k_clusters, score))
        best_k = max(scores, key=lambda x: x[1])[0]
        labels = cut_tree(clustering['linkage'], n_clusters=best_k)
    else:
        labels = cut_tree(clustering['linkage'], n_clusters=n_clusters)
        # silhouette_score expects 1d labels; cut_tree returns an (n, 1) array
        score = silhouette_score(clustering['distance_df'], labels.ravel(),
                                 metric='precomputed')
        scores.append((n_clusters, score))
    labels = reorder_labels(labels.flatten(), clustering['linkage'])
    # compare against the original clustering labels
    MI = adjusted_mutual_info_score(labels, clustering['labels'])
    return labels, scores, MI
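# A minimal usage sketch for get_constant_height_labels, assuming the
# `clustering` dict layout implied above ('linkage', 'distance_df',
# 'reorder_vec', 'labels'); the field contents are inferred, not documented,
# and reorder_labels is replaced by a passthrough stand-in for the project's
# helper.
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import cut_tree, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics import silhouette_score, adjusted_mutual_info_score

def reorder_labels(labels, link):  # stand-in for the project's helper
    return labels

m = np.random.rand(20, 20)
m = (m + m.T) / 2               # symmetric "correlation" matrix
np.fill_diagonal(m, 1.0)
dist_df = pd.DataFrame(1 - m)   # distance from correlation, zero diagonal
link = linkage(squareform(dist_df.values, checks=False), method='average')
clustering = {
    'linkage': link,
    'distance_df': dist_df,
    'reorder_vec': list(range(20)),
    'labels': cut_tree(link, n_clusters=3).ravel(),
}
labels, scores, MI = get_constant_height_labels(clustering)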
def compute_vi(self, gtmap, num_parts=0):
    """Return variation of information based on provided ground truth maps.

    Args:
        gtmap (dict): body/neuron id => label or type
        num_parts (int): optional number of partitions to use
            (default=0, find optimal)
    Returns:
        (float64, float64, dataframe, numparts): merge vi, split vi,
        body ids and score, and number of partitions
    """
    # find optimal match unless one is specified
    from scipy.cluster.hierarchy import cut_tree
    partitions = None
    if num_parts > 0:
        partitions = cut_tree(self.cluster, n_clusters=num_parts)
        return _vi_wrapper(list(partitions[:, 0]), self.labels, gtmap)

    # iterate through all partitionings and find the best match
    bestmatch = 99999999999999
    partitions = cut_tree(self.cluster)
    bestres = None
    for colid in range(0, len(partitions[0, :])):
        merge, split, bodyrank = _vi_wrapper(list(partitions[:, colid]),
                                             self.labels, gtmap)
        if (merge + split) < bestmatch:
            bestmatch = merge + split
            bestres = (merge, split, bodyrank, len(gtmap) - colid)
    return bestres
def applyHierarchiqueClusteringFromDataset(metadataDataset, distanceMatrixComplete,
                                           parameter=5,
                                           typeOfHierarchical=HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS,
                                           method=HIERARCHICAL_COMPLETE_LINKAGE):
    # linkage methods: single, average, complete, weighted, centroid, median, ward
    clusteringResults = [dict(obj) for obj in metadataDataset]
    innerMatrix, mapRowsID, mapColumnsID = getInnerMatrix(distanceMatrixComplete)
    # for i in range(len(innerMatrix)):
    #     innerMatrix[i][i] = float(0)
    for index, row in enumerate(innerMatrix):
        for column, val in enumerate(row):
            if not innerMatrix[column][index] == innerMatrix[index][column]:
                if math.isnan(innerMatrix[index][column]):
                    innerMatrix[index][column] = 1.
                    # print(innerMatrix[index][column], '-', innerMatrix[column][index])
                    # rounding error
                else:
                    innerMatrix[index][column] = innerMatrix[column][index]  # rounding error
    distArray = ssd.squareform(innerMatrix)
    linkageMatrix = linkage(distArray, method)
    if typeOfHierarchical == HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS:
        cutree = hierarchy.cut_tree(linkageMatrix, n_clusters=[parameter, parameter])
    elif typeOfHierarchical == HIERARCHICAL_SIMPLE:
        cutree = hierarchy.cut_tree(linkageMatrix, height=[parameter, parameter])
    cuttreeclusters = [(k, v[0]) for k, v in enumerate(cutree.tolist())]
    clusters = {}
    for value in iter(cuttreeclusters):
        clusteringResults[value[0]]['CLUSTER'] = str(value[1])
        if str(value[1]) not in clusters:  # was clusters.has_key(...), Python 2 only
            clusters[str(value[1])] = 0
        clusters[str(value[1])] += 1
    return clusteringResults, clusters, linkageMatrix
def main():
    digits_embedding = genfromtxt('digits-embedding.csv', delimiter=',')
    np.random.seed(0)
    data = []
    for i in range(10):
        class_i_digits = digits_embedding[digits_embedding[:, 1] == i]
        digits = np.random.randint(0, len(class_i_digits), size=10)
        for digit in digits:
            data.append(class_i_digits[digit])
    data = np.array(data)
    # plt.scatter(data[:, 2], data[:, 3], c=data[:, 1])
    # plt.show()

    # plot dendrograms
    methods = ['single', 'complete', 'average']
    features = data[:, 2:4]
    k_list = [2, 4, 8, 16, 32]
    # print(features[:20])
    for method in methods:
        Z = linkage(features, method=method)
        max_d = 500
        plot_dendogram(Z, max_d, 10, 'dendogram_' + method)
        wc_ssd_values = []
        sc_values = []
        for k in k_list:
            cluster_indices = cut_tree(Z, k)
            features_labels = np.column_stack((features, cluster_indices))
            centroids = {}
            for cluster_id in range(k):
                cluster_members = features[features_labels[:, 2] == cluster_id]
                centroid = np.average(cluster_members, axis=0)
                centroids[cluster_id] = centroid
            wc_ssd_values.append(
                get_wc_ssd(centroids, features, features_labels[:, 2]))
            sc_values.append(get_SC(features, features_labels[:, 2]))
        # print("Method", method, "WC-SSD", wc_ssd_values)
        # print("Method", method, "SC", sc_values)
        plot_graph(k_list, wc_ssd_values, 'k (number of clusters)', 'WC-SSD',
                   ['Sub Sample 100 images, method ' + method],
                   'hierarchical_learning_curve_wc_ssd_' + str(method))
        plot_graph(k_list, sc_values, 'k (number of clusters)', 'SC',
                   ['Sub Sample 100 images, method ' + method],
                   'hierarchical_learning_curve_sc_' + str(method))

    print("We chose K=8 for all 3 methods: single, complete, average")
    for method in methods:
        Z = linkage(features, method=method)
        k = 8
        cluster_indices = cut_tree(Z, k)
        features_labels = np.column_stack((features, cluster_indices))
        nmi = get_NMI(features, features_labels[:, 2], data[:, 1])
        print("For method", method, "NMI:", nmi)
def get_labels(self, what=None, n_clusters=2):
    if what == "row":
        labels = hierarchy.cut_tree(self.row_linkage, n_clusters)
    elif what == "col":
        labels = hierarchy.cut_tree(self.col_linkage, n_clusters)
    else:
        # was a bare Python 2 print followed by a return of an undefined
        # variable; raise instead
        raise ValueError('what must be "row" or "col"')
    return labels
def divide_computation(Sigma, max_block):
    """
    Approximates a correlation matrix Sigma as a block-diagonal matrix
    using hierarchical clustering. Roughly follows the R knockoff package.

    Parameters
    ----------
    Sigma : np.ndarray
        ``(p, p)``-shaped covariance matrix of X
    max_block : int
        Maximum size of a block in the block-diagonal approximation.

    Returns
    -------
    blocks : np.ndarray
        ``(p, )``-shaped numpy array where ``blocks[i] == j`` indicates
        that variable ``i`` belongs to block ``j``.
    """
    # Correlation tree. We add noise to deal with highly structured Sigma.
    p = Sigma.shape[0]
    noise = np.random.randn(p, p) * 1e-6
    noise += noise.T
    Sigma = Sigma + noise
    link = dgp.create_correlation_tree(Sigma)

    # Set up binary search
    max_clusters = p
    min_clusters = 1
    prev_max_clusters = p
    prev_min_clusters = 1

    # Binary search to create clusters
    for j in range(100):
        # Create new groups and check maximum size
        n_clusters = int((max_clusters + min_clusters) / 2)
        groups = hierarchy.cut_tree(link, n_clusters).reshape(-1) + 1
        current_max_block = utilities.calc_group_sizes(groups).max()

        # Cache search info and check maximum size
        prev_max_clusters = max_clusters
        prev_min_clusters = min_clusters
        if current_max_block > max_block:
            min_clusters = n_clusters
        else:
            max_clusters = n_clusters

        # Break if nothing has changed between iterations
        if min_clusters == prev_min_clusters and max_clusters == prev_max_clusters:
            if current_max_block > max_block:
                groups = hierarchy.cut_tree(link, n_clusters + 1).reshape(-1) + 1
            break

    return merge_groups(groups, max_block)
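# A self-contained sketch of the same binary-search idea using only numpy and
# scipy. `dgp.create_correlation_tree`, `utilities.calc_group_sizes`, and
# `merge_groups` are project helpers not shown here; below, the correlation
# tree is approximated with an average-linkage tree on 1 - |corr| and group
# sizes with np.bincount. An illustration, not the package's own code.
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform

def block_diag_groups(Sigma, max_block):
    d = np.sqrt(np.diag(Sigma))
    dist = 1 - np.abs(Sigma / np.outer(d, d))   # 1 - |correlation|
    np.fill_diagonal(dist, 0)
    link = hierarchy.linkage(squareform(dist, checks=False), method='average')
    lo, hi = 1, Sigma.shape[0]
    # find the smallest n_clusters whose largest block fits max_block
    while lo < hi:
        mid = (lo + hi) // 2
        groups = hierarchy.cut_tree(link, n_clusters=mid).reshape(-1) + 1
        if np.bincount(groups).max() > max_block:
            lo = mid + 1   # blocks too large: need more, smaller clusters
        else:
            hi = mid       # feasible: try fewer, larger clusters
    return hierarchy.cut_tree(link, n_clusters=hi).reshape(-1) + 1

Sigma = np.cov(np.random.randn(200, 30), rowvar=False)
print(block_diag_groups(Sigma, max_block=10))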
def part2(computedTFIDF, showDendograms=False):
    startTime = time.time()
    runningTotalTime = 0
    print("Executing code for Part 2...\n")

    print("Creating and cutting single link clusters...")
    singleCluster = single(computedTFIDF.similarityMatrix)
    singleClusterCut = cut_tree(singleCluster,
                                n_clusters=[i for i in range(0, computedTFIDF.docCount - 1)])
    singleClusterTime = round(time.time() - startTime, 3)
    runningTotalTime += singleClusterTime
    print("Time: " + str(singleClusterTime) + " seconds")

    print("Creating list of single link clusters each document is contained in...")
    finalSingleClustering = singleClusterCut[len(singleClusterCut) - 1]
    documentClusters = createDocumentCluster(finalSingleClustering, computedTFIDF)
    singleTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += singleTrackingTime
    print("Time: " + str(singleTrackingTime) + " seconds")

    print("Writing single link clusters to file...")
    writeToFile(documentClusters, 'single.txt')
    singleWritingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += singleWritingTime
    print("Time: " + str(singleWritingTime) + " seconds")

    print("Creating and cutting complete link clusters...")
    completeCluster = complete(computedTFIDF.similarityMatrix)
    completeClusterCut = cut_tree(completeCluster,
                                  n_clusters=[i for i in range(0, computedTFIDF.docCount - 1)])
    completeClusterTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += completeClusterTime
    print("Time: " + str(completeClusterTime) + " seconds")

    print("Creating list of complete link clusters each document is contained in...")
    finalCompleteClustering = completeClusterCut[len(completeClusterCut) - 1]
    completeDocumentClusters = createDocumentCluster(finalCompleteClustering, computedTFIDF)
    completeTrackingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += completeTrackingTime
    print("Time: " + str(completeTrackingTime) + " seconds")

    print("Writing complete link clusters to file...")
    writeToFile(completeDocumentClusters, 'complete.txt')
    completeWritingTime = round(time.time() - startTime - runningTotalTime, 3)
    runningTotalTime += completeWritingTime
    print("Time: " + str(completeWritingTime) + " seconds")

    if showDendograms:
        # was displayDendogram(completeCluster, 'Single'): wrong matrix for the 'Single' plot
        displayDendogram(singleCluster, 'Single')
        displayDendogram(completeCluster, 'Complete')

    print('\nPart 2 Complete')
    print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
    return documentClusters, completeDocumentClusters
def fit(self, model, X, verbose):
    if verbose:
        print('Fitting variable:' + str(self.original_name))
    pdp, names = self._get_partial_dependence(model, X)
    self.pdp = pdp
    self.axes = names
    if pdp.ndim == 1:
        arr = np.reshape(pdp, (len(pdp), 1))
    else:
        arr = pdp
    self.Z = ward(arr)
    if pdp.shape[0] == 3:
        self.clusters = cut_tree(self.Z,
                                 height=self.Z[0, 2] - sys.float_info.epsilon)
        self.new_names = []
        for cluster in range(len(np.unique(self.clusters))):
            names = []
            for idx, c_val in enumerate(self.clusters):
                if c_val == cluster:
                    if idx == 0:
                        names.append('base')
                    else:
                        names.append(
                            self.dummy_names[idx - 1][len(self.original_name) + 1:])
            self.new_names.append(self.original_name + '_' + "_".join(names))
    elif pdp.shape[0] > 3:
        kneed = KneeLocator(range(self.Z.shape[0]), self.Z[:, 2],
                            direction='increasing', curve='convex')
        if kneed.knee is not None:
            self.clusters = cut_tree(
                self.Z, height=self.Z[kneed.knee + 1, 2] - sys.float_info.epsilon)
            self.new_names = []
            for cluster in range(len(np.unique(self.clusters))):
                names = []
                for idx, c_val in enumerate(self.clusters):
                    if c_val == cluster:
                        if idx == 0:
                            names.append('base')
                        else:
                            names.append(
                                self.dummy_names[idx - 1][len(self.original_name) + 1:])
                self.new_names.append(self.original_name + '_' + "_".join(names))
    return self
def fms_compare(XX, YY, npoints, plot_title, plot_save):
    # Clustering
    ZXc = hierarchy.linkage(XX, method=clustering_method)
    ZYc = hierarchy.linkage(YY, method=clustering_method)

    # Cut dendrogram to obtain a labelling for each k value
    # Warning: using hierarchy.cut_tree, but this function has a known bug!
    fms_dict = {}
    mean_dict = {}
    mean_dict[npoints] = 0
    varbound_dict = {}
    varbound_dict[npoints] = 0
    for i in range(1, npoints + 1):
        ZXc_cut = [l for sublist in hierarchy.cut_tree(ZXc, i) for l in sublist]
        ZYc_cut = [l for sublist in hierarchy.cut_tree(ZYc, i) for l in sublist]

        # Compute FM scores
        score = fms(ZXc_cut, ZYc_cut)
        fms_dict[i] = score

        # Compute moments for plotting and analysis
        c = contingency_matrix(ZXc_cut, ZYc_cut, sparse=True)
        tk = np.dot(c.data, c.data) - npoints
        pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - npoints
        qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - npoints
        pk2 = (np.sum(np.asarray(c.sum(axis=0)).ravel() ** 3)
               - 3 * np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2)
               + 2 * np.sum(np.asarray(c.sum(axis=0)).ravel()))
        qk2 = (np.sum(np.asarray(c.sum(axis=1)).ravel() ** 3)
               - 3 * np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2)
               + 2 * np.sum(np.asarray(c.sum(axis=1)).ravel()))
        if i < npoints:
            mean = np.sqrt(pk * qk) / (npoints * (npoints - 1))
            mean_dict[i] = mean
            variance = ((2 / (npoints * (npoints - 1)))
                        + ((4 * pk2 * qk2)
                           / (npoints * (npoints - 1) * (npoints - 2) * pk * qk))
                        + (((pk - 2 - ((4 * pk2) / pk)) * (qk - 2 - ((4 * qk2) / qk)))
                           / (npoints * (npoints - 1) * (npoints - 2) * (npoints - 3)))
                        - ((pk * qk) / ((npoints ** 2) * ((npoints - 1) ** 2))))
            varbound_dict[i] = 2 * (variance ** 0.5)

    # Plot Bk and variance bounds
    lists = sorted(fms_dict.items())
    x, z = zip(*lists)
    upper = [mean_dict[i] + varbound_dict[i] for i in x]
    lower = [mean_dict[i] - varbound_dict[i] for i in x]
    means = [mean_dict[i] for i in x]
    # plt.plot(x, z)
    plt.scatter(x, z)
    plt.plot(x, upper)
    plt.plot(x, means)
    plt.plot(x, lower)
    plt.title(plot_title)
    plt.xlabel('# clusters')
    plt.ylabel('B_k')
    plt.savefig(path_fm_plot + plot_save + '.jpg')
    plt.clf()
def gapStatistic(original, data, Z, nref=3, maxClusters=15):
    gaps = np.zeros((len(range(1, maxClusters)),))
    # gapdf = pd.DataFrame({'clusterCount': [], 'gap': []})
    wkbs = np.zeros(len(range(1, maxClusters)))
    # minMaxArr = bounding_box(data)
    for gap_index, k in enumerate(range(1, maxClusters)):
        print('\ncalculating reference disp. K = ', k)
        # For n references, generate a random sample and get the clustering
        # result at each level k.
        # Holder for reference dispersion results
        bWkbs = np.zeros(nref)
        for i in range(nref):
            # print('i', i)
            # Create a new random reference set
            # randomReference = np.dot(np.random.random_sample(size=data.shape), maxx)
            # randomReference = generateRandomVecs(data.shape, minMaxArr)
            randomReference = generateRandomVecsBySampling(original, data.shape)
            # cluster it
            refZ = hierarchy.linkage(randomReference, 'average')
            refClusterLabels = hierarchy.cut_tree(refZ, k)
            refCenters, refClusters = clusterCenters(randomReference,
                                                     refClusterLabels, k)
            bWkbs[i] = np.log(Wk(refCenters, refClusters))
        # average Wk over the reference sample groups
        wkbs[gap_index] = sum(bWkbs) / nref

        # cluster the original data and compute its dispersion
        print('\ncalculating original disp.')
        clusterLabels = hierarchy.cut_tree(Z, k)
        # print('clusterLabels', np.unique(clusterLabels))
        centers, clusters = clusterCenters(data, clusterLabels, k)

        # Calculate the gap statistic
        gap = wkbs[gap_index] - np.log(Wk(centers, clusters))
        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap
        # gapdf = gapdf.append({'clusterCount': k, 'gap': gap}, ignore_index=True)

    optK = gaps.argmax() + 1
    return optK
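# gapStatistic relies on helpers that are not shown in this snippet. A minimal
# sketch of what clusterCenters and Wk might look like, assuming Wk is the
# pooled within-cluster dispersion from Tibshirani et al.'s gap statistic
# (here computed as the sum of squared distances to each cluster centroid):
import numpy as np

def clusterCenters(data, labels, k):
    """Return ({cluster_id: centroid}, {cluster_id: member rows})."""
    labels = np.asarray(labels).ravel()
    clusters = {c: data[labels == c] for c in range(k)}
    centers = {c: members.mean(axis=0) for c, members in clusters.items()}
    return centers, clusters

def Wk(centers, clusters):
    """Sum of squared distances of points to their cluster centroid."""
    return sum(((members - centers[c]) ** 2).sum()
               for c, members in clusters.items())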
def worker_cut_tree(nClu, Z=None):
    print(nClu, end=' ')  # was a Python 2 trailing-comma print
    if nClu <= 1:
        res = None
    else:
        res = sphier.cut_tree(Z, n_clusters=nClu)
    return res
def filter_points(gdf, min_dist=0, remove="first"):
    """Filter points in a geodataframe using a minimum distance buffer.

    Args
    ----
    gdf : Geopandas GeoDataFrame
        Containing point geometries.

    min_dist : int or float, optional (default=0)
        Minimum distance by which to filter out closely spaced points.

    remove : str, optional (default='first')
        Optionally choose to remove 'first' occurrences or 'last' occurrences.

    Returns
    -------
    xy : 2d array-like
        Numpy array of filtered coordinates.
    """
    xy = gdf.geometry.bounds.iloc[:, 0:2]
    Z = linkage(xy, "complete")
    # cut_tree returns an (n, 1) array; flatten it before column assignment
    tree_thres = cut_tree(Z, height=min_dist).ravel()
    gdf["tree_thres"] = tree_thres

    if remove == "first":
        gdf = gdf.groupby(by="tree_thres").first()
    elif remove == "last":
        gdf = gdf.groupby(by="tree_thres").last()

    return gdf
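# A minimal usage sketch for filter_points, assuming geopandas and shapely
# are installed; the coordinates below are made up for illustration.
import geopandas as gpd
from shapely.geometry import Point
from scipy.cluster.hierarchy import linkage, cut_tree

pts = gpd.GeoDataFrame(
    {'name': ['a', 'b', 'c', 'd']},
    geometry=[Point(0, 0), Point(0.1, 0.1), Point(5, 5), Point(9, 9)],
)
# keep one point per group of points closer than 1 unit (complete linkage)
filtered = filter_points(pts, min_dist=1, remove="first")
print(filtered)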
def doHierachicalClustering(matrixAudioDataTransformed, threshold=0.995):
    global distanceMatrix
    from scipy.cluster import hierarchy as h
    from scipy.spatial import distance as dist

    distanceFunction = 'cosine'  # canberra, cityblock, braycurtis, euclidean
    linkageType = 'average'      # single, complete, weighted, average
    print("Distance function:", distanceFunction)
    print("Linkage type:", linkageType)

    tic = time.perf_counter()  # time.clock() was removed in Python 3.8
    distanceMatrix = dist.pdist(matrixAudioDataTransformed, distanceFunction)
    clusters = h.linkage(distanceMatrix, linkageType)
    c, d = h.cophenet(clusters, distanceMatrix)  # cophenetic correlation coefficient
    toc = time.perf_counter()
    print("Cophenet factor:", c)
    print("time:", toc - tic)

    # THRESHOLD = 0.995
    # THRESHOLD = 0.92
    cutTree = h.cut_tree(clusters, height=threshold)
    return cutTree
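# Usage sketch for doHierachicalClustering with random data standing in for
# the transformed audio features; the shape (50 clips x 12 features) is
# invented for illustration.
import time
import numpy as np

features = np.random.rand(50, 12)
labels = doHierachicalClustering(features, threshold=0.5)
print(labels.ravel()[:10])  # cut_tree returns an (n, 1) array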
def process_hierarchy(inf, h, method):
    df = pd.read_csv(inf, header=0, index_col=0)
    df = df.fillna(0)
    strains = df.index
    df = 1 - (df / 100)
    # flatten matrix to a condensed distance vector
    df_v = ssd.squareform(df, force='tovector', checks=False)
    if method == 'single':
        li = sch.single(df_v)
    elif method == 'complete':
        li = sch.complete(df_v)
    elif method == 'average':
        li = sch.average(df_v)
    elif method == 'weighted':
        li = sch.weighted(df_v)
    else:
        print('\nERROR: Please enter a valid clustering method\n')
        sys.exit()
    # cluster OFUs by cutting the dendrogram at height h
    # (percent identity expressed as a decimal, for example)
    hclus = cut_tree(li, height=h)
    hclus = pd.DataFrame(hclus, index=strains)
    # cut_tree names the first cluster "0"; bump all IDs by +1
    # (.ix was removed from pandas, use .iloc instead)
    hclus.iloc[:, 0] += 1
    return hclus
def aggregate_panels(panels: List[PanelProfile]) -> List[PanelGroupProfile]:
    """Group a list of Panels into a list of PanelGroupProfiles.

    :param panels: a list of Panels
    :return: a list of PanelGroupProfiles
    """
    vertices = list()
    for p in range(len(panels)):
        panel = panels[p]
        points = panel.points_rc
        x, y = points[0][1], points[0][0]
        w, h = points[1][1] - x, points[1][0] - y
        vertices.extend([[i, j, p]
                         for i in range(x, x + w, 5)
                         for j in range(y, y + h, 5)])
        vertices.extend([[x, y + h, p], [x + w, y, p], [x + w, y + h, p]])

    linkage_matrix = linkage(np.array(vertices)[:, :2],
                             method="single", metric="chebyshev")
    ctree = cut_tree(linkage_matrix, height=[10])
    cluster = [x[0] for x in ctree]

    panel_groups = list()
    for group in range(max(cluster) + 1):
        vertices_group = [vertices[i] for i in range(len(vertices))
                          if cluster[i] == group]
        contour = cv2.convexHull(np.array([x[:2] for x in vertices_group]))
        panel_index = set([x[2] for x in vertices_group])
        panel_group = PanelGroupProfile(contour=contour)
        for i in panel_index:
            panel_group.add_panel(panels[i])
        panel_groups.append(panel_group)
    return panel_groups
def classbyUPGMA(self, obstimes, trainingtimes, obsnodes):
    self._classorder = ["C1"]
    self._classscore["C1"] = score()
    # enumerate new pairs of nodes to predict and initialize the classes
    for u, v in itertools.combinations(obsnodes.keys(), 2):
        link = frozenset([u, v])
        self._classUnion.addPair(link)
        if link not in obstimes:
            self._classscore["C1"].addPair(link)

    learningperiods = dict()
    for link in obstimes:
        if link in trainingtimes:
            learningperiods[link] = obstimes[link] + trainingtimes[link]
        else:
            learningperiods[link] = obstimes[link]

    Y, self._label = classes.Makedistmatx(learningperiods, self._VandPparameter, 0, 0)
    self._linkage = hierarchy.average(Y)
    cutree = hierarchy.cut_tree(self._linkage, self._nbcluster)
    for i in range(self._nbcluster):
        self._classscore["C" + str(i + 2)] = score()
        self._classorder.append("C" + str(i + 2))
    for i in range(len(self._label)):
        u = self._label[i]
        self._classscore["C" + str(cutree[i][0] + 2)].addPair(u)
def problemTwo(X, cities, method, metric, height, plot):
    # Setting distance_threshold=0 ensures the full tree is computed.
    model = AgglomerativeClustering(linkage="single", affinity="euclidean",
                                    n_clusters=None, distance_threshold=0)
    model = model.fit(X)
    plt.rcParams['font.size'] = 12
    plt.figure(figsize=[8, 6])
    plt.title('Hierarchical Clustering Dendrogram (%s)' % method)
    # Label the first three levels of the dendrogram
    Z = linkage(X, method=method, metric=metric)
    dendrogram(Z, labels=cities, orientation="right")
    # plot_dendrogram(model, cities)
    plt.xlabel("Height")
    if plot:
        plt.show()
    sns.clustermap(X, method=method, metric=metric, figsize=[8, 6])
    plt.show()
    label = cut_tree(Z, height=height)
    label = label.reshape(label.size, )
    PCA(X=X, label=label, cities=cities, method=method, height=height)
def createCut(a, w, r, s, n):
    name = ','.join(map(str, [a, w, r, s, n]))

    with open(codebasesPath + codebaseName + "/analyser/similarityMatrix.json") as f:
        similarityMatrix = json.load(f)

    matrix = similarityMatrix["matrix"]
    # combine the four similarity measures using the given weights (percentages)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            matrix[i][j] = matrix[i][j][0] * a / 100 + \
                           matrix[i][j][1] * w / 100 + \
                           matrix[i][j][2] * r / 100 + \
                           matrix[i][j][3] * s / 100

    matrix = np.array(matrix)
    hierarc = hierarchy.linkage(y=matrix, method=linkageType)
    cut = hierarchy.cut_tree(hierarc, n_clusters=n)

    clusters = {}
    for i in range(len(cut)):
        if str(cut[i][0]) in clusters:
            clusters[str(cut[i][0])] += [entities[i]]
        else:
            clusters[str(cut[i][0])] = [entities[i]]

    clustersJSON = {}
    clustersJSON["clusters"] = clusters

    with open(codebasesPath + codebaseName + "/analyser/cuts/" + name + ".json",
              'w') as outfile:
        outfile.write(json.dumps(clustersJSON, indent=4))
def cut_dendrogram(hierarc, linkage_type, height):
    # EMPTY DICTIONARIES FOR REPEATED CUTS
    dict_class_cluster.clear()
    dict_cluster_controller_access.clear()
    dict_ctrl_class_percentage.clear()

    cut = hierarchy.cut_tree(hierarc, height=height)

    list_unique_classes = sorted(list(get_all_controller_classes()))
    for i in range(0, len(list_unique_classes)):
        dict_class_cluster.setdefault(cut[i][0], []).append(list_unique_classes[i])

    ###################################################
    # Statistics generation
    f = open('statistics_2_' + linkage_type + '.txt', 'w')
    print("-----Clusters from Dendrogram-----", file=f)
    for key, value in dict_class_cluster.items():
        print(key, value, file=f)
    f.close()

    # Calculate percentages
    calculate_cluster_controller_access(linkage_type)
    calculate_controller_percentage_classes(linkage_type)

    return dict_cluster_controller_access
def extract_cell_name_from_clusters(self, n_clusters=5):
    Z = self.Z
    labels = list(self.df.columns)
    # cut_tree returns an (n, 1) array; the rows are iterated below
    # (the original had a no-op clusters.reshape(...) whose result was discarded)
    clusters = cut_tree(Z, n_clusters=n_clusters)

    grouped = {}
    for i, cluster in enumerate(clusters):
        if str(cluster[0]) not in grouped:
            grouped[str(cluster[0])] = []
        grouped[str(cluster[0])].append(labels[i])

    grouped_inv = {}
    for (k, v) in grouped.items():
        for cell_name in v:
            grouped_inv[cell_name] = k

    df = pd.DataFrame(data=map(
        lambda cell_cluster: [cell_cluster[0], cell_cluster[1]],
        grouped_inv.items(),
    ))
    df.columns = ["cell_name", "cluster"]
    return df
def clustering(pt, kmin, kmax, distance, linkagem):
    """
    Description:
        Computes the clustering for the pathways matrix and returns a dataframe
        with the group assignments for k clusters from kmin to kmax.

    Inputs:
        pt      : Pathway scores.
        kmin    : Min number of groups in the clustering.
        kmax    : Max number of groups in the clustering.
        distance: Similarity metric for the clustering. Default is binomial
                  distance, but any distance from scipy.spatial.distance can
                  be used.
        linkagem: Linkage method. Default is complete.

    Outputs:
        df : Cluster assignment dataframe.
    """
    tinit = time.time()
    # Compute the distance matrix. pt is transposed because pdist takes rows
    # as input and we want to cluster the samples.
    if distance == "binomial":
        Y = pdist(pt.transpose(), binomial_dist)
    else:
        Y = pdist(pt.transpose(), distance)
    # Link the clusters using the distance matrix.
    Z = linkage(Y, linkagem)
    # Save the distance matrix. Note that the output of pdist is condensed!
    np.savetxt("dist_matrix.csv", squareform(Y), delimiter=",")

    # Build the output dataframe.
    df = pd.DataFrame()
    for k in range(kmin, kmax + 1):
        R = cut_tree(Z, k)  # Cut the tree at k groups.
        u = [item for sublist in R for item in sublist]
        df["X" + str(k)] = u
    df.index = pt.columns
    tfin = time.time()
    print("Clustering runtime: ", tfin - tinit)
    return df.transpose()
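# Usage sketch for clustering() with a small pathways-by-samples DataFrame;
# "euclidean" is used so the example does not depend on the project's
# binomial_dist helper, and the sample names are invented.
import time
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, cut_tree

pt = pd.DataFrame(np.random.rand(30, 8),
                  columns=[f"sample_{i}" for i in range(8)])
assignments = clustering(pt, kmin=2, kmax=4,
                         distance="euclidean", linkagem="complete")
print(assignments)  # rows X2..X4, one column per sample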
def generate_groups(linkage_matrix: np.array,
                    sample_ids: list or np.array,
                    n_groups: int):
    """
    Given the output of SimilarityMatrix (that is, the linkage matrix and an
    ordered list of sample IDs) and a desired number of groups, return a
    Pandas DataFrame of sample IDs and assigned group IDs, generated by
    cutting the linkage matrix in such a way that the desired number of
    groups is produced.

    Parameters
    ----------
    linkage_matrix: np.array
        Linkage matrix generated from EvaluateBatchEffects.similarity_matrix
        (using scipy.cluster.hierarchy.linkage)
    sample_ids: list or np.array
        Ordered list of sample IDs generated from
        EvaluateBatchEffects.similarity_matrix
    n_groups: int
        Desired number of groups

    Returns
    -------
    Pandas.DataFrame
    """
    groups = pd.DataFrame({
        'sample_id': sample_ids,
        'group': list(map(lambda x: x + 1,
                          hierarchy.cut_tree(linkage_matrix, n_groups).flatten()))
    })
    groups = groups.sort_values('group')
    return groups
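# Usage sketch for generate_groups with a synthetic linkage matrix; the
# sample IDs here are invented for illustration.
import numpy as np
import pandas as pd
from scipy.cluster import hierarchy

X = np.random.rand(6, 3)
link = hierarchy.linkage(X, method='average')
print(generate_groups(link, [f"s{i}" for i in range(6)], n_groups=2))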
def get_optimal_clustering(self):
    X = range(20, 40)
    c = cut_tree(self.Z, height=X)  # one column of labels per cut height
    Y = np.apply_along_axis(self.unique_ele_count, axis=0, arr=c)
    # pick the cluster count that occurs most often across the heights,
    # then take the highest cut that yields it
    optimal_cluster_count = Counter(Y).most_common(1)[0][0]
    idx = max(np.where(Y == optimal_cluster_count)[0])
    return c[:, idx]
def get_partitions(self, num_parts, return_max=False):
    """Returns cluster partitions for the specified number of clusters.

    Args:
        num_parts (int): number of cluster partitions
        return_max (boolean): if true, return the maximum distance between
            body ids in one cluster

    Returns:
        (dict, dataframe): {cluster id: [body1, body2, ...]},
            "bodyid", "cluster id"
        optional (dict, dataframe, float, tuple): includes the maximum
            distance between bodies in a cluster and those body ids
    """
    from scipy.cluster.hierarchy import cut_tree
    partitions = cut_tree(self.cluster, n_clusters=num_parts)

    res = {}
    labels = list(partitions[:, 0])
    mapping = pd.DataFrame(list(zip(self.labels, labels)),
                           columns=["bodyid", "type"])
    for idx, label in enumerate(labels):
        if label not in res:
            res[label] = []
        res[label].append(self.labels[idx])

    if return_max:
        max_dist, max_pair = self._get_max_dist(res)
        return (res, mapping, max_dist, max_pair)
    return (res, mapping)
def get_partitions_dist_constraint(self, dist):
    """Returns cluster partitions for the specified distance constraint.

    Args:
        dist (float): maximum distance to allow between bodies in a cluster

    Returns:
        (dict, dataframe, float, tuple): {cluster id: [body1, body2, ...]},
            "bodyid", "cluster id", distance between the farthest bodies,
            farthest bodies in the same cluster
    """
    from scipy.cluster.hierarchy import cut_tree
    previous_result = None
    partitions = cut_tree(self.cluster)
    for colid in range(0, len(partitions[0, :])):
        labels = list(partitions[:, colid])
        mapping = pd.DataFrame(list(zip(self.labels, labels)),
                               columns=["bodyid", "type"])
        res = {}
        for idx, label in enumerate(labels):
            if label not in res:
                res[label] = []
            res[label].append(self.labels[idx])
        max_dist, max_pair = self._get_max_dist(res)
        if max_dist > dist:
            break
        previous_result = (res, mapping, max_dist, max_pair)
    return previous_result
def get_cluster_assignments_hac(linkage_matrix: np.ndarray, height: int):
    """Assigns clusters by cutting the HAC dendrogram at the specified height.

    Returns a list of cluster labels.
    """
    return [x[0] for x in cut_tree(linkage_matrix, height=height)]
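# Usage sketch for get_cluster_assignments_hac on a small random dataset.
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree

Z = linkage(np.random.rand(10, 2), method='ward')
print(get_cluster_assignments_hac(Z, height=1))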
def cluster_hierarchy(data, method, axis, metric='euclidean', n_clusters=None):
    """
    data : pandas.DataFrame
        Rectangular data
    method : str
        'single', 'centroid', 'median', 'ward'
    axis : int, optional
        Which axis to use to calculate linkage. 0 is rows, 1 is columns.
    metric : str, optional (default='euclidean')
        Distance metric to use.
    n_clusters : int, optional
        If given, return the index values ordered by the cut-tree labels;
        otherwise return the dendrogram leaf order.
    """
    data = data.copy()
    if axis == 1:
        data = data.T
    array = data.values
    _linkage = make_linkage(array, method, metric)
    if n_clusters is not None:
        cut_result = cut_tree(_linkage, n_clusters=n_clusters)
        df_cut = pd.DataFrame(cut_result.flatten())
        label = df_cut.iloc[:, 0].sort_values(ascending=True,
                                              inplace=False).index.values
        return data.index.values[label]
    # no_plot=True computes the dendrogram data structures without rendering
    _result = dendrogram(_linkage, no_plot=True)
    _reordered_index = data.index.values[_result['leaves']]
    return _reordered_index
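# cluster_hierarchy depends on a make_linkage helper that is not shown; a
# plausible minimal version plus a usage sketch, assuming make_linkage simply
# wraps scipy's linkage:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree

def make_linkage(array, method, metric):
    return linkage(array, method=method, metric=metric)

df = pd.DataFrame(np.random.rand(8, 4),
                  index=[f"row{i}" for i in range(8)])
print(cluster_hierarchy(df, method='ward', axis=0, n_clusters=3))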
def divide_clusters(global_distmat, original_cluster, max_cluster_size,
                    sub_clusters, min_cluster_count=2):
    local_to_global_map = {
        ind: global_ind
        for ind, global_ind in enumerate(original_cluster)
    }
    local_distmat = global_distmat[original_cluster, :][:, original_cluster]
    local_disttriu = mat2triu(local_distmat)
    local_tree = linkage(local_disttriu, method='complete')
    local_heights = local_tree[:, 2]
    # cut just below the merge that would leave fewer than min_cluster_count clusters
    local_cluster_by_cutoff = cut_tree(
        local_tree, height=local_heights[-(min_cluster_count - 1)])
    num_clusters = np.max(local_cluster_by_cutoff) + 1
    for cluster_ind in range(num_clusters):
        sub_cluster = np.where(local_cluster_by_cutoff == cluster_ind)[0]
        sub_cluster_size = len(sub_cluster)
        print('sub_cluster_size = {}'.format(sub_cluster_size))
        sub_cluster_global_ind = np.array(
            [local_to_global_map[x] for x in sub_cluster])
        if sub_cluster_size > max_cluster_size:
            # recurse until every sub-cluster fits the size limit
            divide_clusters(global_distmat, sub_cluster_global_ind,
                            max_cluster_size, sub_clusters)
        else:
            sub_clusters.append(sub_cluster_global_ind)
def make_hier_clusters(df, n):
    '''
    Given a dataframe of schools and attributes, use hierarchical clustering
    to find the most similar schools.
    '''
    vectors = df.iloc[:, 2:].values
    vectors = scale(vectors)
    dist_matrix = squareform(pdist(vectors, metric='cosine'))
    # note: linkage expects a condensed distance vector; passing the square
    # matrix makes scipy treat its rows as observations (and warn about it)
    link_matrix = linkage(dist_matrix, method='average')
    tree = cut_tree(link_matrix, n)
    return link_matrix, tree
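# Usage sketch for make_hier_clusters; the first two columns are assumed to
# be identifiers (as implied by df.iloc[:, 2:]), so two dummy columns and
# three made-up attribute columns are used here.
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, cut_tree

df = pd.DataFrame({'id': range(10),
                   'name': [f"school{i}" for i in range(10)]})
for col in ['attr_a', 'attr_b', 'attr_c']:
    df[col] = np.random.rand(10)
link_matrix, tree = make_hier_clusters(df, n=3)
print(tree.ravel())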
def consensus_clustering(consensus, n_components=5):
    """
    :param consensus: cells x cells consensus matrix
    :param n_components: number of clusters
    :return: cells x 1 labels
    """
    print('SC3 Agglomerative hierarchical clustering.')
    # condensed distance matrix
    cdm = dist.pdist(consensus)
    # hierarchical clustering (SC3: complete agglomeration + cutree)
    hclust = spc.complete(cdm)
    cutree = spc.cut_tree(hclust, n_clusters=n_components)
    labels = cutree.reshape(consensus.shape[0])
    # Below is the hclust code for the older version, FYI:
    # hclust = spc.linkage(cdm)
    # labels = spc.fcluster(hclust, n_components, criterion='maxclust')
    return labels, dist.squareform(cdm)
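# Usage sketch for consensus_clustering with a synthetic consensus matrix
# (symmetric, entries in [0, 1], unit diagonal), using the same module
# aliases the function assumes.
import numpy as np
from scipy.cluster import hierarchy as spc
from scipy.spatial import distance as dist

m = np.random.rand(12, 12)
consensus = (m + m.T) / 2
np.fill_diagonal(consensus, 1.0)
labels, dmat = consensus_clustering(consensus, n_components=3)
print(labels)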
def test_cut_tree():
    np.random.seed(23)
    nobs = 50
    X = np.random.randn(nobs, 4)
    Z = scipy.cluster.hierarchy.ward(X)
    cutree = cut_tree(Z)

    assert_equal(cutree[:, 0], np.arange(nobs))
    assert_equal(cutree[:, -1], np.zeros(nobs))
    assert_equal(cutree.max(0), np.arange(nobs - 1, -1, -1))

    assert_equal(cutree[:, [-5]], cut_tree(Z, n_clusters=5))
    assert_equal(cutree[:, [-5, -10]], cut_tree(Z, n_clusters=[5, 10]))
    assert_equal(cutree[:, [-10, -5]], cut_tree(Z, n_clusters=[10, 5]))

    nodes = _order_cluster_tree(Z)
    heights = np.array([node.dist for node in nodes])

    assert_equal(cutree[:, np.searchsorted(heights, [5])],
                 cut_tree(Z, height=5))
    assert_equal(cutree[:, np.searchsorted(heights, [5, 10])],
                 cut_tree(Z, height=[5, 10]))
    assert_equal(cutree[:, np.searchsorted(heights, [10, 5])],
                 cut_tree(Z, height=[10, 5]))
# Compute distance metrics on standardized data
# This will likely generate an error on most machines
# d = dist(new_data)

# Take a 10% sample
customers_sample = customers.iloc[::10, :]
new_data_sample = new_data.iloc[::10, :]

# Compute distance metrics on standardized data
# d = pdist(new_data_sample)  # not needed for ward

# Perform hierarchical clustering on distance metrics
c = linkage(new_data_sample, method='ward')

# Plot the dendrogram
dendrogram(c, get_leaves=True, labels=None)

# Cut at 9 segments
members = pd.DataFrame(cut_tree(c, n_clusters=9),
                       index=new_data_sample.index,
                       columns=['ClusterNumber'])

# Show the first 30 customers and a frequency table
members.iloc[0:30]
members.ClusterNumber.value_counts(sort=False)

# Show the profile of each segment
customers_sample_new = customers_sample.set_index(customers_sample.customer_id).iloc[:, 1:4]
customers_sample_new.groupby(members.ClusterNumber).mean()