def check_maxdists_Q_linkage(self, method):
    """Check ``maxdists`` on a linkage of the Q data set.

    Builds the linkage of ``hierarchy_test_data.Q_X`` with the given
    ``method`` and asserts that ``maxdists`` agrees with
    ``calculate_maximum_distances`` on that linkage to within an
    absolute tolerance of 1e-15.
    """
    linkage_matrix = linkage(hierarchy_test_data.Q_X, method)
    assert_allclose(
        maxdists(linkage_matrix),
        calculate_maximum_distances(linkage_matrix),
        atol=1e-15,
    )
def find_optimal_t_cluster(L, matrix_fun, num_of_clusters, max_iter=1000):
    """Search for a distance threshold that yields ``num_of_clusters`` clusters.

    Starting from 66% of the largest value returned by ``sch.maxdists(L)``
    (truncated to two decimals before scaling), the threshold is halved
    when the cut produces too few clusters and increased by half when it
    produces too many, until ``sch.fcluster(L, t, criterion="distance")``
    returns exactly ``num_of_clusters`` distinct labels.

    Parameters
    ----------
    L : linkage matrix passed to ``sch.maxdists`` and ``sch.fcluster``.
    matrix_fun : data passed to ``silhouette_score`` together with the
        predicted labels (scored with ``metric="cityblock"``).
    num_of_clusters : the number of distinct flat-cluster labels wanted.
    max_iter : upper bound on the number of thresholds tried. The search
        is a heuristic and is not guaranteed to hit the requested count
        (e.g. tied merge heights can make it unreachable), so the bound
        prevents an endless loop.

    Returns
    -------
    (arr, t_f) : ``arr`` is a list of ``(score, threshold)`` tuples, one per
        threshold tried (score rounded to 3 decimals, threshold to 4);
        ``t_f`` is the unrounded threshold that produced the match.

    Raises
    ------
    RuntimeError
        If no threshold producing ``num_of_clusters`` clusters is found
        within ``max_iter`` attempts, or the threshold can no longer move.
    """
    arr = []
    t_f = (floor(max(sch.maxdists(L)) * 100) / 100.0) * 0.66

    for _ in range(max_iter):
        predictions = sch.fcluster(L, t=t_f, criterion="distance").ravel()
        score = round(
            silhouette_score(matrix_fun, predictions, metric="cityblock"), 3)
        arr.append((score, round(t_f, 4)))

        num_of_c = len(set(predictions))
        if num_of_c == num_of_clusters:
            print("The best value for t is t =", round(t_f, 4))
            return arr, t_f

        step = t_f / 2
        if step == 0:
            # A zero threshold can never change again; stop instead of spinning.
            break
        if num_of_c < num_of_clusters:
            # Too few clusters: cut lower in the dendrogram.
            t_f = t_f - step
        else:
            # Too many clusters: cut higher in the dendrogram.
            t_f = t_f + step

    raise RuntimeError(
        "no threshold producing {} clusters found after {} attempts".format(
            num_of_clusters, len(arr)))
def check_maxdists_Q_linkage(self, method):
    """Check ``maxdists`` on a linkage of the Q data set.

    Builds the linkage of ``eo['Q-X']`` with the given ``method`` and
    asserts that ``maxdists`` agrees with ``calculate_maximum_distances``
    on that linkage to within an absolute tolerance of 1e-15.

    The observation matrix is passed to ``linkage`` directly, so no
    condensed distance matrix is computed here.
    """
    X = eo['Q-X']
    Z = linkage(X, method)
    MD = maxdists(Z)
    expectedMD = calculate_maximum_distances(Z)
    assert_allclose(MD, expectedMD, atol=1e-15)
def check_fcluster_monocrit(self, t, criterion):
    """Check ``fcluster`` with a ``maxdists`` monocrit on the Q data set.

    Runs ``fcluster(Z, criterion=criterion, t=t, monocrit=maxdists(Z))``
    on the default linkage of ``pdist(eo['Q-X'])`` and asserts that the
    resulting assignment is isomorphic to the expected one stored under
    ``eo['fcluster-<criterion>-<t>']``.
    """
    expected = np.int_(eo['fcluster-%s-%d' % (criterion, t)])
    Z = linkage(pdist(eo['Q-X']))
    assigned = fcluster(Z, criterion=criterion, t=t, monocrit=maxdists(Z))
    assert_(is_isomorphic(assigned, expected))
def test_maxdists_one_cluster_linkage(self):
    """Check ``maxdists`` on a linkage matrix that holds a single merge.

    The one-row linkage ``[[0, 1, 0.3, 4]]`` is fed to both ``maxdists``
    and ``calculate_maximum_distances``; the results must agree to within
    an absolute tolerance of 1e-15.
    """
    single_merge = np.asarray([[0, 1, 0.3, 4]], dtype=np.double)
    assert_allclose(
        maxdists(single_merge),
        calculate_maximum_distances(single_merge),
        atol=1e-15,
    )
def check_fcluster_maxclust_monocrit(self, t):
    """Check ``fcluster`` with ``criterion='maxclust_monocrit'`` for ``t``.

    Uses the single-linkage of ``hierarchy_test_data.Q_X`` with
    ``maxdists`` as the monotonic criterion, and asserts that the result
    is isomorphic to ``hierarchy_test_data.fcluster_maxclust[t]``.
    """
    expected = hierarchy_test_data.fcluster_maxclust[t]
    Z = single(hierarchy_test_data.Q_X)
    assigned = fcluster(
        Z,
        t,
        criterion='maxclust_monocrit',
        monocrit=maxdists(Z),
    )
    assert_(is_isomorphic(assigned, expected))
# Plot the clustermap # Save the returned object for further plotting mclust = sns.clustermap(m, linewidths=0, cmap=plt.get_cmap('RdBu'), vmax=1, vmin=-1, figsize=(14, 14), row_linkage=l, col_linkage=l) # In[ ]: # Threshold 1: median of the # distance thresholds computed by scipy t = np.median(hierarchy.maxdists(l)) # In[ ]: # Plot the clustermap # Save the returned object for further plotting mclust = sns.clustermap(m, linewidths=0, cmap=plt.get_cmap('RdBu'), vmax=1, vmin=-1, figsize=(12, 12), row_linkage=l, col_linkage=l) # Draw the threshold lines
# Fit the scaler on `df` and keep the transformed values for clustering.
scaler.fit(df)
data = scaler.transform(df)

# Apply hierarchical clustering using the "average" linkage method
z = linkage(data, "average")

# Calculate the cophenetic correlation coefficient of the linkage against
# the pairwise distances of the scaled data
c, coph_dists = cophenet(z, pdist(data))
print('Cophenet Correlation coefficient = ', c)
print('Cophenet pairwise distances = ', coph_dists)

# Print the first row of the linkage matrix (the first merge performed)
print("1st Cluster is ", z[0])

# Distance array returned by maxdists for the linkage
m = maxdists(z)
print("Distance Array ", m)

# Plot the full dendrogram
plt.figure(figsize=(30, 15))
plt.title('Dendogram for Flag data')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=10.,  # font size for the x axis labels
)
plt.show()

# Plot a truncated dendrogram showing the last 12 cluster iterations
ax.set_aspect(1./ax.get_data_ratio()) #### figure 2 ##### uniqueLabels = np.sort(np.unique(case1Labels)) centroids = np.array([case1[np.where(case1Labels == i)[0],:].mean(axis=0) for i in uniqueLabels]) fig = plt.figure() ax = fig.add_subplot(111) ncluster = 27 y = pdist(centroids) method = 'centroid'#'average' z = hierarchy.linkage(y,'average') #t = hierarchy.fcluster(27,criterion='maxclust') ## computes the max distance between any cluster and ea non singleton cluster print 'max dists', hierarchy.maxdists(z) ## inconsistancy r = hierarchy.inconsistent(z) print 'r',r #print 'max inconsts', hierarchy.maxinconsts(z,r,i) print 'z',z #print 'blah', z[:,2] - np.array(z[1:,2].tolist()+[0]) print z[:,2] print np.hstack([z[1:,2],[0]]) levelDiffs = np.abs(z[:,2] - np.hstack([z[1:,2],[0]])) levelDiffMeans = z[:,2]# - 0.001 #np.hstack([z[1:,2],[0]]) / 2.0#z[:,2] + np.hstack([z[1:,2],[0]]) / 2.0 print 'diffs',levelDiffs*100 diffInds = np.argsort(levelDiffs)
def plot_corr_cluster(m, method=1, **kargs):
    """Draw a clustermap of ``m`` and outline the clusters found at a threshold.

    The rows and columns are ordered with an average-linkage clustering of
    ``m``. A cut height is derived from that linkage, drawn as a green line
    on both dendrograms, and every flat cluster with more than one member
    is framed with a green rectangle on the heatmap.

    Parameters
    ----------
    m : matrix to plot; the code reads ``m.shape[0]`` and ``m.index``
        (NOTE(review): presumably a square pandas DataFrame -- confirm).
    method : how the cut height is chosen:
        1 -- 0.7 times the largest value in column 2 of the linkage,
        2 -- median of ``hierarchy.maxdists`` of the linkage,
        3 -- 0.75 quantile (``mquantiles``) of ``hierarchy.maxdists``.
    **kargs : forwarded unchanged to ``sns.clustermap``.

    Raises
    ------
    RuntimeError
        If ``method`` is not 1, 2 or 3.
    """
    sns.set_style('white')
    linkage_matrix = fst.linkage(m, method='average')

    if method == 1:
        # Threshold 1: MATLAB-like behavior
        threshold = 0.7 * max(linkage_matrix[:, 2])
    elif method == 2:
        threshold = np.median(hierarchy.maxdists(linkage_matrix))
    elif method == 3:
        threshold = mquantiles(hierarchy.maxdists(linkage_matrix), prob=0.75)[0]
    else:
        raise RuntimeError('no such method')

    # Keep the returned grid: its axes and reordered data are needed below.
    grid = sns.clustermap(
        m,
        linewidths=0,
        cmap=plt.get_cmap('RdBu'),
        vmax=1,
        vmin=-1,
        row_linkage=linkage_matrix,
        col_linkage=linkage_matrix,
        **kargs
    )

    # Mark the cut height on both dendrograms.
    grid.ax_col_dendrogram.hlines(
        threshold, 0, m.shape[0] * 10, colors='g', linewidths=2, zorder=1)
    grid.ax_row_dendrogram.vlines(
        threshold, 0, m.shape[0] * 10, colors='g', linewidths=2, zorder=1)

    # Flat clusters obtained by cutting the linkage at the threshold.
    clusters = hierarchy.fcluster(linkage_matrix, threshold, 'distance')
    for label in set(clusters):
        members = m.index[clusters == label]
        # Positions of this cluster's members in the reordered matrix.
        positions = [
            pos for pos in range(m.shape[0])
            if grid.data2d.columns[pos] in members
        ]
        # Singletons are not outlined.
        if len(positions) == 1:
            continue
        side = len(positions)
        corner = (min(positions), m.shape[0] - max(positions) - 1)
        frame = patches.Rectangle(
            corner, side, side, facecolor='none', edgecolor='g', lw=3)
        grid.ax_heatmap.add_patch(frame)

    plt.title('Cluster matrix')