def test_dendrogram_colors(self): # Tests dendrogram plots with alternate colors Z = linkage(hierarchy_test_data.ytdist, 'single') set_link_color_palette(['c', 'm', 'y', 'k']) R = dendrogram(Z, no_plot=True, above_threshold_color='g', color_threshold=250) set_link_color_palette(['g', 'r', 'c', 'm', 'y', 'k']) color_list = R['color_list'] assert_equal(color_list, ['c', 'm', 'g', 'g', 'g'])
def cluster(data):
    """Return complete-linkage clusterings of the rows and columns of ``data``.

    Parameters
    ----------
    data : array-like or DataFrame exposing ``.T``
        Observations in rows, features in columns.

    Returns
    -------
    (row_clusters, col_clusters)
        Two SciPy linkage matrices, one for the rows of ``data`` and one for
        its columns.

    Notes
    -----
    ``sch.linkage`` expects either raw observations or a *condensed*
    distance vector.  The previous implementation passed the square form of
    the distance matrix, which ``linkage`` interprets as observation
    vectors, so the clustering was computed on "distances between distance
    rows".  The condensed output of ``pdist`` is now passed directly.
    """
    # Side effect kept from the original: dendrograms drawn later by the
    # caller pick their below-threshold colours from this global palette.
    sch.set_link_color_palette(['black'])

    row_clusters = sch.linkage(distance.pdist(data), method='complete')
    col_clusters = sch.linkage(distance.pdist(data.T), method='complete')
    return row_clusters, col_clusters
def hierarchical(self, lst, fulldataset):
    """Average-linkage clustering of the samples, plotted as a dendrogram.

    The dendrogram is saved to ``'Average Hierarchical Clustering.pdf'``.
    Leaf labels are taken from ``self.newlist`` and coloured red for names
    starting with ``'cancer'`` and blue for names starting with ``'normal'``.
    Afterwards ``self.labels`` holds one 0/1 class label per sample
    (1 = cancer) and ``self.score`` the silhouette score of those labels.

    Parameters
    ----------
    lst
        Passed to ``self.classLabel`` to obtain the per-sample class names.
    fulldataset
        Sequence of feature rows; it is transposed so that each sample
        becomes one observation.

    Fixes relative to the previous version
    --------------------------------------
    * ``zip(*fulldataset)`` is materialised: on Python 3 it is an iterator,
      which ``pdist`` cannot consume.
    * The silhouette score was computed on the linkage matrix ``Z`` (n - 1
      merge rows), with one label removed via a float index to force the
      lengths to match.  It is now computed on the sample matrix with one
      label per sample.
    * Tick labels that are neither cancer nor normal no longer raise
      ``KeyError``; they are drawn in black.
    """
    # Samples are coloured according to their sample type.
    label_color = {}
    for name in self.numbering(self.classLabel(lst)):
        if name[0:6] == 'cancer':
            label_color[name] = 'r'
        elif name[0:6] == 'normal':
            label_color[name] = 'b'

    # One row per sample (the input holds one row per feature).
    samples = np.array(list(zip(*fulldataset)))
    Y = pdist(samples)

    # Average linkage is applied.
    Z = linkage(Y, method='average')
    sch.set_link_color_palette(['black'])
    sch.dendrogram(Z, leaf_font_size=6, labels=self.newlist)

    # The dendrogram is drawn on the current axes; recolour its tick labels.
    ax = plt.gca()
    for lbl in ax.get_xmajorticklabels():
        lbl.set_color(label_color.get(lbl.get_text(), 'k'))
    plt.title("Average Hierarchical Clustering Algorithm")
    plt.savefig('Average Hierarchical Clustering.pdf', dpi=500)
    plt.close()

    # Silhouette test: samples are converted into 0/1 labels for validation.
    self.labels = np.array(
        [1.0 if kind == 'cancer' else 0.0 for kind in self.classLabel(lst)])
    self.score = metrics.silhouette_score(samples, self.labels,
                                          metric='euclidean')
def dendrogramClusteringPlot(cls, linkageMatrix, labels, fileLocation):
    """Draw a labelled dendrogram for ``linkageMatrix`` and save it.

    A new figure is created, the dendrogram is drawn with the x-axis labels
    rotated by 270 degrees, and the figure is handed to ``cls.saveFigure``
    together with ``fileLocation``.  Nothing is returned.
    """
    from matplotlib import pyplot as plt
    import scipy.cluster.hierarchy as sch
    import numpy as np

    # Single-entry palette combined with an infinite colour threshold:
    # intended to render every link in black.
    sch.set_link_color_palette(['black'])

    figure, _unused_axes = plt.subplots()
    # Extra bottom margin, presumably to make room for the rotated labels.
    figure.subplots_adjust(bottom=0.4)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.ylabel('Distance')

    dendrogram_options = {
        'labels': labels,
        'leaf_rotation': 270,  # rotates the x axis labels
        'color_threshold': np.inf,
    }
    sch.dendrogram(linkageMatrix, **dendrogram_options)

    plt.plot()
    cls.saveFigure(figure, fileLocation)
def plot_dendrogram(self, nolabels=True):
    r"""Plot the dendrogram of ``self.Z`` and return the dendrogram dict.

    Parameters
    ----------
    nolabels : bool, default True
        Forwarded to SciPy's ``dendrogram`` as ``no_labels``.

    Returns
    -------
    dict
        The structure returned by ``scipy.cluster.hierarchy.dendrogram``.

    Notes
    -----
    The global SciPy link colour palette is replaced by a fixed list of
    14 hex colours and is not restored afterwards.  The x-axis label is
    now a raw string: ``"\G"`` in a normal string literal is an invalid
    escape sequence (a warning today, an error in a future Python).  The
    text passed to Matplotlib is unchanged.
    """
    from scipy.cluster.hierarchy import dendrogram, set_link_color_palette
    import matplotlib.pyplot as plt

    cpool = ["#1F78B4", "#E31A1C", "#A6CEE3", "#FB9A99", "#7BCCC4",
             "#B2DF8A", "#33A02C", "#02818A", "#FF7F00", "#FDBF6F",
             "#CAB2D6", "#6A3D9A", "#BFD3E6", "#8C96C6"]
    set_link_color_palette(cpool)

    dendro = dendrogram(self.Z, no_labels=nolabels, count_sort=True,
                        orientation="left")

    plt.xlabel(r"Coincidence Metric ($\Gamma$)", fontsize=14)
    plt.ylabel("Clusters", fontsize=14)
    # Tick positions 1..0 are relabelled 0..1, i.e. the axis shows
    # 1 - position (presumably similarity rather than distance - confirm).
    plt.xticks([1, 0.8, 0.6, 0.4, 0.2, 0], [0, 0.2, 0.4, 0.6, 0.8, 1],
               fontsize=14)
    return dendro
def test_dendrogram_colors(self): # Tests dendrogram plots with alternate colors Z = linkage(hierarchy_test_data.ytdist, "single") set_link_color_palette(["c", "m", "y", "k"]) R = dendrogram(Z, no_plot=True, above_threshold_color="g", color_threshold=250) set_link_color_palette(["g", "r", "c", "m", "y", "k"]) color_list = R["color_list"] assert_equal(color_list, ["c", "m", "g", "g", "g"]) # reset color palette (global list) set_link_color_palette(None)
def hierarchical(df, cluster_cols=True, cluster_rows=False, n_col_clusters=False,
                 n_row_clusters=False, fcol=None, z_score=True, method='ward'):
    """Draw a heatmap of ``df`` with optional row/column dendrograms.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.  NOTE(review): NaN/inf entries of ``df`` itself are
        overwritten with 0 in place (kept from the original; callers may
        rely on it).
    cluster_cols, cluster_rows : bool
        Whether to cluster (and reorder) the columns / rows.
    n_col_clusters, n_row_clusters : int or False
        If set, separator lines are drawn so that the heatmap is split into
        roughly this many column / row clusters.
    fcol : mapping, optional
        Maps values of the ``'Group'`` index level to colours for the row
        colour bar.
    z_score : bool
        If true, each column has its median subtracted and is divided by its
        standard deviation before plotting.
    method : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``.

    Returns
    -------
    matplotlib.figure.Figure

    Fixes relative to the previous version
    --------------------------------------
    * The row dendrogram order was unconditionally replaced by the identity
      order, so ``cluster_rows=True`` never reordered the heatmap; the
      identity order is now only the fallback.
    * ``cluster_cols=False`` raised ``NameError`` (``col_denD`` undefined);
      an identity column order is now used.
    * ``linkage`` received square distance matrices, which it treats as
      observation vectors; the condensed ``pdist`` output is passed instead.
    * ``Axes.set_axis_bgcolor`` (removed from Matplotlib) is replaced by
      ``set_facecolor``; ``grid('off')`` (a truthy string) by
      ``grid(False)``.
    * The row colour bar read index level 0 instead of the ``'Group'`` level
      it had just looked up.
    * ``optimize_clusters`` could return an unbound ``edges``.
    """

    # helper for cleaning up axes by removing ticks, tick labels, frame, etc.
    def clean_axis(ax):
        """Remove ticks, tick labels, and frame from axis"""
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        ax.set_facecolor('#ffffff')
        for sp in ax.spines.values():
            sp.set_visible(False)

    def optimize_clusters(clusters, denD, target_n):
        """Search a distance threshold giving ``target_n`` leaf-order clusters.

        Returns the positions (in dendrogram leaf order) after which the
        flat-cluster id changes.  The search is heuristic and is not
        guaranteed to hit ``target_n``; the last attempt is returned.
        """
        target_n = target_n - 1  # We return edges; not regions
        threshold = np.max(clusters)
        max_iterations = threshold
        edges = []
        i = 0
        while i < max_iterations:
            cc = sch.fcluster(clusters, threshold, 'distance')
            cco = cc[denD['leaves']]
            edges = [n for n in range(cco.shape[0] - 1) if cco[n] != cco[n + 1]]
            n_clusters = len(edges)

            if n_clusters == target_n:
                break

            if n_clusters < target_n:
                threshold = threshold // 2
            elif n_clusters > target_n:
                threshold = int(threshold * 1.5)
            i += 1
        return edges

    dfc = df.copy()
    if z_score:
        dfc = (dfc - dfc.median(axis=0)) / dfc.std(axis=0)

    # Remove nan/infs
    dfc[np.isinf(dfc)] = 0
    dfc[np.isnan(dfc)] = 0

    # make norm: symmetric around zero, using the larger absolute extreme
    vmin = dfc.min().min()
    vmax = dfc.max().max()
    vmax = max([vmax, abs(vmin)])  # choose larger of vmin and vmax
    vmin = vmax * -1
    my_norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)

    # NOTE(review): mutates the caller's frame; kept for compatibility.
    df[np.isnan(df)] = 0
    df[np.isinf(df)] = 0

    # dendrogram single color
    sch.set_link_color_palette(['black'])

    # cluster (linkage wants the condensed distance vector)
    if cluster_rows:
        row_clusters = sch.linkage(distance.pdist(dfc), method=method)

    if cluster_cols:
        col_clusters = sch.linkage(distance.pdist(dfc.T), method=method)

    # heatmap with row names
    fig = plt.figure(figsize=(12, 12))
    heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                                  width_ratios=[0.25, 1], height_ratios=[0.25, 1])

    if cluster_cols:
        # col dendrogram
        col_denAX = fig.add_subplot(heatmapGS[0, 1])
        col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf)
        clean_axis(col_denAX)
    else:
        col_denD = {'leaves': list(range(0, dfc.shape[1]))}

    rowGSSS = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=heatmapGS[1, 0],
                                               wspace=0.0, hspace=0.0,
                                               width_ratios=[1, 0.05])

    if cluster_rows:
        # row dendrogram
        row_denAX = fig.add_subplot(rowGSSS[0, 0])
        row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf,
                                  orientation='right')
        clean_axis(row_denAX)
    else:
        row_denD = {'leaves': list(range(0, dfc.shape[0]))}

    row_leaves = row_denD['leaves']
    col_leaves = col_denD['leaves']

    # row colorbar
    if fcol and 'Group' in dfc.index.names:
        class_idx = dfc.index.names.index('Group')

        classcol = [fcol[x] for x in dfc.index.get_level_values(class_idx)[row_leaves]]
        classrgb = np.array([colorConverter.to_rgb(c) for c in classcol]).reshape(-1, 1, 3)
        row_cbAX = fig.add_subplot(rowGSSS[0, 1])
        row_cbAX.imshow(classrgb, interpolation='nearest', aspect='auto', origin='lower')
        clean_axis(row_cbAX)

    # heatmap
    heatmapAX = fig.add_subplot(heatmapGS[1, 1])
    heatmapAX.imshow(dfc.iloc[row_leaves, col_leaves], interpolation='nearest',
                     aspect='auto', origin='lower', norm=my_norm, cmap=cm.PuOr_r)
    clean_axis(heatmapAX)

    def tick_text(key):
        # MultiIndex entries are tuples; join their parts with spaces.
        return " ".join(str(t) for t in key) if isinstance(key, tuple) else str(key)

    # row labels (only when there are few enough to be readable)
    if dfc.shape[0] <= 100:
        heatmapAX.set_yticks(range(dfc.shape[0]))
        heatmapAX.yaxis.set_ticks_position('right')
        heatmapAX.set_yticklabels([tick_text(i) for i in dfc.index[row_leaves]])

    # col labels
    if dfc.shape[1] <= 100:
        heatmapAX.set_xticks(range(dfc.shape[1]))
        xlabelsL = heatmapAX.set_xticklabels(
            [tick_text(i) for i in dfc.columns[col_leaves]])
        # rotate labels 90 degrees
        for label in xlabelsL:
            label.set_rotation(90)

    # remove the tick lines
    for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
        l.set_markersize(0)

    heatmapAX.grid(False)

    if cluster_cols and n_col_clusters:
        edges = optimize_clusters(col_clusters, col_denD, n_col_clusters)
        for edge in edges:
            heatmapAX.axvline(edge + 0.5, color='k', lw=3)

    if cluster_rows and n_row_clusters:
        edges = optimize_clusters(row_clusters, row_denD, n_row_clusters)
        for edge in edges:
            heatmapAX.axhline(edge + 0.5, color='k', lw=3)

    return fig
# Replace the data points with their respective cluster value # (ex. 0) and is color coded with a colormap (plt.cm.spectral) plt.text(X1[i, 0], X1[i, 1], str(y1[i]), #(X1[i, 0] n_samples, X1[i, 1] n_features, str(y1[i]) The integer labels for cluster membership of each sample. color=plt.cm.nipy_spectral(agglom.labels_[i]/10), #/10 (or any number) change color of lables, idk why.. fontdict={'weight': 'bold', 'size': 9}) #This only plot the numbers (labels) of data # Remove the x ticks, y ticks, x and y axis plt.xticks([]) plt.yticks([]) #plt.axis('off') # Display the plot of the original data before clustering plt.scatter(X1[:, 0], X1[:, 1], marker='.') #With these, we combine labes and datapoints. plt.show() dist_matrix = distance_matrix(X1,X1) #Measures dist between all data, set x as rows and also columns, diagonal = 0. print(dist_matrix) Z = hierarchy.linkage(dist_matrix, "complete") dendro = hierarchy.dendrogram(Z) print(Z) #plot example from google hierarchy.set_link_color_palette(['m', 'c', 'y', 'k']) fig, axes = plt.subplots(1, 2, figsize=(8, 3)) dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y', orientation='top') dn2 = hierarchy.dendrogram(Z, ax=axes[1], above_threshold_color='#bcbddc', orientation='right') hierarchy.set_link_color_palette(None) # reset to default after use #plt.show()
ax.get_yaxis().set_ticks([]) for sp in ax.spines.values(): sp.set_visible(False) # make norm vmin = input_data.min().min() vmax = input_data.max().max() print("Range in data %f...%f" % (vmin, vmax)) vmax = max([vmax, abs(vmin)]) # choose larger of vmin and vmax vmin = vmax * -1 print("Normalised to %f...%f" % (vmin, vmax)) my_norm = mpl.colors.Normalize(vmin, vmax) # dendrogram single color sch.set_link_color_palette(['black']) # cluster row_pairwise_dists = distance.squareform(distance.pdist(input_data)) row_clusters = sch.linkage(row_pairwise_dists, method=config['method']) col_pairwise_dists = distance.squareform(distance.pdist(input_data.T)) col_clusters = sch.linkage(col_pairwise_dists, method=config['method']) progress(0.25) # heatmap with row names View = plt.figure(figsize=(12, 8)) heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0,
def clean_select_neighb(df_feature2):
    """Scale neighbourhood features and attach three cluster labellings.

    Steps, in order:

    1. keep rows with ``Count >= 10``;
    2. median-impute and min-max scale every column except ``loc_City``,
       ``Avg_Utility_Score`` and ``Count``;
    3. re-index by ``"<original index>, <loc_City>"`` (``loc_id``) and drop
       duplicated ids, keeping the first;
    4. complete-linkage dendrogram on the scaled data, cut at distance 1.0,
       from which a colour per leaf is derived (``labels_dendo``);
    5. PCA to 5 components, then K-Means and a Gaussian mixture with 7
       clusters each (``labels_KMeans``, ``labels_GMM``).

    Returns the scaled frame with the three label columns.

    Fix relative to the previous version: the leaf-colour reconstruction
    skips the colour ``'b'``, which was only SciPy's *default*
    above-threshold colour in older releases.  ``above_threshold_color='b'``
    is now passed explicitly so the filter works on every SciPy version.

    NOTE(review): the reconstruction still assumes each colour marks exactly
    one cluster whose leaves are contiguous in leaf order and that a cluster
    with k links has k + 1 leaves; single-leaf clusters and palette
    wrap-around (more than 11 clusters) would misalign colours and leaves.
    NOTE(review): ``Imputer`` no longer exists in recent scikit-learn
    releases; the import lives outside this block.
    """
    excluded = ['loc_City', 'Avg_Utility_Score', 'Count']

    # Keep only neighborhoods with 10 or more known coffee shops
    data_temp = df_feature2[df_feature2['Count'] >= 10]
    X = np.asarray(data_temp.loc[:, ~data_temp.columns.isin(excluded)])
    columns = [col for col in data_temp.columns if col not in excluded]

    imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    imputer = imputer.fit(X)
    X = imputer.transform(X)

    scaler = preprocessing.MinMaxScaler()
    minmax_scaled_df = scaler.fit_transform(X)
    minmax_scaled_df = pd.DataFrame(minmax_scaled_df, columns=columns,
                                    index=data_temp.index)
    minmax_scaled_df = pd.concat(
        [minmax_scaled_df,
         pd.DataFrame(data_temp['loc_City'], index=data_temp.index)],
        axis=1)
    minmax_scaled_df['loc_id'] = minmax_scaled_df.index + ', ' + minmax_scaled_df.loc_City
    minmax_scaled_df.drop(columns=['loc_City'], inplace=True)
    minmax_scaled_df.set_index('loc_id', drop=True, inplace=True)
    minmax_scaled_df = minmax_scaled_df.loc[~minmax_scaled_df.index.duplicated(keep='first')]

    samples = minmax_scaled_df.values
    labs = minmax_scaled_df.index
    set_link_color_palette(['teal', 'sandybrown', 'steelblue', 'firebrick',
                            'forestgreen', 'darkviolet', 'crimson', 'darkcyan',
                            'peru', 'indigo', 'darkorange'])
    mergings = linkage(samples, method='complete')

    # Apply dendrogram before applying PCA
    dendo = dendrogram(mergings,
                       labels=labs,
                       leaf_rotation=0,
                       leaf_font_size=14,
                       color_threshold=1.0,
                       above_threshold_color='b',
                       orientation='right',
                       no_plot=True)

    # Expand link colours into one colour per leaf: a cluster drawn with
    # k links of a colour is taken to contain k + 1 leaves.
    a = pd.DataFrame(dendo['color_list'])
    val_dict = dict(a.iloc[:, 0].value_counts())
    colors = a.iloc[:, 0].unique()
    color_list = []
    for color in colors:
        if color != 'b':
            color_list.extend([color] * (val_dict[color] + 1))

    neighb_colors = pd.DataFrame([color_list, dendo['ivl'], dendo['leaves']]).T
    neighb_colors.rename({0: 'color', 1: 'loc_id', 2: 'leaf'}, axis=1, inplace=True)
    neighb_colors = neighb_colors.set_index('loc_id', drop=True)

    # Apply PCA, keeping top 5 features
    pca = PCA(5)
    projected = pca.fit_transform(minmax_scaled_df.values)

    # Now, after PCA, apply K-Means and GMM clustering
    n_clusters = 7
    kmeans = KMeans(n_clusters, random_state=42)
    labels_kmeans = kmeans.fit(projected).predict(projected)

    gmm = GaussianMixture(n_components=n_clusters).fit(projected)
    labels_GMM = gmm.predict(projected)

    minmax_scaled_df['labels_GMM'] = labels_GMM
    minmax_scaled_df['labels_KMeans'] = labels_kmeans

    neighb_colors = neighb_colors[['color']]
    neighb_colors = neighb_colors.sort_index()

    # Merge with dendrogram output
    minmax_scaled_df = minmax_scaled_df.merge(neighb_colors, on='loc_id', how='outer')
    minmax_scaled_df.rename(columns={'color': 'labels_dendo'}, inplace=True)

    return minmax_scaled_df
VOTES_RAW = {i: PLYRS[i]['votes'] for i in PLYRS.keys()} (NAMES, VOTES) = (list(VOTES_RAW.keys()), list(VOTES_RAW.values())) mat = np.genfromtxt(path.join(PT_DTA, FN_DST), delimiter=',') if cst.ANONYMIZE: shuffle(NAMES) ############################################################################### # Process ############################################################################### print('(5) Plotting Dendrogram') dists = squareform(mat) linkage_matrix = linkage(dists, 'ward') ############################################################################### # Plot ############################################################################### (fig, ax) = plt.subplots() set_link_color_palette(['#ff006e', '#2614ed', 'k']) dend = dendrogram( linkage_matrix, labels=NAMES, orientation='right', above_threshold_color='#bcbddcA0', count_sort='descending' ) ax.set_aspect(.0025) plt.xticks([]) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.spines['bottom'].set_visible(False) ax.spines['left'].set_color('#ffffff') plt.title('Hierarchical Clustering\n', fontdict={'size': 18}) fig.savefig( path.join(PT_PLT, 'DN.png'),
def _save_left_dendrogram(link, threshold, leaf_labels, filename,
                          mark_threshold=True, xlim=None):
    """Draw one left-oriented dendrogram on a new 5x10 figure and save it."""
    fig, ax = plt.subplots(figsize=(5, 10))
    dendrogram(link, orientation='left', above_threshold_color="grey",
               color_threshold=threshold, labels=leaf_labels,
               show_leaf_counts=True, ax=ax)
    if mark_threshold:
        ax.axvline(threshold, color="grey", linestyle="--")
    if xlim is not None:
        ax.set_xlim(*xlim)
    plt.savefig(filename, dpi=600)


def plot_dendrograms(data, labels):
    """Save four Ward-linkage dendrograms as SVG files.

    Files written to the working directory: ``dendrogram-total.svg``,
    ``dendrogram-0.svg``, ``dendrogram-1.svg`` and ``dendrogram-t.svg``.

    Parameters
    ----------
    data : nested mapping
        Read at ``["crc"]["gene"]["f" | "fhb0" | "fhb1"]`` and
        ``["common"]["tumor"]["f"]``.
    labels : nested mapping
        Read at ``["crc"]["gene"]``, ``["crc"]["tumor"]`` and
        ``["ec"]["tumor"]``.

    Fix relative to the previous version: the function ignored its ``data``
    argument and read an undeclared global ``data_dict``; it now uses the
    argument.  The four copy-pasted plot sections are folded into one helper
    without changing what is drawn.  The global link palette is replaced by
    the ten ``tab10`` colours and the figures are left open, both as before.
    """
    cmap = mpl.cm.get_cmap("tab10")
    hierarchy.set_link_color_palette([
        mpl.colors.to_hex(cmap(idx)) for idx in range(10)
    ])

    def ward(matrix):
        return linkage(matrix, 'ward', optimal_ordering=True)

    linkage_gene_total = ward(data["crc"]["gene"]["f"])
    linkage_gene_0 = ward(data["crc"]["gene"]["fhb0"])
    linkage_gene_1 = ward(data["crc"]["gene"]["fhb1"])
    linkage_tumor = ward(data["common"]["tumor"]["f"])

    gene_labels = labels["crc"]["gene"]

    # Colour threshold = height of the second-to-last merge, i.e. the two
    # top-level branches get their own colours.
    _save_left_dendrogram(linkage_gene_total, linkage_gene_total[-2, 2],
                          gene_labels, "dendrogram-total.svg")

    _save_left_dendrogram(linkage_gene_0, linkage_gene_0[-2, 2],
                          gene_labels, "dendrogram-0.svg", xlim=(12, 0))

    # NOTE(review): the "1" tree is cut at the threshold of the "0" tree,
    # exactly as in the original - presumably so both panels share one cut
    # and one x-range; confirm this is intended.
    _save_left_dendrogram(linkage_gene_1, linkage_gene_0[-2, 2],
                          gene_labels, "dendrogram-1.svg", xlim=(12, 0))

    # NOTE(review): column 1 of a linkage row is a cluster index, not a
    # merge height (column 2); kept as in the original - confirm.
    tumor_labels = (["CRC" for _ in labels["crc"]["tumor"]]
                    + ["EC" for _ in labels["ec"]["tumor"]])
    _save_left_dendrogram(linkage_tumor, linkage_tumor[-1, 1],
                          tumor_labels, "dendrogram-t.svg",
                          mark_threshold=False)
bubble_data['cluster']) # and we attribute y-coordinate simply corresponding to the donor id bubble_data['y'] = -bubble_data['max % ADCC'].index.map(int) #%% ### FIGURE ### fig = plt.figure(figsize = (8,3.5)) ### DENDROGRAM ax1=plt.subplot(122) # we try to make the dendrogram a bit sexier than by default... hierarchy.set_link_color_palette(['purple', 'cornflowerblue', 'limegreen', 'gold', 'tomato']) # we need context manager to set the linewidth with plt.rc_context({'lines.linewidth': 2}): dend = dendrogram(Z, labels=data_cluster.index, orientation='right', leaf_font_size=7, leaf_rotation=0, above_threshold_color='lightgrey', color_threshold=32, ax=ax1) # a few more aesthetics
def my_clustermap(matrix,
                  thrs_row=1,
                  thrs_col=1,
                  distM=None,
                  row_cls=False,
                  col_cls=True,
                  return_fig=False,
                  method='average',
                  fig_sz=(8, 8),
                  colnames=None,
                  rownames=None,
                  cls_info=False):
    """Plot ``matrix`` as a heatmap with row and/or column dendrograms.

    The figure is always saved as
    ``"Dec_corr_MASKING_sort1_<method>.png"`` in the working directory.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Values to plot.  NOTE(review): the hard-coded ``list_to_mask`` rows
        and columns are zeroed **in the caller's object** via ``.loc``, so a
        plain ndarray is not actually supported at that point and every
        listed label must exist in both index and columns.
    thrs_row, thrs_col : float
        Colour / flat-cluster distance thresholds for rows and columns.
    distM : condensed distance vector, optional
        Used instead of ``pdist`` for whichever dendrograms are requested.
    row_cls, col_cls : bool
        Which dendrograms to draw; at least one must be true.
    return_fig : bool
        Accepted but unused (kept for interface compatibility).
    method : str
        Linkage method.
    fig_sz : tuple
        Figure size.
    colnames, rownames : array-like of str, optional
        Tick labels; default to the frame's columns/index or to positions.
    cls_info : bool
        If true, return a dict with leaf orders and flat cluster ids.

    Returns
    -------
    dict or None
        ``cls_dic`` when ``cls_info`` is true, otherwise ``None``.

    Raises
    ------
    ValueError
        If both ``row_cls`` and ``col_cls`` are false.

    Fixes relative to the previous version: the colorbar axes were created
    and drawn twice on the same rectangle (now once); the error message
    spelled "True" as "Ture"; a redundant intermediate array copy is gone.
    """
    colors = sns.color_palette("Set2", 25)
    colors = [mat_col.rgb2hex(color) for color in colors]
    set_link_color_palette(colors)

    if colnames is None:
        try:
            colnames = np.array(matrix.columns, str)
        except AttributeError:
            colnames = np.array(range(0, matrix.shape[1]), str)
    if rownames is None:
        try:
            rownames = np.array(matrix.index, str)
        except AttributeError:
            rownames = np.array(range(0, matrix.shape[0]), str)

    if row_cls:
        if distM is None:
            D_row = scipy.spatial.distance.pdist(matrix)
        else:
            D_row = distM
    if col_cls:
        if distM is None:
            D_col = scipy.spatial.distance.pdist(matrix.T)
        else:
            D_col = distM

    fig = plt.figure(figsize=fig_sz)

    # Layout constants, in figure-fraction coordinates.
    lef = 0.01
    bot = 0.05
    h_sep = 0.2
    v_sep = 0.7
    row_leg = 0.01  # space for the legend of the rows plotted on the right side of the matrix

    if row_cls:
        if col_cls:
            # if want both row and column dendrogram
            mat_h = v_sep - 0.005 - bot
            mat_w = 0.9 - row_leg - h_sep
            den_h = 1 - v_sep - 0.005
            den_w = h_sep - 0.005 - lef

            # plot dendrogram for column clusters
            ax_col = fig.add_axes([h_sep, v_sep, mat_w, den_h])
            g_col = scipy.cluster.hierarchy.linkage(D_col, method=method)
            den_col = scipy.cluster.hierarchy.dendrogram(
                g_col, color_threshold=thrs_col, above_threshold_color='black')
            idx_col = den_col['leaves']
            ax_col.set_xticklabels([''])
        else:
            # if only want row dendrogram
            mat_h = 1 - bot * 2
            mat_w = 0.9 - 0.01 - h_sep
            den_w = h_sep - 0.005 - lef
            idx_col = list(range(0, matrix.shape[1]))

        # plot dendrogram for row clusters
        ax_row = fig.add_axes([lef, bot, den_w, mat_h])
        g_row = scipy.cluster.hierarchy.linkage(D_row, method=method)
        den_row = scipy.cluster.hierarchy.dendrogram(
            g_row,
            color_threshold=thrs_row,
            orientation='left',
            above_threshold_color='black')
        idx_row = den_row['leaves']
        ax_row.set_yticklabels([''])
        ax_mat = fig.add_axes([h_sep, bot, mat_w, mat_h])
    else:
        if col_cls:
            # if only want column clusters
            lef = lef + 0.04
            mat_h = v_sep - 0.005 - bot
            mat_w = 0.9 - row_leg - lef
            den_h = 1 - v_sep - 0.005
        else:
            plt.close()
            raise ValueError(
                "At least one of row_cls and col_cls has to be True.")

        # plot dendrogram for column clusters
        ax_col = fig.add_axes([lef, v_sep, mat_w, den_h])
        g_col = scipy.cluster.hierarchy.linkage(D_col, method=method)
        den_col = scipy.cluster.hierarchy.dendrogram(
            g_col, color_threshold=thrs_col, above_threshold_color='black')
        idx_col = den_col['leaves']
        idx_row = list(range(0, matrix.shape[0]))
        ax_col.set_xticklabels([''])
        ax_mat = fig.add_axes([lef, bot, mat_w, mat_h])

    # Colour-scale maximum: largest value after discarding entries above
    # 0.9999 (presumably the self-correlation diagonal - confirm).  This is
    # computed on a copy, before the masking below.
    matrix2 = np.array(matrix)
    matrix2[matrix2 > 0.9999] = np.nan
    maxval = np.nanmax(matrix2)

    list_to_mask = [
        'c11orf87', 'dgki', 'sept3', 'ppp1r16b', 'sez6l2', 'immp2l', 'fxr1',
        'otud7b', 'zswim6', 'ptn', 'ncan', 'tmtc1', 'nab2', 'kcnv1', 'r3hdm2',
        'chrna5', 'cyp17a1', 'gtdc1', 'srpk2', 'cacna1i', 'epc2', 'satb2',
        'srr', 'slc32a1', 'glt8d1', 'ftcdnl1', 'ogfod2', 'adamtsl3',
        'sdccag8', 'srebf2', 'plch2a', 'slc38a7', 'slc39a8', 'nfkb1', 'kif5c',
        'nxph4', 'dpyd', 'ngef', 'hapln4', 'apopt1', 'ina', 'mbd5', 'sybu',
        'kctd13', 'lrriq3', 'arl6ip4', 'klc1', 'c2orf82', 'nrgn', 'gatad2a',
        'nck1', 'grm3', 'fut9a', 'fes', 'galnt10', 'anp32e', 'slc35g2',
        'snx19', 'plcl1', 'c12orf65', 'bag5', 'tbc1d5', 'mdk', 'negr1',
        'pak6b', 'cntn4', 'ca8', 'man2a1', 'kcnj13', 'tcf20', 'stat6',
        'cnksr2', 'ckb', 'tcf4', 'shmt2', 'znf804a', 'sipa1l1', 'arl3',
        'tle1', 'doc2a', 'c10orf32', 'mad1l1'
    ]
    # Zero the listed rows and columns; the frame is printed before and
    # after (kept from the original).
    print(matrix)
    matrix.loc[list_to_mask] = 0
    matrix.loc[:, list_to_mask] = 0
    print(matrix)
    matrix = np.array(matrix)

    # Reorder by dendrogram leaf order, then rotate by 90 degrees for display.
    D = matrix[idx_row, :]
    D = D[:, idx_col]
    D2 = np.rot90(D)
    revlist = np.flipud(colnames[idx_col])

    # plot data matrix as a heatmap
    im = ax_mat.pcolormesh(D2, cmap=plt.cm.YlGnBu, vmin=0, vmax=maxval)
    ax_mat.xaxis.set_ticks_position('bottom')
    ax_mat.yaxis.set_ticks_position('right')
    ax_mat.set_xticks(list(np.asarray(list(range(0, matrix.shape[1]))) + 0.5))
    ax_mat.set_yticks(list(np.asarray(list(range(0, matrix.shape[0]))) + 0.5))
    ax_mat.set_xticklabels(colnames[idx_col], rotation=90, size=4)
    # NOTE(review): the y labels are the reversed *column* names, which is
    # only meaningful for a square, symmetric matrix.
    ax_mat.set_yticklabels(revlist, size=4)
    ax_mat.grid(False)

    # Plot colorbar.
    axcolor = fig.add_axes([0.94, bot, 0.02, mat_h])
    plt.colorbar(im, cax=axcolor)

    plt.savefig("Dec_corr_MASKING_sort1_" + method + ".png",
                bbox_inches='tight',
                dpi=600)

    if cls_info:
        cls_dic = {}
        if col_cls:
            cls_dic['col_ind'] = den_col['leaves']
            cls_dic['col_cls'] = scipy.cluster.hierarchy.fcluster(
                g_col, t=thrs_col, criterion='distance')
        if row_cls:
            cls_dic['row_ind'] = den_row['leaves']
            cls_dic['row_cls'] = scipy.cluster.hierarchy.fcluster(
                g_row, t=thrs_row, criterion='distance')
        return (cls_dic)
def heatmap(Mat, label, bool_sort, filename, threshold, heatmap_threshold):
    # Plot Mat as a red-white-blue heatmap and save it to `filename`.
    # (Python 2 code: uses print statements.)
    #
    # Mat               -- matrix-like; len(Mat) x len(Mat[0]).
    # label             -- list of tick labels.  NOTE: when bool_sort is true
    #                      this list is emptied in place (see `del xlabel[:]`).
    # bool_sort         -- if true, cluster with single linkage, draw two
    #                      dendrograms and reorder the matrix by leaf order.
    # filename          -- output path for the heatmap figure.
    # threshold         -- distance at which the clustering is cut/coloured.
    # heatmap_threshold -- value that should appear white in the colormap.
    #
    # Everything after the bare `return` near the middle of the function is
    # unreachable (histogram code disabled by the author).
    m = len(Mat)
    n = len(Mat[0])
    print m, n
    # xlabel and ylabel are both aliases of the caller's `label` list.
    xlabel = label
    ylabel = label
    fig = pylab.figure(figsize=(8, 8))
    if (bool_sort):
        Mat, Matvec = mat_to_vector(Mat)
        Mat_copy = copy.copy(Mat)
        Y = sch.linkage(Matvec, method='single')
        # Author's notes on thresholds that worked per linkage method:
        #   single: 1.0 or 0.5; average: 1.5; complete: 2.0 - 3.0
        clusters = sch.fcluster(Y, threshold, 'distance')
        print clusters
        for i in range(len(label)):
            print label[i] + " " + str(clusters[i])
        ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
        sch.set_link_color_palette(['k', 'k', 'k', 'k', 'c', 'm', 'g'])
        Z1 = sch.dendrogram(Y, orientation='right', color_threshold=threshold)
        matplotlib.pyplot.plot(
            [threshold, threshold], [0, 10 * len(label)],
            'k--')  # draws a dashed line where the dendrogram is cut.
        ax1.set_xticks([])
        ax1.set_yticks([])

        # Compute and plot second dendrogram.
        ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
        Z2 = sch.dendrogram(Y, color_threshold=threshold)
        matplotlib.pyplot.plot(
            [0, 10 * len(label)], [threshold, threshold],
            'k--')  # draws a dashed line where the dendrogram is cut.
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Plot distance matrix.
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
        idx1 = Z1['leaves']
        idx2 = Z2['leaves']
        print "#### index "
        for i in idx1:
            print i
        print "####"
        # Reorder rows and columns by dendrogram leaf order.
        Mat = Mat[idx1, :]
        Mat = Mat[:, idx2]
        xlabel_new = []
        clusters_new = []
        for i in range(len(idx2)):
            xlabel_new.append(xlabel[idx2[i]])
            clusters_new.append(clusters[idx2[i]])
        # NOTE(review): this empties the list object shared with `label` and
        # `ylabel`, i.e. the caller's list, before rebinding xlabel.
        del xlabel[:]
        xlabel = xlabel_new
        # Map each flat-cluster id to a space-separated string of its labels.
        cluster_dic = {}
        print "systems sorted:"
        for i in range(len(xlabel)):
            print xlabel[i] + " " + str(clusters_new[i])
            if clusters_new[i] in cluster_dic.keys():
                cluster_dic[clusters_new[i]] = cluster_dic[
                    clusters_new[i]] + " " + xlabel[i]
            else:
                cluster_dic[clusters_new[i]] = xlabel[i]
        for key in cluster_dic.keys():
            print "cluster " + str(key) + ":" + cluster_dic[key]
    else:
        Mat = mat_to_mat(Mat)
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])

    ## red - white - blue
    # The colour scale is clamped to [cmin, cmax]; white is placed at
    # heatmap_threshold, expressed as the fraction `mp` of that range.
    cmin = 0.5
    cmax = 3.0
    mp = (heatmap_threshold - cmin) / (
        cmax - cmin)  # midpoint is where the white will appear
    tol = 0.02
    if mp > 0.9 or mp < 0.1:
        # NOTE(review): the message prints `threshold`, but the value being
        # checked is derived from `heatmap_threshold`.
        print "threshold = " + str(threshold) + "is too high or low"
        exit()
    cdict = {
        'red': [(0.0, 1.0, 1.0), (mp - tol, 1.0, 1.0), (mp, 1.0, 1.0),
                (mp + tol, 0.7, 0.7), (1.0, 0.0, 0.0)],
        'green': [(0.0, 0.0, 0.0), (mp - tol, 0.7, 0.7), (mp, 1.0, 1.0),
                  (mp + tol, 0.7, 0.7), (1.0, 0.0, 0.0)],
        'blue': [(0.0, 0.0, 0.0), (mp - tol, 0.7, 0.7), (mp, 1.0, 1.0),
                 (mp + tol, 1.0, 1.0), (1.0, 1.0, 1.0)]
    }
    my_cmap = matplotlib.colors.LinearSegmentedColormap(
        'my_colormap', cdict, 100)
    im = axmatrix.imshow(Mat,
                         aspect='auto',
                         origin='lower',
                         interpolation='nearest',
                         cmap=my_cmap)
    if (bool_sort):
        # Mark the diagonal with small yellow dots.
        v = range(0, n)
        axmatrix.plot(v, v, 'yo', markersize=2)
    im.set_clim(cmin, cmax)
    axmatrix.set_xlim(-0.5, n - 0.5)
    axmatrix.set_ylim(-0.5, n - 0.5)
    axmatrix.set_xticks(range(0, m))
    axmatrix.set_xticklabels(xlabel)
    if (not bool_sort):
        axmatrix.set_yticks(range(0, n))
        axmatrix.set_yticklabels(ylabel)
        for i in range(0, n):
            labels = axmatrix.yaxis.get_major_ticks()[i].label
            labels.set_fontsize(3)
    else:
        axmatrix.set_yticks([])
    for i in range(0, m):
        labels = axmatrix.xaxis.get_major_ticks()[i].label
        labels.set_fontsize(3)
        labels.set_rotation('vertical')

    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.show()
    fig.savefig(filename, dpi=600)

    ## If you do not want to generate the histograms, keep this return.
    ## Everything below is currently unreachable.
    return
    cluster1_1 = []
    cluster2_2 = []
    cluster3_3 = []
    cluster1_2 = []
    cluster1_3 = []
    cluster2_3 = []
    ## The non-singleton clusters were identified by looking at the heatmap
    ## (for a threshold of 2.0); their flat-cluster ids are hard-coded here.
    clustnum1 = 8  # closed
    clustnum2 = 7  # intermediate
    clustnum3 = 13  # open
    clustname = ["closed", "intermediate", "open"]
    print "Number of sytems = " + str(len(xlabel))
    # Collect pairwise values for every within-cluster and between-cluster
    # combination of the three chosen clusters.
    # NOTE(review): `clusters` and `Mat_copy` only exist when bool_sort is true.
    for i in range(len(xlabel)):
        for j in range(i, len(xlabel)):
            if clusters[i] == clustnum1 and clusters[j] == clustnum1 \
                    or clusters[j] == clustnum1 and clusters[i] == clustnum1:
                cluster1_1.append(Mat_copy[i, j])
            elif clusters[i] == clustnum2 and clusters[j] == clustnum2 \
                    or clusters[j] == clustnum2 and clusters[i] == clustnum2:
                cluster2_2.append(Mat_copy[i, j])
            elif clusters[i] == clustnum3 and clusters[j] == clustnum3 \
                    or clusters[j] == clustnum3 and clusters[i] == clustnum3:
                cluster3_3.append(Mat_copy[i, j])
            elif clusters[i] == clustnum1 and clusters[j] == clustnum2 \
                    or clusters[j] == clustnum1 and clusters[i] == clustnum2:
                cluster1_2.append(Mat_copy[i, j])
            elif clusters[i] == clustnum1 and clusters[j] == clustnum3 \
                    or clusters[j] == clustnum1 and clusters[i] == clustnum3:
                cluster1_3.append(Mat_copy[i, j])
            elif clusters[i] == clustnum2 and clusters[j] == clustnum3 \
                    or clusters[j] == clustnum2 and clusters[i] == clustnum3:
                print clusters[i], clusters[j], Mat_copy[i, j]
                cluster2_3.append(Mat_copy[i, j])
    cluster1_1_sci = array_to_vector(cluster1_1)
    cluster1_2_sci = array_to_vector(cluster1_2)
    cluster1_3_sci = array_to_vector(cluster1_3)
    cluster2_2_sci = array_to_vector(cluster2_2)
    cluster2_3_sci = array_to_vector(cluster2_3)
    cluster3_3_sci = array_to_vector(cluster3_3)

    # First histogram figure: one small normalised histogram per combination.
    fig = pylab.figure(figsize=(8, 8))
    inbins = numpy.linspace(0, 4, 50)
    pbins = numpy.linspace(0.05, 3.95, 49)
    axis = fig.add_axes([0.1, 0.1, 0.3, 0.1])
    n1_1, bins, patches = axis.hist(cluster1_1_sci,
                                    inbins,
                                    normed=1,
                                    histtype='bar')
    axis.set_xlim(0.0, 4.0)
    axis.set_ylim(0.0, 5.0)
    axis = fig.add_axes([0.1, 0.3, 0.3, 0.1])
    n2_2, bins, patches = axis.hist(cluster2_2_sci,
                                    inbins,
                                    normed=1,
                                    histtype='bar')
    axis.set_xlim(0.0, 4.0)
    axis.set_ylim(0.0, 5.0)
    axis = fig.add_axes([0.1, 0.5, 0.3, 0.1])
    n3_3, bins, patches = axis.hist(cluster3_3_sci,
                                    inbins,
                                    normed=1,
                                    histtype='bar')
    axis.set_xlim(0.0, 4.0)
    axis.set_ylim(0.0, 5.0)
    axis = fig.add_axes([0.5, 0.1, 0.3, 0.1])
    n1_2, bins, patches = axis.hist(cluster1_2_sci,
                                    inbins,
                                    normed=1,
                                    histtype='bar')
    axis.set_xlim(0.0, 4.0)
    axis.set_ylim(0.0, 5.0)
    axis = fig.add_axes([0.5, 0.3, 0.3, 0.1])
    n1_3, bins, patches = axis.hist(cluster1_3_sci,
                                    inbins,
                                    normed=1,
                                    histtype='bar')
    axis.set_xlim(0.0, 4.0)
    axis.set_ylim(0.0, 5.0)
    axis = fig.add_axes([0.5, 0.5, 0.3, 0.1])
    n2_3, bins, patches = axis.hist(cluster2_3_sci,
                                    inbins,
                                    normed=1,
                                    histtype='bar')
    axis.set_xlim(0.0, 4.0)
    axis.set_ylim(0.0, 5.0)
    fig.show()
    fig.savefig("single_hist1_" + filename, dpi=600)

    # Second histogram figure: the six histograms overlaid as line plots.
    fig = pylab.figure(figsize=(8, 8))
    axis = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    p1 = pylab.plot(pbins, n1_1, 'm-')  #.
    p2 = pylab.plot(pbins, n2_2, 'c-')  #.
    p3 = pylab.plot(pbins, n3_3, 'g-')  #.
    p4 = pylab.plot(pbins, n1_2, 'r-')  #.
    p5 = pylab.plot(pbins, n1_3, 'b-')  #.
    p6 = pylab.plot(pbins, n2_3, 'y-')  #.
    pylab.legend([p1[0],p2[0],p3[0],p4[0],p5[0],p6[0]],[ clustname[0]+'_'+clustname[0], clustname[1]+'_'+clustname[1], clustname[2]+'_'+clustname[2], \
                 clustname[0]+'_'+clustname[1],clustname[0]+'_'+clustname[2],clustname[1]+'_'+clustname[2]], \
                 bbox_to_anchor=(0., 1.02, 1., .102), loc=3)
    pylab.xlabel("RMSD (angstroms)")
    pylab.ylabel("Normlized Count")
    fig.show()
    fig.savefig("single_hist2_" + filename, dpi=600)
    return
data_scaled_imputed_df = pd.DataFrame(data_scaled_imputed, columns=SDoH_COLS_NEW) data_scaled_imputed_df['ssid'] = analysis_data['ssid'].values data_scaled_imputed_df.to_csv(save_dir + 'dev_data_SDoH.csv', index=False) ## ------------ clustering ---------------- # dist_mtx = euclidean_distances(data_scaled_imputed) #dist_mtx = euclidean_distances(X_pca) linkage = hc.linkage(sp.distance.squareform(dist_mtx, checks=False), method='ward') ns_plot = sns.clustermap(dist_mtx, row_linkage=linkage, col_linkage=linkage) plt.savefig(save_dir + '_SDoH_clustergram.png', dpi=300) plt.close() plt.figure(figsize=[8, 6]) hc.set_link_color_palette(['#330066', '#7F00FF', '#CC99FF', 'k']) d_plot = hc.dendrogram(linkage, orientation='top', color_threshold=100, above_threshold_color='#808080') plt.savefig(save_dir + '_SDoH_dendrogram.pdf') plt.close() C = 3 labels = fcluster(linkage, C, criterion='maxclust') # rename labels clust_label_map = { 1: 1, 2: 2, 3: 3,
level=logging.INFO) import matplotlib matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab! import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec import seaborn as sb matplotlib.rcParams['lines.linewidth'] = 0.8 from matplotlib.colors import rgb2hex from scipy.cluster.hierarchy import linkage, dendrogram, set_link_color_palette sb.set_palette('Set1', 10, 0.80) palette = sb.color_palette() set_link_color_palette(map(rgb2hex, palette)) from verification.verification import Verification from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score from verification.evaluation import rank_predict from verification.plotting import draw_tree from verification.preprocessing import prepare_corpus, Dataset from sklearn.cross_validation import train_test_split import numpy as np import pandas as pd # select a data set dev = "../data/caesar_dev" test = "../data/caesar_test" # we prepare the corpus
def create_dendrogram(self, metric, rank, excel_path):
    """Cluster the per-tool columns of one metric sheet and save a dendrogram.

    The sheet named ``metric`` is read from the workbook at ``excel_path``.
    For ``metric == "Precall"`` the "Precision" and "Recall" sheets are read
    instead (missing cells filled with 1) and their harmonic mean
    ``2*p*r / (p+r)`` is used; a 0/0 result is replaced by 0.

    Columns named 'Tax ID', 'rank', 'name' and 'Aggregate' are ignored, as
    are columns whose values sum to zero. The remaining columns are clustered
    with average linkage on Bray-Curtis distances and drawn as a horizontal
    dendrogram, saved as a PNG in ``self.output_path``. Nothing is drawn when
    fewer than two columns remain.

    Parameters
    ----------
    metric : str
        Sheet name, or "Precall" for the precision/recall harmonic mean.
    rank : str
        Value of the 'rank' column to keep; '' keeps every row.
    excel_path : str
        Path of the Excel workbook to read.

    Returns
    -------
    None
    """
    if metric == "Precall":
        precision_df = pd.read_excel(excel_path, sheet_name="Precision",
                                     engine='openpyxl').fillna(1)
        recall_df = pd.read_excel(excel_path, sheet_name="Recall",
                                  engine='openpyxl').fillna(1)
        # The first three columns are carried over unchanged; the harmonic
        # mean = (2*p*r) / (p+r) is computed on everything after them.
        p_df = precision_df.iloc[:, 3:]
        r_df = recall_df.iloc[:, 3:]
        harmonic_mean_df = (p_df.mul(r_df) * 2).div(p_df.add(r_df)).fillna(0)
        df = pd.concat([precision_df.iloc[:, :3], harmonic_mean_df], axis=1)
    else:
        df = pd.read_excel(excel_path, sheet_name=metric, engine='openpyxl')

    tmp_df = df if rank == '' else df[df['rank'] == rank]

    to_remove = ['Tax ID', 'rank', 'name', 'Aggregate']
    tool_array = []
    names = []
    for item in tmp_df.columns:
        if item in to_remove:
            continue
        res = tmp_df[item]
        if np.sum(res) == 0:
            # An all-zero profile carries no information for clustering.
            continue
        tool_array.append(res.tolist())
        names.append(item.split('.')[0])

    if len(tool_array) <= 1:
        # A dendrogram needs at least two profiles.
        return

    bray_curt = distance.pdist(np.array(tool_array), 'braycurtis')
    link = linkage(bray_curt, 'average')

    title = metric + ": " + rank.capitalize() + "-Dendrogram"
    filename = title.replace(": ", "-").replace(" ", "_") + '.png'

    # The thicker lines and the custom link palette are only meant for this
    # figure, so both are restored afterwards instead of leaking into every
    # later plot of the process.
    with matplotlib.rc_context({'lines.linewidth': 3}):
        set_link_color_palette(['y', 'c', 'g', 'm', 'r'])
        fig = plt.figure(figsize=[20.4, 10.4], dpi=480)
        try:
            plt.suptitle(title, size=36, weight='semibold')
            dendrogram(link, orientation='right', labels=names)
            plt.xlim(-0.05, 1.05)
            plt.xlabel("Bray Curtis Distance", fontsize=20,
                       weight='semibold', labelpad=15)
            plt.ylabel("Tools", fontsize=20, weight='semibold', labelpad=30)
            plt.tick_params(labelsize=16, labelcolor='#00213E')
            plt.savefig(os.path.join(self.output_path, filename), dpi=480,
                        facecolor='#F5FFFF', transparent=False,
                        bbox_inches='tight')
        finally:
            set_link_color_palette(None)
            plt.close(fig)

    print("\n{} has been saved.".format(filename))
    # TODO: add arg to create subplot grouped by metric or rank
    # (subplot='none'; 'metric'; 'rank')
    return
# Artist.get_axes() no longer exists in current Matplotlib; the ``axes``
# attribute is the supported way to reach the owning Axes.
ax = axi.axes
clean_axis(ax)
plt.show()

# calculate pairwise distances for rows (condensed vector + square form)
row_condensed = distance.pdist(core_df, similarity)
pairwise_dists = distance.squareform(row_condensed)
# cluster: linkage() must receive the CONDENSED distances. A square matrix
# would be interpreted as observation vectors and re-measured with Euclidean
# distance, silently ignoring ``similarity``.
row_clusters = sch.linkage(row_condensed, method="complete")

# calculate pairwise distances for columns
col_condensed = distance.pdist(core_df.T, similarity)
col_pairwise_dists = distance.squareform(col_condensed)
# cluster
col_clusters = sch.linkage(col_condensed, method="complete")

# make dendrograms black rather than letting scipy color them
sch.set_link_color_palette(["black"])

# plot the results: 2x2 grid, dendrograms in the narrow top row / left column
fig = plt.figure(figsize=figure_size)
# fig.suptitle(os.path.split(input_file_path)[1])
heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                              width_ratios=[0.25, 1],
                              height_ratios=[0.25, 1])

### col dendrogram ####
col_denAX = fig.add_subplot(heatmapGS[0, 1])
col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf)
clean_axis(col_denAX)

### row dendrogram ###
row_denAX = fig.add_subplot(heatmapGS[1, 0])
row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf,
                          orientation="right")
clean_axis(row_denAX)
Dm = squareform(D) # Dendrogram fig = plt.figure(figsize=(18, 6)) plt.style.use('seaborn-whitegrid') G = gridspec.GridSpec(1, 3, wspace=0, hspace=0.1, top=0.86, bottom=0.08, left=0.14, right=0.9) ax0 = plt.subplot(G[0, :-1]) Z = sch.linkage(D, method='complete') sch.set_link_color_palette(['r', 'b', 'g', 'm', 'y', 'c']) dn = sch.dendrogram( Z, orientation='left', distance_sort='descending', no_labels=True, above_threshold_color='k', color_threshold=110, ) #plt.xticks(np.arange(0, 1500, 100)) plt.margins(0.5, 0.1) plt.title('Dendrogram', fontsize=20) plt.xlabel('Euclidean distance', fontsize=14) plt.axvline(x=110) # Generate heatmap
if __name__ == '__main__':
    # Command-line arguments:
    #argv[1]: cctable.dat from Kamo outputs
    #argv[2]: file contains coordinates of a specific residue of all structures
    #argv[3]: CLUSTERS.txt from Kamo outputs
    #argv[4]: height cutoff
    matrix = get_cc_matrix(sys.argv[1])
    # NOTE(review): a 2-D input to linkage() is treated as observation
    # vectors, a 1-D input as condensed distances -- confirm which one
    # get_cc_matrix returns.
    z = sch.linkage(matrix, method='ward')
    rs = extract_coordinate(sys.argv[2])

    # Dendrogram figure, coloured below the height cutoff given in argv[4].
    fig_dendro = plt.figure(figsize=(80, 50))
    plt.rc('ytick', labelsize=20)
    plt.ylabel('Height', fontsize=20)
    sch.set_link_color_palette(['g', 'r', 'c', 'm', 'y'])
    d = sch.dendrogram(z, color_threshold=float(sys.argv[4]))
    color_cluster = d['color_list']  # one colour per drawn link
    ivl = d['ivl']  # leaf labels in plotting order
    print(ivl)

    # Second figure: 3-D axes for the residue coordinates.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Convert the first three fields of every record to float in place.
    for i in range(len(rs)):
        rs[i][0] = float(rs[i][0])
        rs[i][1] = float(rs[i][1])
        rs[i][2] = float(rs[i][2])
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    # Independent k-means partition of the coordinates (fixed at 5 clusters).
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(rs)
import matplotlib
matplotlib.use('Agg')  # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sb
matplotlib.rcParams['lines.linewidth'] = 0.8
from matplotlib.colors import rgb2hex
from scipy.cluster.hierarchy import linkage, dendrogram, set_link_color_palette

# Use the seaborn 'Set1' palette for dendrogram links as well. scipy only
# accepts a list or tuple here, so the hex colours are materialised; a bare
# map() object (an iterator on Python 3) is rejected with TypeError.
sb.set_palette('Set1', 10, 0.80)
palette = sb.color_palette()
set_link_color_palette([rgb2hex(color) for color in palette])

from verification.verification import Verification
from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score
from verification.evaluation import rank_predict
from verification.plotting import draw_tree
from verification.preprocessing import prepare_corpus, Dataset
from sklearn.cross_validation import train_test_split
import numpy as np
import pandas as pd

# select a data set
dev = "../data/caesar_dev"
test = "../data/caesar_test"
# we prepare the corpus
# convert to boolean kobool = df > 0 kojacc = spatial.distance.pdist(kobool, metric=distance) Z = hierarchy.linkage(kojacc, method=method, optimal_ordering=True) maxdist = np.max(Z[:, 2]) # clustering (segmentación) clust = hierarchy.fcluster(Z, maxdist * args.cutoff, criterion='distance') n_clusts = len(np.unique(clust)) # number of clusters # dendograms # colors for clusters # TODO: Add an option for selecting colormap dend_colors = cm.jet(np.linspace(0, 1, n_clusts)) hexcolors = [mpl.colors.rgb2hex(rgb[:3]) for rgb in dend_colors] hierarchy.set_link_color_palette(hexcolors) # Plot plt.figure(figsize=(15, 7)) dend = hierarchy.dendrogram(Z, color_threshold=maxdist * args.cutoff, no_labels=True) plt.axhline(maxdist * args.cutoff, ls='--', alpha=0.3, c='k') plt.tight_layout() # legend and colors if legend: legend_elements = [] for i, rgb in enumerate(dend_colors): label = f"Cluster {i+1}" element = Line2D([0], [0], color=rgb, label=label, lw=3)
def main():
    """Draw a doubly-clustered heatmap of a tab-separated table.

    The table named by ``args.filename`` is read with its first two columns
    as a two-level row index. All-zero columns are dropped, rows and columns
    are clustered (Bray-Curtis distance, average linkage) and the reordered
    matrix is drawn between the two dendrograms, together with a colour strip
    and legend keyed on the first index level and a colour bar. The figure is
    saved as ``heatmap.EXT`` where EXT is ``args.im_format``, at
    ``args.im_res`` dpi.
    """
    args = argparser()
    matplotlib.rcParams['lines.linewidth'] = 0.4

    # load data
    print('[INFO] Loading data')
    data = pd.read_csv(args.filename, sep='\t', index_col=[0, 1])

    # TODO: data scaling / normalisation, e.g.
    # data_scaling = (data.T - data.T.min())/(data.T.max() - data.T.min())
    # data = data_scaling.T

    # Columns that are entirely zero are removed before the distances are
    # computed.
    print("[WARN] Fixing all 0s columns")
    data = data.loc[:, data.sum() != 0]

    # Distances and linkages (TODO: expose metric/method as options).
    d_metric = 'braycurtis'
    linkage_m = 'average'
    # Rows
    metadist = sch.distance.pdist(data, metric=d_metric)
    metalink = sch.linkage(metadist, method=linkage_m)
    # Clip tiny negative values so the dendrogram code never sees them.
    metalink = metalink.clip(0, metalink.max() + 1)
    # Columns
    profdist = sch.distance.pdist(data.T, metric=d_metric)
    proflink = sch.linkage(profdist, method=linkage_m)
    proflink = proflink.clip(0, proflink.max() + 1)

    ############
    # Plotting #
    ############
    print('[INFO] Plotting ...')
    fig = plt.figure(figsize=(6.7, 8.6))

    # Axes rectangles [left, bottom, width, height] in figure fractions,
    # leaving room for the column labels under the matrix.
    posm = [0.01, 0.23, 0.2, 0.62]            # row dendrogram
    posp = [0.24, 0.855, 0.67, 0.14]          # column dendrogram
    posmat = [0.24, 0.23, 0.67, 0.62]         # heatmap
    posm_colors = [0.215, 0.23, 0.02, 0.62]   # row colour strip
    poscbar = [0.92, 0.23, 0.015, 0.30]       # colour bar
    poslegend = [0.01, 0.84, 0.23, 0.15]      # legend

    # colors for dendrograms
    sch.set_link_color_palette([
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ])

    # - rows dendrogram
    meta_ax = fig.add_axes(posm, frameon=False)
    metadend = sch.dendrogram(metalink,
                              color_threshold=0.2 * max(metalink[:, 2]),
                              orientation='left')
    meta_ax.set_xticks([])
    meta_ax.set_yticks([])

    # - columns dendrogram
    prof_ax = fig.add_axes(posp, frameon=False)
    profdend = sch.dendrogram(proflink,
                              color_threshold=0.2 * max(proflink[:, 2]),
                              orientation='top')
    prof_ax.set_xticks([])
    prof_ax.set_yticks([])

    # - Matrix - HEATMAP, reordered to follow both dendrograms
    matrix_ax = fig.add_axes(posmat)
    mmask = metadend['leaves']
    pmask = profdend['leaves']
    # ``.values`` works on every pandas version; DataFrame.get_values() was
    # removed in pandas 1.0.
    mat = data.values[mmask, :][:, pmask]
    im = matrix_ax.matshow(mat, aspect='auto', origin='lower', cmap='viridis')

    # Labels (TODO: option to show row labels; needs other axes positions)
    matrix_ax.set_yticks([])
    matrix_ax.set_xticks(range(len(data.columns)))
    matrix_ax.set_xticklabels(data.columns[pmask], rotation=90, fontsize=5,
                              color='k')
    matrix_ax.xaxis.set_ticks_position('bottom')

    # - Colorbar
    colorbar_ax = fig.add_axes(poscbar)
    cb = plt.colorbar(im, cax=colorbar_ax)
    cb.set_label('Completeness', fontsize='x-small')
    cb.ax.tick_params(labelsize='xx-small')

    # - Color code: one tab20 colour per distinct first-level index value,
    # ordered by the integer that follows the value's first character.
    # get_level_values(0) works whether or not the index levels are named.
    general_index = sorted(data.index.get_level_values(0).unique(),
                           key=lambda x: int(x[1:]))
    n_groups = len(general_index)
    color_as = {}
    for pos, group in enumerate(general_index):
        color_as[group] = plt.cm.tab20(pos / n_groups)

    # colour of every row, in dendrogram order
    color_vec = np.array([color_as[i[0]] for i in data.index])[mmask]
    color_ax = fig.add_axes(posm_colors, frameon=False)
    # Unit-sized bars spanning [i, i+1] so that, with ylim (0, n), each bar
    # lines up with heatmap row i and dendrogram leaf i. The default centred
    # alignment would shift the strip down by half a row.
    color_ax.barh(range(len(color_vec)), np.ones(len(color_vec)), 1,
                  color=color_vec, edgecolor=color_vec, align='edge')
    color_ax.set_xticks([])
    color_ax.set_yticks([])
    color_ax.set_ylim((0, len(color_vec)))

    # - Legend (drawn in its own frameless axes, which is the current one)
    legend_ax = fig.add_axes(poslegend, frameon=False)
    legend_ax.set_xticks([])
    legend_ax.set_yticks([])
    patches = [mpatches.Patch(color=color_, label=name)
               for name, color_ in color_as.items()]
    plt.legend(handles=patches, fancybox=True, fontsize='xx-small', loc=2,
               framealpha=0.75)

    # - Save Figure
    figname = 'heatmap.{}'.format(args.im_format)
    fig.savefig(figname, dpi=args.im_res)
X_umap = umpa_reducer.fit_transform(val_study_data[CLUSTERING_COLS]) #X_umap = umpa_reducer.fit_transform(X_pca) plt.scatter(X_umap[:, 0], X_umap[:, 1], s=1, alpha=0.5) plt.show() plt.close() ## ------------ clustering ---------------- # dist_mtx = euclidean_distances(val_study_data[CLUSTERING_COLS].values) #dist_mtx = euclidean_distances(X_pca) linkage = hc.linkage(sp.distance.squareform(dist_mtx, checks=False), method='ward') ns_plot = sns.clustermap(dist_mtx, row_linkage=linkage, col_linkage=linkage) plt.savefig(MAIN_DIR + '\\Results\\' + OUTPUT_FOLDER + '\\clustergram.png', dpi=300) plt.close() plt.figure(figsize=[8, 6]) hc.set_link_color_palette([ '#CE4257', '#F9C77E', '#79A3D9', '#7B967A']) d_plot = hc.dendrogram(linkage, orientation='top', color_threshold=60, above_threshold_color='#808080') plt.savefig(MAIN_DIR + '\\Results\\' + OUTPUT_FOLDER + '\\dendrogram.pdf', dpi=300) plt.close() C = 4 labels = fcluster(linkage, C, criterion='maxclust') lable_color = {1:'#79A3D9', 2:'#7B967A', 3:'#F9C77E', 4:'#CE4257'} lable_annotation = {1:'Subphenotype I', 2:'Subphenotype II', 3:'Subphenotype III', 4:'Subphenotype IV', }
# Demo: complete-linkage dendrogram of five random three-feature samples,
# drawn entirely in black.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage, set_link_color_palette

# Global settings: single-colour links, wide DataFrame printing, fixed seed.
set_link_color_palette(["black"])
pd.set_option('display.max_columns', 500)
np.random.seed(123)

# Five samples (rows) described by three features (columns), values in [0, 10).
variables = list("XYZ")
labels = ["ID_%d" % sample for sample in range(5)]
X = 10 * np.random.random_sample((5, 3))
df = pd.DataFrame(data=X, index=labels, columns=variables)

# Agglomerative clustering of the rows on Euclidean distance.
row_clusters = linkage(df.values, metric="euclidean", method="complete")

# color_threshold=inf keeps every link in the (single) palette colour.
row_dendr = dendrogram(
    row_clusters,
    color_threshold=np.inf,
    labels=np.array(labels),
)
plt.tight_layout()
plt.ylabel("Euclidean Distance")
plt.show()
def make_heatmap_png(data_matrix, colLabel_list, rowLabel_list,
                     isRowClustering, isColClustering, ratio, fontSize,
                     outputPATH):
    """Render a (optionally clustered) heatmap of ``data_matrix`` to a PNG.

    Parameters
    ----------
    data_matrix : 2-D sequence of numbers
        Values to draw; converted to a numpy array.
    colLabel_list : sequence
        One label per column, in the original column order.
    rowLabel_list : sequence
        One label per row; an empty list leaves the row tick labels untouched.
    isRowClustering, isColClustering : bool
        Reorder rows / columns by average-linkage hierarchical clustering on
        Euclidean distances and draw the matching dendrogram.
    ratio : float
        Height/width ratio of the figure; -1 derives it from the label counts.
    fontSize : float
        Row label font size; -1 selects 36.
    outputPATH : str
        If it ends with '/', the file is ``outputPATH + "heatmap.png"``,
        otherwise ``outputPATH + ".png"``.
    """
    colOrder = list(range(len(data_matrix[0])))
    rowOrder = list(range(len(data_matrix)))
    data_matrix = numpy.array(data_matrix)

    fig = plt.figure()
    if ratio == -1:
        figRatio = len(rowLabel_list)/float(len(colLabel_list))
    else:
        figRatio = ratio
    # The longer side of the figure is capped at 50 inches.
    figWidth = 50.0/figRatio if figRatio >= 1 else 50
    figHeight = figWidth*figRatio
    fig.set_size_inches(figWidth, figHeight)
    heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                                  width_ratios=[10, figWidth],
                                  height_ratios=[10, figHeight])

    # clustering
    clusterMethod = 'average'
    # Dendrograms are drawn in plain black.
    if isRowClustering:
        # row clustering and dendrogram
        rowDendro_ax = fig.add_subplot(heatmapGS[1, 0])
        # linkage() expects the CONDENSED distance vector; a square matrix
        # would be treated as a set of observation vectors.
        rowCondensedDist = dist.pdist(data_matrix, 'euclidean')
        rowCluster = sch.linkage(rowCondensedDist, method=clusterMethod)
        sch.set_link_color_palette(['black'])
        row_dendro = sch.dendrogram(rowCluster, color_threshold=numpy.inf,
                                    orientation='right')
        rowOrder = row_dendro['leaves']
        clean_axis(rowDendro_ax)
        data_matrix = data_matrix[rowOrder, :]

    if isColClustering:
        # column clustering and dendrogram
        colDendro_ax = fig.add_subplot(heatmapGS[0, 1])
        colCondensedDist = dist.pdist(numpy.transpose(data_matrix),
                                      'euclidean')
        colCluster = sch.linkage(colCondensedDist, method=clusterMethod)
        sch.set_link_color_palette(['black'])
        col_dendro = sch.dendrogram(colCluster, color_threshold=numpy.inf)
        colOrder = col_dendro['leaves']
        clean_axis(colDendro_ax)
        data_matrix = data_matrix[:, colOrder]

    # depict heatmap
    ax = fig.add_subplot(heatmapGS[1, 1])
    heatmap = ax.imshow(data_matrix, cmap=plt.cm.PuBuGn,
                        interpolation='nearest', aspect='auto',
                        origin='lower', alpha=1)
    clean_axis(ax)

    # tick and labels, following the (possibly reordered) rows and columns
    x_index = numpy.arange(data_matrix.shape[1])
    y_index = numpy.arange(data_matrix.shape[0])
    ax.yaxis.tick_left()
    ax.set_xticks(x_index, minor=False)
    ax.set_yticks(y_index, minor=False)
    ax.yaxis.set_ticks_position('right')
    ax.set_xticklabels([colLabel_list[i] for i in colOrder], rotation=90,
                       minor=False)
    if rowLabel_list != []:
        ax.set_yticklabels([rowLabel_list[i] for i in rowOrder], minor=False)

    # Column label size is scaled by the row/column count ratio.
    ylabelsize = 36 if fontSize == -1 else fontSize
    xlabelsize = ylabelsize*len(rowOrder)/float(len(colOrder))
    # NOTE(review): figRatio is never -1 unless the caller's ratio makes it
    # so; possibly ``ratio != -1`` was meant. Kept as-is.
    if figRatio != -1:
        xlabelsize = xlabelsize/figRatio
    plt.tick_params(axis='x', labelsize=xlabelsize)
    plt.tick_params(axis='y', labelsize=ylabelsize)
    plt.setp(ax.get_xticklines()+ax.get_yticklines(), visible=False)

    # colorbar in the left half of the top-left grid cell
    scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(
        1, 2, subplot_spec=heatmapGS[0, 0], wspace=0.0, hspace=0.0)
    scale_cbAX = fig.add_subplot(scale_cbGSSS[0, 0])
    cBar = fig.colorbar(heatmap, scale_cbAX, drawedges=False)
    cBar.ax.tick_params(labelsize=ylabelsize)
    cBar.outline.set_linewidth(0)
    cBar.ax.yaxis.set_ticks_position('left')
    plt.setp(cBar.ax.get_yticklines(), visible=False)

    #plt.tight_layout()
    # endswith() also copes with an empty outputPATH.
    if outputPATH.endswith('/'):
        plt.savefig(outputPATH+"heatmap.png", format='png')
    else:
        plt.savefig(outputPATH+".png", format='png')
clustering = AgglomerativeClustering().fit(points)
# (A pasted estimator repr used to follow here. It built and discarded a
# second AgglomerativeClustering with keyword arguments that current
# scikit-learn rejects, so it has been dropped.)

# NOTE(review): the three dendrograms below are all drawn on the current
# figure because plt.show() is commented out -- confirm that is intended.

# Single linkage
S = hierarchy.linkage(dc, 'single')
sdn = hierarchy.dendrogram(S)
fig = plt.gcf()
# The window title lives on the figure manager; the canvas-level shortcut no
# longer exists in current Matplotlib.
fig.canvas.manager.set_window_title('Single-Linkage Clustering')
#plt.show()

# Average linkage
A = hierarchy.linkage(dc, 'average')
adn = hierarchy.dendrogram(A)
fig = plt.gcf()
fig.canvas.manager.set_window_title('Average Linkage Clustering')
#plt.show()

# Complete linkage
C = hierarchy.linkage(dc, 'complete')
cdn = hierarchy.dendrogram(C)
fig = plt.gcf()
fig.canvas.manager.set_window_title('Complete Linkage Clustering')
#plt.show()

hierarchy.set_link_color_palette(None)  # reset to default after use
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
import numpy as np

# Basic example: fifteen condensed pairwise distances, single linkage,
# default colours, drawn in a figure of its own.
ytdist = np.array([
    662., 877., 255., 412., 996.,
    295., 468., 268., 400., 754.,
    564., 138., 219., 869., 669.,
])
Z = hierarchy.linkage(ytdist, method='single')
plt.figure()
dn = hierarchy.dendrogram(Z)

# Same tree twice more, side by side in given axes, with a custom link
# palette and one orientation / above-threshold colour per panel.
hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
panel_options = (
    {'above_threshold_color': 'y', 'orientation': 'top'},
    {'above_threshold_color': '#bcbddc', 'orientation': 'right'},
)
dn1, dn2 = [
    hierarchy.dendrogram(Z, ax=panel, **options)
    for panel, options in zip(axes, panel_options)
]
# Restore scipy's default palette so later plots are unaffected.
hierarchy.set_link_color_palette(None)
plt.show()
#normalized standardize features from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(X_new) X_new=scaler.transform(X_new) X_new=pd.DataFrame(X_new,columns=index) pca = PCA(n_components = 2) X_principal = pca.fit_transform(X_new) # Calculate the distance between each sample #Z = hierarchy.linkage(X_principal, 'ward') Z = hierarchy.linkage(X_new, 'ward') # Set the colour of the cluster here: hierarchy.set_link_color_palette(['r', 'b']) # Make the dendrogram and give the colour above threshold hierarchy.dendrogram(Z, color_threshold=14, above_threshold_color='grey') # Add horizontal line. plt.axhline(y=14, c='black', lw=2, linestyle='dashed') #from scipy.cluster.hierarchy import fcluster #d=shc.linkage(X_principal, method ='ward') ac2 = AgglomerativeClustering(n_clusters = 2,compute_full_tree=True) # Visualizing the clustering plt.figure(figsize =(6, 6))
def main():
    """Cluster a distance matrix with scipy.cluster.hierarchy.

    Command-line options:

    * ``--input/-i``  (required) CSV file holding the square distance matrix.
    * ``--label/-l``  optional tab-separated file; the first field of each
      line labels the matching matrix row. Without it, rows are labelled by
      their 0-based position.
    * ``--output/-o`` optional file receiving one ``label TAB cluster`` line
      per row. Without it, no file is written.

    The matrix is clustered with Ward linkage and cut into at most 250 flat
    clusters; summary statistics are logged and the dendrograms are shown.
    """
    parser = argparse.ArgumentParser(description='description')
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='Location of input file'
                        ' that contains the distance matrix as csv')
    parser.add_argument('--label', '-l', type=str, required=False,
                        help='Location of id-label mapping file')
    parser.add_argument('--output', '-o', required=False, help='output file')
    args = parser.parse_args()

    logger.info("loading matrix")
    matrix = np.loadtxt(args.input, delimiter=",")

    labels = None
    if args.label:
        # ``with`` closes the file; the bare open() in a comprehension did not.
        with open(args.label, 'r') as label_file:
            labels = [line.rstrip('\n').split('\t')[0] for line in label_file]

    logger.info("clustering")
    Z = linkage(squareform(matrix), 'ward')

    logger.info("generating flat clusters")
    clusters = fcluster(Z, 250, 'maxclust')

    if labels is None:
        labels = [str(position) for position in range(len(clusters))]
    elif len(labels) != len(clusters):
        # zip() below stops at the shorter sequence, so make that visible.
        logger.warning("{} labels for {} matrix rows; extra entries are "
                       "ignored".format(len(labels), len(clusters)))

    # Group labels by cluster and prepare the output lines.
    cluster_map = {}
    output_lines = []
    for disease_id, cluster_id in zip(labels, clusters):
        cluster_map.setdefault(cluster_id, []).append(disease_id)
        output_lines.append("{}\t{}\n".format(disease_id, cluster_id))

    # Output clusters
    if args.output:
        with open(args.output, 'w') as output:
            output.writelines(output_lines)
    else:
        logger.info("no output file given; cluster assignments not written")

    # Singletons and size statistics
    sizes = [len(members) for members in cluster_map.values()]
    singleton_count = sum(1 for size in sizes if size == 1)
    logger.info("{} singletons".format(singleton_count))
    logger.info("Avg cluster size: {}".format(mean(sizes)))
    logger.info("median cluster size: {}".format(median(sizes)))

    # Draw dendrogram: once with default colours, then twice side by side
    # with a custom palette.
    plt.figure()
    hierarchy.dendrogram(Z)
    hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
    fig, axes = plt.subplots(1, 2, figsize=(8, 3))
    hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
                         orientation='top')
    hierarchy.dendrogram(Z, ax=axes[1], above_threshold_color='#bcbddc',
                         orientation='right')
    hierarchy.set_link_color_palette(None)  # reset to default after use
    plt.show()
ax = fig.add_subplot(111) cax = ax.matshow(probDf, interpolation='nearest', cmap='hot_r') fig.colorbar(cax) ax.set_xticklabels([''] + list(probDf.columns)) ax.set_yticklabels([''] + list(probDf.index)) plt.show() ''' rowDist = pd.DataFrame(squareform(pdist(probDf, metric='euclidean')), columns=sortedRowNames, index=sortedRowNames) rowClusters = linkage(pdist(probDf, metric='euclidean'), method='complete') hierarchy.set_link_color_palette(['black']) fig = plt.figure(figsize = (8,8)) axd = fig.add_axes([0.09,0.1,0.2,0.6]) rowDendr = dendrogram(rowClusters, orientation = 'right', color_threshold = np.inf,) dfRowClust = probDf.ix[rowDendr['leaves'][::-1]] print(rowDendr['leaves']) axd.set_xticks([]) axd.set_yticks([]) for i in axd.spines.values(): i.set_visible(False)
def plot_img_with_dendrograms(self, use_abs_cor=True):
    '''
    Plot an image or correlation matrix along with dendrograms

    ``self.array`` is drawn as a heatmap whose rows and columns are reordered
    by two hierarchical clusterings (single linkage for the rows, centroid
    linkage for the columns). The column dendrogram sits above the heatmap,
    the row dendrogram to its right, and a colour bar in the top-right corner.
    Tick labels come from ``self.df.index`` and ``self.df.columns``.

    Uses methods from:
    http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb

    Parameters
    -----------
    use_abs_cor : {True, False}, optional
        Use the absolute values of correlation matrix
    '''
    import matplotlib.gridspec as gridspec
    import scipy.cluster.hierarchy as sch

    # helper for cleaning up axes by removing ticks, tick labels, frame, etc.
    def clean_axis(ax):
        """Remove ticks, tick labels, and frame from axis"""
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        for sp in ax.spines.values():
            sp.set_visible(False)

    fig = plt.figure()
    heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                                  width_ratios=[1, 0.25],
                                  height_ratios=[0.25, 1])

    # Plain truthiness: the former ``== True`` / ``== False`` pair left D
    # undefined for any other value (e.g. None).
    D = np.abs(self.array) if use_abs_cor else self.array

    ## Col Dendrogram
    col_denAX = fig.add_subplot(heatmapGS[0, 0])
    # Columns are clustered on the transpose. For a symmetric (correlation)
    # matrix this is the same data as before; for a non-square image it is
    # what makes the leaf count match self.df.columns.
    clusters1 = sch.linkage(D.T, method='centroid')
    sch.set_link_color_palette(['black'])
    col_denD = sch.dendrogram(clusters1, labels=self.df.columns.values,
                              orientation='top', color_threshold=np.inf)
    clean_axis(col_denAX)

    ## Row Dendrogram
    row_denAX = fig.add_subplot(heatmapGS[1, 1])
    clusters2 = sch.linkage(D, method='single')
    sch.set_link_color_palette(['black'])
    row_denD = sch.dendrogram(clusters2, labels=self.df.index.values,
                              orientation='left', color_threshold=np.inf)
    clean_axis(row_denAX)

    # Heatmap, rows/columns permuted into dendrogram leaf order
    heatmapAX = fig.add_subplot(heatmapGS[1, 0])
    idx1 = row_denD['leaves']
    idx2 = col_denD['leaves']
    D_remap = D.copy()
    D_remap = D_remap[idx1, :]
    D_remap = D_remap[:, idx2]
    axi = heatmapAX.imshow(D_remap, interpolation='nearest', aspect='auto',
                           origin='lower', vmin=0, vmax=1)

    def _format_coord(x, y):
        """Status-bar text: cell value plus its row and column labels."""
        x = int(x + 0.5)
        y = int(y + 0.5)
        try:
            # 'ivl' holds the leaf labels in plotting order. The earlier
            # ``items()[0]`` lookup picked an arbitrary dict entry and fails
            # on Python 3 -- presumably the labels were intended.
            par_row = row_denD['ivl'][y]
            par_col = col_denD['ivl'][x]
            return "%.3f %s | %s" % (D_remap[y, x], par_row, par_col)
        except IndexError:
            return ""

    heatmapAX.format_coord = _format_coord
    clean_axis(heatmapAX)

    ## row labels ##
    heatmapAX.set_yticks(np.arange(self.df.shape[0]))
    heatmapAX.yaxis.set_ticks_position('left')
    heatmapAX.set_yticklabels(self.df.index[row_denD['leaves']])

    ## col labels ##
    heatmapAX.set_xticks(np.arange(self.df.shape[1]))
    heatmapAX.xaxis.set_ticks_position('bottom')
    xlabelsL = heatmapAX.set_xticklabels(
        self.df.columns[col_denD['leaves']])
    # rotate labels 90 degrees
    for label in xlabelsL:
        label.set_rotation(90)

    # remove the tick lines (once, after both axes have their ticks)
    for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
        l.set_markersize(0)

    ### scale colorbar ###
    scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(
        1, 2, subplot_spec=heatmapGS[0, 1], wspace=0.5, hspace=0.5)
    # colorbar for scale in upper corner
    scale_cbAX = fig.add_subplot(scale_cbGSSS[0, 0])
    # note that we tell colorbar to use the scale_cbAX axis
    cb = fig.colorbar(axi, scale_cbAX)
    cb.set_label('Abs. Cor.')
    # ticks and label go on the right side of the colorbar
    cb.ax.yaxis.set_ticks_position('right')
    cb.ax.yaxis.set_label_position('right')
    cb.outline.set_linewidth(0)

    # make colorbar labels smaller
    tickL = cb.ax.yaxis.get_ticklabels()
    for t in tickL:
        t.set_fontsize(t.get_fontsize() - 3)

    heatmapGS.tight_layout(fig, h_pad=0.1, w_pad=0.5)
# Artist.get_axes() no longer exists in current Matplotlib; the ``axes``
# attribute is the supported way to reach the owning Axes.
ax = axi.axes
clean_axis(ax)
plt.show()

# calculate pairwise distances for rows (condensed vector + square form)
row_condensed = distance.pdist(core_df, similarity)
pairwise_dists = distance.squareform(row_condensed)
# cluster: linkage() must receive the CONDENSED distances. A square matrix
# would be interpreted as observation vectors and re-measured with Euclidean
# distance, silently ignoring ``similarity``.
row_clusters = sch.linkage(row_condensed, method='complete')

# calculate pairwise distances for columns
col_condensed = distance.pdist(core_df.T, similarity)
col_pairwise_dists = distance.squareform(col_condensed)
# cluster
col_clusters = sch.linkage(col_condensed, method='complete')

# make dendrograms black rather than letting scipy color them
sch.set_link_color_palette(['black'])

# plot the results: 2x2 grid, dendrograms in the narrow top row / left column
fig = plt.figure(figsize=figure_size)
#fig.suptitle(os.path.split(input_file_path)[1])
heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                              width_ratios=[0.25, 1],
                              height_ratios=[0.25, 1])

### col dendrogram ####
col_denAX = fig.add_subplot(heatmapGS[0, 1])
col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf)
clean_axis(col_denAX)

### row dendrogram ###
row_denAX = fig.add_subplot(heatmapGS[1, 0])
row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf,
                          orientation='right')
clean_axis(row_denAX)
def dendro(ax, dist, cut=None, labels=None, root="top", leaf_rotation=90,
           leaf_font_size=10, sorting="distance", palette_name="LaSalle",
           cluster_colors=True, legend_loc="upper right", label_colors=False,
           label_color_map=None, label_title="", labs=("", "", "Distance"),
           font_size=(16, 12, 10)):
    """
    Plot a dendrogram on ``ax`` from a linkage matrix.

    Can produce a refined dendrogram with a customized color palette for the
    clusters and every xtick labelled (and colored, if there is a target
    variable). Also adds a legend for the clusters and, if applicable, one
    for the label color codes.

    Inputs:
        - ax (Axes): axes the dendrogram and legends are drawn on
        - dist (ndArray): the hierarchical clustering encoded as a linkage
          matrix
        - cut (float): height at which to cut the tree; also used as the
          color threshold
        - labels (Pandas.Index): index to use for xtick labels
        - root (str): plots the root at the top with "top", and left with
          "left"
        - leaf_rotation (float): the angle (in degrees) to rotate the leaf
          labels
        - leaf_font_size (float): the font size (in points) of the leaf labels
        - sorting (str): "distance" orders each node's two descendent links
          by the distance between its direct descendents (descending),
          "count" by the number of objects in its cluster (descending); any
          other value leaves the default order
        - palette_name (str): user-defined palette name for the 'Palette'
          class, find more in the 'palette' module
        - cluster_colors (bool): whether to use the user-defined cluster
          coloring palette (True) or scipy's current one (False)
        - legend_loc (str): location for the cluster legend, consistent with
          Matplotlib legend location definitions
        - label_colors (bool): if there is an established target variable,
          whether to color xtick labels according to that variable
        - label_color_map (Pandas.Series): target column if there is an
          established target variable (supervised)
        - label_title (str): title of the label coloring legend, using the
          target column name is recommended
        - labs ((str, str, str)): title, x-axis label, y-axis label
        - font_size ((int, int, int)): title, axis label, tick label font
          properties

    Returns:
        ([[int]]) cluster output as color-coded by the dendrogram
    """
    palette = Palette().getPallete(palette_name, path="../../../palettes/")
    _, axis_font, ticks_font = create_font_setting(font_size)

    default = {"show_leaf_counts": True, "above_threshold_color": "grey"}
    # Sort child nodes by distance or by count descending, or neither
    if sorting == "distance":
        default["distance_sort"] = 'descending'
    elif sorting == "count":
        default["count_sort"] = 'descending'

    # Plotting dendrogram and cut. The link palette is process-wide state in
    # scipy, so a custom palette is installed only for this one call and then
    # reset; otherwise a later cluster_colors=False call would inherit it.
    if cluster_colors:
        set_link_color_palette(palette.color_lst[::-1])
    try:
        # ax=ax makes scipy draw on the axes we were given rather than on
        # whatever axes happens to be current.
        den = dendrogram(dist, labels=labels, orientation=root,
                         color_threshold=cut, leaf_rotation=leaf_rotation,
                         leaf_font_size=leaf_font_size, ax=ax, **default)
    finally:
        if cluster_colors:
            set_link_color_palette(None)

    # Cluster legend: one entry per distinct non-grey link color, in order of
    # first appearance.
    legend_colors = []
    for color in den['color_list']:
        if color != "grey" and color not in legend_colors:
            legend_colors.append(color)
    c_leg = ax.legend(
        [Line2D([0], [0], color=c, lw=6) for c in legend_colors],
        ['Cluster %s' % i for i in range(len(legend_colors))],
        prop=axis_font, loc=legend_loc, shadow=False)

    # Get color-coded clusters
    color_cluster = {col: cluster for cluster, col in enumerate(legend_colors)}
    # color_list has one entry per link (one fewer than the leaves), so the
    # last color is repeated; grey entries take over the preceding color.
    col_lst = den['color_list'][:] + [den['color_list'][-1]]
    for i, col in enumerate(col_lst):
        if col == "grey":
            col_lst[i] = col_lst[i - 1]
    clusters = [[row[1]] for row in sorted(
        zip(den['leaves'], [color_cluster[col] for col in col_lst]),
        key=lambda x: x[0])]

    # Color the labels by target if applicable
    if label_colors:
        targets = label_color_map.unique()
        # Two targets use the palette's dedicated pair of colors.
        if len(targets) == 2:
            target_colors = palette.pair
        else:
            target_colors = palette.color_lst
        label_color_dict = {
            label: target_colors[i] for i, label in enumerate(targets)
        }
        for lbl in ax.get_xmajorticklabels():
            lbl.set_color(label_color_dict[label_color_map[lbl.get_text()]])

        # Shrink the axes to 80% width to make room for the target legend.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        # Invisible handles: the legend is text only, title first.
        leg = ax.legend(
            [Line2D([0], [0], color='white', lw=0)
             for _ in range(len(targets) + 1)],
            [label_title.title()] + list(targets),
            loc='center left', bbox_to_anchor=(1, 0.5), prop=axis_font)
        for i, text in enumerate(leg.get_texts()[1:]):
            text.set_color(target_colors[i])
            text.set_ha('left')
        # The second ax.legend() call replaced the cluster legend; add it
        # back as a plain artist so both are shown.
        ax.add_artist(c_leg)

    # Plot cut
    if cut:
        func = ax.axvline if root == "left" else ax.axhline
        func(cut, ls='--', color='r')

    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=ticks_font)
    ax.set_yticklabels(ax.get_yticks(), fontproperties=ticks_font)
    ax.tick_params(axis='y', direction='in')
    labelTitleAxis(ax, labs, font_size)
    return clusters
def heatmapper(X,
               xLabels=[],
               yLabels=[],
               save=os.getcwd() + os.path.sep,
               WRITE_CLUSTER=True,
               methods="pca",
               CPU=os.cpu_count() // 2,
               cluster_both=True,
               SHOW=True,
               tCOLOR='nipy_spectral',
               hCOLOR="YlGnBu",
               _spectral=18,
               _n_neighbors=5,
               _min_dist=0.1,
               _perplexity=50,
               _n_iter=5000,
               _pca_comp=2,
               _color_threshold=0.1):
    """Generate a heatmap of ``X`` with hierarchical-clustering dendrograms.

    Rows (and, when ``cluster_both`` is true, columns) are optionally
    embedded with a dimension-reduction method, clustered with Ward linkage
    and drawn as dendrograms next to the reordered heatmap.  A pie chart of
    the row-cluster sizes is drawn on a second figure.

    X: M x N array.
    xLabels: N array. The labels or names of data X by column.
    yLabels: M array. The labels or names of data X by row.
    save: a saving directory with a prefix; ``"_" + methods`` is appended
        and used as the stem of every output file.
    WRITE_CLUSTER: True or False. Choose if cluster information is output
        or not.  Missing labels are replaced by serial numbers when True.
    methods: "", "tsne", "umap", "pca". Dimension reduction method to apply
        before hierarchical clustering ("" clusters the raw data).
    CPU: CPU number to use. It has effect only when the tsne method is used
        (0 is treated as 1).
    cluster_both: also cluster and reorder the columns.
    SHOW: call ``plt.show()`` at the end.
    tCOLOR: colormap name used to build the dendrogram link palette.
    hCOLOR: colormap used for the heatmap image.
    _spectral: number of colours taken from ``tCOLOR``.
    _n_neighbors, _min_dist: forwarded to ``umap.UMAP``.
    _perplexity, _n_iter: forwarded to ``TSNE``.
    _pca_comp: number of PCA components.
    _color_threshold: fraction of the largest linkage distance used as the
        dendrogram ``color_threshold``.

    Output files (all prefixed with ``save + "_" + methods``):
    ``_heatmap_array.npz`` (only when a reduction method is used),
    ``_clusters_X_axis.txt`` (only with ``cluster_both`` and
    ``WRITE_CLUSTER``), ``_clusters_Y_axis.txt`` (only with
    ``WRITE_CLUSTER``), ``_heatmap.png`` and ``_pie.pdf``.

    Raises ValueError if ``methods`` is not one of the accepted values
    (previously this surfaced later as a NameError).
    """
    if methods not in ("", "umap", "pca", "tsne"):
        raise ValueError(
            "methods options can only accept umap, pca, tsne or ''.")

    plt.rcParams.update({'font.size': 12})
    # Fancy indexing below (X[idx1], X2[:, idx2]) needs an ndarray.
    X = np.asarray(X)
    Xshape = np.shape(X)
    assert len(Xshape) == 2, "matrix must be two-dimensional"

    if WRITE_CLUSTER:
        if len(yLabels) == 0:
            print(
                "Warning: y label names are automatically set as serial numbers. Provide yLabels option so that label names make sense."
            )
            yLabels = list(map(str, range(Xshape[0])))
        if cluster_both and len(xLabels) == 0:
            print(
                "Warning: x label names are automatically set as serial numbers. Provide xLabels option so that label names make sense."
            )
            xLabels = list(map(str, range(Xshape[1])))

    # From here on ``save`` always ends with "_" + methods, so it can never
    # be empty; the old ``assert save is not ""`` checks were vacuous.
    save = save + "_" + methods
    n_jobs = 1 if CPU == 0 else CPU

    def _embed(data):
        """Project ``data`` with the reduction selected by ``methods``."""
        if methods == "umap":
            return umap.UMAP(n_neighbors=_n_neighbors,
                             min_dist=_min_dist,
                             metric='euclidean',
                             n_components=2).fit_transform(data)
        if methods == "pca":
            return PCA(n_components=_pca_comp).fit_transform(data)
        tsne = TSNE(n_jobs=n_jobs, perplexity=_perplexity, n_iter=_n_iter)
        return tsne.fit_transform(data)

    def _leaf_colors(dendro, report_duplicates):
        """Collect (leaf position, link colour) from a dendrogram dict.

        A link end whose ``dcoord`` is 0 is taken to touch a leaf; the code
        maps its ``icoord`` to a position with ``(coord - 5) / 10``.
        Returns the sorted ``[position, colour]`` pairs and a
        colour -> positions mapping.
        """
        seen = set()
        pairs = []
        by_color = defaultdict(list)
        links = zip(dendro['color_list'], dendro['icoord'], dendro['dcoord'])
        for color, ic, dc in links:
            for end, inner in ((0, 1), (3, 2)):
                if dc[end] != 0.0:
                    continue
                pos = int((ic[inner] - 5.0) / 10.0)
                if pos not in seen:
                    seen.add(pos)
                    pairs.append([pos, color])
                    by_color[color].append(pos)
                elif report_duplicates:
                    print(color, ic, dc)
        return sorted(pairs), by_color

    def _write_clusters(path, pairs, names, order):
        """Write one ``name<TAB>colour[;group]`` line per leaf to ``path``.

        Leaves drawn in the above-threshold colour "b" get no group index;
        for the others the index increases each time the colour differs
        from the previous coloured leaf.
        """
        with open(path, "w") as fo:
            prev_key = None
            group = 0
            suffix = ""
            for pos, color in pairs:
                name = str(names[order[pos]])
                if color == "b":
                    fo.write(name + "\t" + color + "\n")
                    continue
                key = ",".join(map(str, hex_to_rgb(color)))
                if key != prev_key:
                    suffix = ";" + str(group)
                    group += 1
                    prev_key = key
                fo.write(name + "\t" + key + suffix + "\n")

    # Compute and plot first (row) dendrogram.
    if methods != "":
        print("reducing X axis dimension with " + methods)
        embeddingX = _embed(X)
        np.savez_compressed(save + "_heatmap_array.npz", X=embeddingX)
    else:
        embeddingX = np.array(X)

    fig = plt.figure(figsize=(8, 20))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.8])
    print("calculating Y axis linkage")
    Y = fcl.linkage(embeddingX, method='ward', metric='euclidean')
    _cmap = cm.get_cmap(tCOLOR, _spectral)
    palette = [mpl.colors.rgb2hex(rgb[:3]) for rgb in _cmap(range(_spectral))]
    sch.set_link_color_palette(palette)
    print('drawing dendrogram...')
    # above_threshold_color is pinned to "b": the cluster files rely on that
    # literal, but recent SciPy releases default to "C0" instead.
    Z1 = sch.dendrogram(Y,
                        orientation='left',
                        color_threshold=_color_threshold * max(Y[:, 2]),
                        above_threshold_color='b',
                        ax=ax1)

    # Optional second (column) dendrogram.
    idx2 = None
    if cluster_both:
        Xt = np.transpose(X)
        if methods != "":
            print("reducing Y axis dimension with " + methods)
            embeddingXt = _embed(Xt)
        else:
            embeddingXt = Xt
        ax2 = fig.add_axes([0.3, 0.9, 0.5, 0.05])
        print("calculating X axis linkage")
        Y2 = fcl.linkage(embeddingXt, method='ward', metric='euclidean')
        print('drawing dendrogram...')
        sch.set_link_color_palette(palette)
        Z2 = sch.dendrogram(Y2,
                            orientation='top',
                            color_threshold=_color_threshold * max(Y2[:, 2]),
                            above_threshold_color='b',
                            ax=ax2)
        idx2 = Z2['leaves']
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Plot distance matrix, reordered to follow the dendrogram leaves.
    axmatrix = fig.add_axes([0.3, 0.1, 0.5, 0.8])
    idx1 = Z1['leaves']
    X2 = X[idx1]
    if cluster_both:
        X2 = X2[:, idx2]
        if WRITE_CLUSTER:
            cluster_list2, _ = _leaf_colors(Z2, report_duplicates=False)
            _write_clusters(save + "_clusters_X_axis.txt", cluster_list2,
                            xLabels, idx2)

    cluster_list, cluster_idxs = _leaf_colors(Z1, report_duplicates=True)
    if WRITE_CLUSTER:
        _write_clusters(save + "_clusters_Y_axis.txt", cluster_list, yLabels,
                        idx1)

    # Pie chart input: row-cluster sizes, largest first (ties by colour).
    ranked = sorted(((len(v), k) for k, v in cluster_idxs.items()),
                    reverse=True)
    sizes = tuple(size for size, _ in ranked)
    colors = tuple(color for _, color in ranked)
    labels = sizes

    print("drawing heatmap")
    im = axmatrix.imshow(X2, aspect='auto', origin='lower', cmap=hCOLOR)

    # The image rows/columns were permuted above, so the tick labels must be
    # permuted the same way (they used to be written in input order).
    col_ticks = list(xLabels)
    if cluster_both and len(col_ticks) == len(idx2):
        col_ticks = [col_ticks[i] for i in idx2]
    row_ticks = list(yLabels)
    if len(row_ticks) == len(idx1):
        row_ticks = [row_ticks[i] for i in idx1]

    if len(col_ticks) <= 50:
        axmatrix.set_xticks(range(len(col_ticks)))
        axmatrix.set_xticklabels(col_ticks, rotation=90)
    else:
        axmatrix.set_xticks([])
        axmatrix.set_xticklabels([])
    axmatrix.yaxis.tick_right()
    if len(row_ticks) <= 50:
        axmatrix.set_yticks(range(len(row_ticks)))
        axmatrix.set_yticklabels(row_ticks)
    else:
        axmatrix.set_yticks([])
        axmatrix.set_yticklabels([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.5, 0.05, 0.16, 0.02])
    pylab.colorbar(im, cax=axcolor, orientation='horizontal')

    fig2 = pylab.figure(figsize=(8, 8))
    plt.pie(sizes,
            labels=labels,
            colors=colors,
            autopct='%1.1f%%',
            shadow=True,
            startangle=90)
    if save != "":
        fig.savefig(save + "_heatmap.png", format="png")
        fig2.savefig(save + "_pie.pdf", format="pdf")
    if SHOW == True:
        plt.show()
def fig_cluster(out: Dataset, fontsize: float = 8,
                **fig_kws: dict) -> Tuple[Figure, Axes]:
    """Draws the dendrogram stored in a hierarchical clustering result.

    Parameters
    ----------
    out
        Valid Dataset from :func:`~araucaria.stats.cluster.cluster`.
    fontsize
        Font size for the leaf labels. The default is 8.
    fig_kws
        Additional arguments to pass to the
        :meth:`~matplotlib.figure.Figure.subplots` routine of ``Matplotlib``.

    Returns
    -------
    figure
        ``Matplotlib`` figure object.
    axes
        ``Matplotlib`` axes object.

    Raises
    ------
    TypeError
        If ``out`` is not a valid Dataset instance.
    KeyError
        If attributes from :func:`~araucaria.stats.cluster.cluster`
        do not exist in ``out``.

    See also
    --------
    :func:`~araucaria.stats.cluster.cluster` : Performs hierarchical clustering on a collection.

    Example
    -------
    .. plot::
        :context: reset

        >>> import matplotlib.pyplot as plt
        >>> from araucaria.testdata import get_testpath
        >>> from araucaria.xas import pre_edge
        >>> from araucaria.stats import cluster
        >>> from araucaria.io import read_collection_hdf5
        >>> from araucaria.plot import fig_cluster
        >>> fpath = get_testpath('Fe_database.h5')
        >>> collection = read_collection_hdf5(fpath)
        >>> collection.apply(pre_edge)
        >>> datgroup = cluster(collection, cluster_region='xanes')
        >>> fig, ax = fig_cluster(datgroup)
        >>> fig.tight_layout()
        >>> plt.show(block=False)
    """
    # validate the input before touching any plotting state
    required = ['groupnames', 'Z', 'cluster_pars']
    check_objattrs(out, Dataset, attrlist=required, exceptions=True)

    figure, axes = plt.subplots(1, 1, **fig_kws)

    # link colors below the threshold cycle through this palette
    hierarchy.set_link_color_palette(['c', 'm', 'y', 'k'])
    dendro_opts = dict(ax=axes,
                       orientation='right',
                       leaf_font_size=fontsize,
                       above_threshold_color='gray',
                       labels=out.groupnames)
    hierarchy.dendrogram(out.Z, **dendro_opts)

    region = out.cluster_pars['cluster_region']
    axes.set_title(region.upper() + ' dendrogram')
    return (figure, axes)
def plot_img_with_dendrograms(self, use_abs_cor = True):
    '''
    Plot an image or correlation matrix along with dendrograms

    The matrix in ``self.array`` is drawn as a heatmap whose rows and
    columns are reordered by the leaves of a row dendrogram (single
    linkage, drawn to the right) and a column dendrogram (centroid
    linkage, drawn on top).  Tick labels come from ``self.df``.

    Uses methods from:
    http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb

    Parameters
    -----------
    use_abs_cor : {True, False}, optional
        Use the absolute values of correlation matrix
    '''
    import matplotlib.gridspec as gridspec
    import scipy.cluster.hierarchy as sch

    # helper for cleaning up axes by removing ticks, tick labels, frame, etc.
    def clean_axis(ax):
        """Remove ticks, tick labels, and frame from axis"""
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        for sp in ax.spines.values():
            sp.set_visible(False)

    fig = plt.figure()
    heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                                  width_ratios=[1, 0.25],
                                  height_ratios=[0.25, 1])

    # Plain if/else so that D is always bound (the former pair of
    # ``== True`` / ``== False`` tests left it undefined for e.g. None).
    D = np.abs(self.array) if use_abs_cor else self.array

    ## Col Dendrogram
    col_denAX = fig.add_subplot(heatmapGS[0, 0])
    # Columns are the observations here, so cluster the transpose; for a
    # symmetric correlation matrix this equals clustering D itself.
    clusters1 = sch.linkage(np.transpose(D), method='centroid')
    sch.set_link_color_palette(['black'])
    col_denD = sch.dendrogram(clusters1, labels=self.df.columns.values,
                              orientation='top', color_threshold=np.inf)
    clean_axis(col_denAX)

    ## Row Dendrogram
    row_denAX = fig.add_subplot(heatmapGS[1, 1])
    clusters2 = sch.linkage(D, method='single')
    sch.set_link_color_palette(['black'])
    row_denD = sch.dendrogram(clusters2, labels=self.df.index.values,
                              orientation='left', color_threshold=np.inf)
    clean_axis(row_denAX)

    # Heatmap, reordered to follow the dendrogram leaves
    heatmapAX = fig.add_subplot(heatmapGS[1, 0])
    idx1 = row_denD['leaves']
    idx2 = col_denD['leaves']
    D_remap = D.copy()
    D_remap = D_remap[idx1, :]
    D_remap = D_remap[:, idx2]
    axi = heatmapAX.imshow(D_remap, interpolation='nearest', aspect='auto',
                           origin='lower', vmin=0, vmax=1)

    # Leaf labels in plotted order.  The previous ``.items()[0][1]`` lookup
    # is not valid on Python 3 (dict views cannot be indexed).
    row_names = row_denD['ivl']
    col_names = col_denD['ivl']

    def _format_coord(x, y):
        """Status-bar text: cell value plus its row and column label."""
        col = int(x + 0.5)
        row = int(y + 0.5)
        try:
            return "%.3f %s | %s" % (D_remap[row, col], row_names[row],
                                     col_names[col])
        except IndexError:
            # cursor outside the image
            return ""

    heatmapAX.format_coord = _format_coord
    clean_axis(heatmapAX)

    ## row labels ##
    heatmapAX.set_yticks(np.arange(self.df.shape[0]))
    heatmapAX.yaxis.set_ticks_position('left')
    heatmapAX.set_yticklabels(self.df.index[row_denD['leaves']])

    ## col labels ##
    heatmapAX.set_xticks(np.arange(self.df.shape[1]))
    heatmapAX.xaxis.set_ticks_position('bottom')
    xlabelsL = heatmapAX.set_xticklabels(self.df.columns[col_denD['leaves']])
    # rotate labels 90 degrees
    for label in xlabelsL:
        label.set_rotation(90)

    # remove the tick lines (done once, after both axes got their ticks)
    for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
        l.set_markersize(0)

    ### scale colorbar ###
    scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(
        1, 2, subplot_spec=heatmapGS[0, 1], wspace=0.5, hspace=0.5)
    scale_cbAX = fig.add_subplot(scale_cbGSSS[0, 0])  # colorbar for scale in upper corner
    cb = fig.colorbar(axi, scale_cbAX)  # note that we tell colorbar to use the scale_cbAX axis
    cb.set_label('Abs. Cor.')
    # keep ticks and label on the right side of the colorbar to avoid
    # problems with tight_layout
    cb.ax.yaxis.set_ticks_position('right')
    cb.ax.yaxis.set_label_position('right')
    cb.outline.set_linewidth(0)
    # make colorbar labels smaller
    tickL = cb.ax.yaxis.get_ticklabels()
    for t in tickL:
        t.set_fontsize(t.get_fontsize() - 3)

    heatmapGS.tight_layout(fig, h_pad=0.1, w_pad=0.5)
def scatter(X,
            xLabels=[],
            yLabels=[],
            save=os.getcwd() + os.path.sep,
            WRITE_CLUSTER=True,
            methods="tsne",
            CPU=os.cpu_count() // 2,
            SHOW=True,
            COLOR='nipy_spectral',
            _spectral=18,
            _n_neighbors=5,
            _min_dist=0.1,
            _perplexity=50,
            _n_iter=5000,
            _color_threshold=0.1,
            s=0.5**2):
    """Scatter-plot a 2-D embedding of ``X`` coloured by hierarchical clusters.

    The rows of ``X`` are optionally embedded in two dimensions, clustered
    with Ward linkage, drawn as a dendrogram on one figure and as a scatter
    plot (one point per row, coloured like its dendrogram link) on another.

    X: M x N array.
    xLabels: N array. The labels or names of data X by column (accepted
        but not used by this function).
    yLabels: M array. The labels or names of data X by row.
    save: a saving directory with a prefix; ``"_" + methods`` is appended
        and used as the stem of every output file.
    WRITE_CLUSTER: True or False. Choose if cluster information is output
        or not.
    methods: "", "tsne", "umap", "pca". Dimension reduction method to apply
        before hierarchical clustering.  With "" the matrix must already be
        M x 2.
    CPU: CPU number to use. It has effect only when the tsne method is used
        (0 is treated as 1).
    SHOW: call ``plt.show()`` at the end.
    COLOR: colormap name used to build the dendrogram link palette.
    _spectral: number of colours taken from ``COLOR``.
    _n_neighbors, _min_dist: forwarded to ``umap.UMAP``.
    _perplexity, _n_iter: forwarded to ``TSNE``.
    _color_threshold: fraction of the largest linkage distance used as the
        dendrogram ``color_threshold``.
    s: marker size forwarded to the scatter call.

    Output files (all prefixed with ``save + "_" + methods``):
    ``_scatter_array.npz`` (only when a reduction method is used),
    ``_clusters_on_scatter_plot.txt`` (only with ``WRITE_CLUSTER``),
    ``_dendro.png`` and ``_scatter.png``.

    Exits via ``sys.exit`` when ``methods`` is not recognised, or when
    ``methods`` is "" and ``X`` does not have exactly two columns.
    """
    # Column slicing of the embedding below needs an ndarray.
    X = np.asarray(X)
    Xshape = np.shape(X)
    yind = list(map(str, range(Xshape[0])))
    plt.rcParams.update({'font.size': 12})
    if WRITE_CLUSTER:
        if len(yLabels) == 0:
            print(
                "Warning: y label names are automatically set as serial numbers. Provide yLabels option so that label names make sense."
            )
            yLabels = list(map(str, range(Xshape[0])))

    # ``save`` can never be empty after this line, which made the former
    # ``assert save is not ""`` vacuous (and a SyntaxWarning on Python 3.8+).
    save = save + "_" + methods

    # Compute the 2-D embedding.
    if methods != "":
        print("reducing X axis dimension with " + methods)
        if methods == "umap":
            embeddingX = umap.UMAP(n_neighbors=_n_neighbors,
                                   min_dist=_min_dist,
                                   metric='euclidean',
                                   n_components=2).fit_transform(X)
        elif methods == "pca":
            embeddingX = PCA(n_components=2).fit_transform(X)
        elif methods == "tsne":
            if CPU == 0:
                CPU = 1
            tsne = TSNE(n_jobs=CPU, perplexity=_perplexity, n_iter=_n_iter)
            embeddingX = tsne.fit_transform(X)
        else:
            sys.exit("methods options can only accept umap, pca, tsne or ''.")
        np.savez_compressed(save + "_scatter_array.npz", X=embeddingX)
    else:
        print("skipping dimensionality reduction")
        if Xshape[1] != 2:
            sys.exit(
                "if methods is '', then the shape of the matrix must be N x 2."
            )
        embeddingX = X

    # Dendrogram figure.
    fig, ax = plt.subplots(figsize=(8, 8))
    print("calculating Y axis linkage")
    Y = fcl.linkage(embeddingX, method='ward', metric='euclidean')
    _cmap = cm.get_cmap(COLOR, _spectral)
    cmap = _cmap(range(_spectral))
    sch.set_link_color_palette([mpl.colors.rgb2hex(rgb[:3]) for rgb in cmap])
    print('drawing dendrogram...')
    # above_threshold_color is pinned to "b": the cluster file relies on
    # that literal, but recent SciPy releases default to "C0" instead.
    Z1 = sch.dendrogram(Y,
                        orientation='left',
                        color_threshold=_color_threshold * max(Y[:, 2]),
                        above_threshold_color='b',
                        ax=ax)
    ax.set_yticks([])
    ax.set_title('Hierarchical clustering ')

    # Scatter figure.
    fig2, ax2 = plt.subplots(figsize=(8, 8))
    idx1 = Z1['leaves']

    # Recover (leaf position, link colour): a link end whose dcoord is 0 is
    # taken to touch a leaf, and its icoord is mapped to a position with
    # (coord - 5) / 10.
    seen = set()
    cluster_list = []
    links = zip(Z1['color_list'], Z1['icoord'], Z1['dcoord'])
    for c, ic, dc in links:
        for end, inner in ((0, 1), (3, 2)):
            if dc[end] != 0.0:
                continue
            i = int((ic[inner] - 5.0) / 10.0)
            if i not in seen:
                seen.add(i)
                cluster_list.append([i, c])
            else:
                print(c, ic, dc)
    cluster_list = sorted(cluster_list)

    # One RGBA colour per row of X.  Sized by the number of rows rather than
    # by len(yLabels), which is 0 when WRITE_CLUSTER is False and no labels
    # were given (that used to raise IndexError).
    _color_list = [""] * Xshape[0]
    for k, v in cluster_list:
        _color_list[idx1[k]] = list(mpl.colors.hex2color(v)) + [1.0]

    if WRITE_CLUSTER:
        with open(save + "_clusters_on_scatter_plot.txt", "w") as fo:
            prev_key = None
            group = 0
            suffix = ""
            for k, v in cluster_list:
                _pos = str(yLabels[idx1[k]])
                _ind = str(yind[idx1[k]])
                if v == "b":
                    # above-threshold leaves carry no group index
                    fo.write(_ind + "\t" + _pos + "\t" + v + "\n")
                    continue
                _key = ",".join(map(str, hex_to_rgb(v)))
                # the group index advances whenever the colour changes
                # relative to the previous coloured leaf
                if _key != prev_key:
                    suffix = ";" + str(group)
                    group += 1
                    prev_key = _key
                fo.write(_ind + "\t" + _pos + "\t" + _key + suffix + "\n")

    print("drawing scatter plot")
    ax2.scatter(embeddingX[:, 0], embeddingX[:, 1], color=_color_list, s=s)
    ax2.set_title('Scatter plot colored by clusters')
    fig.savefig(save + "_dendro.png", format="png")
    fig2.savefig(save + "_scatter.png", format="png")
    if SHOW == True:
        plt.show()
# Download a precomputed dissimilarity matrix, show it next to a
# single-linkage dendrogram of the same data.
url = "https://examples.obspy.org/dissimilarities.npz"
with io.BytesIO(urlopen(url).read()) as fh, np.load(fh) as data:
    dissimilarity = data["dissimilarity"]

# Left panel: similarity (1 - dissimilarity) as an image.
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest", cmap=obspy_sequential)

# Convert the square matrix to condensed form for the linkage routine.
dissimilarity = distance.squareform(dissimilarity)
threshold = 0.3
linkage = hierarchy.linkage(dissimilarity, method="single")
clusters = hierarchy.fcluster(linkage, threshold, criterion="distance")

# A little nicer set of colors.
cmap = plt.get_cmap("Paired", lut=6)
# "%x" only accepts integers on Python 3, so scale and convert each
# channel explicitly before formatting the hex string.
colors = ["#%02x%02x%02x" % tuple(int(round(col * 255))
                                  for col in cmap(i)[:3])
          for i in range(6)]
try:
    hierarchy.set_link_color_palette(colors[1:])
except AttributeError:
    # Old version of SciPy
    pass

# Right panel: dendrogram, coloured below the same threshold used for
# the flat clusters above.
plt.subplot(122)
try:
    hierarchy.dendrogram(linkage, color_threshold=threshold,
                         above_threshold_color=cmap(0))
except TypeError:
    # Old version of SciPy
    hierarchy.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
'1-1-1', '2-1-2', '2-2-1', '3-2-1', '4-2-2', '3-2-2', '3-1-1', '1-2-2' ]) print(distDF) print("**********Lowest Level***************") totalStd = 0 for winnerWells in final: print(f"{winnerWells} with a standard err of {stderror[winnerWells]}") totalStd += stderror[winnerWells] print(f"Average Standard Err: {round(totalStd / (len(final)-1),2)}") print(f"Final confidence of {round(norm.cdf(totalStd / len(final)) * 100,2)}%") # if SYSTEM["SHOW_DENDROGRAM"]: # distDF = distDF.replace(np.nan,1) # print(z) # print(dn) # plt.savefig("static/results.jpg") # return (returnString,distDF) distDF.to_csv( f'jaccard_darkgreenLT_C{str(COUNT_CUTOFF)}_L{str(LENGTH_CUTOFF[0])}-{str(LENGTH_CUTOFF[1])}_Recovery_{str(int(RECOVERY_EFFICIENCY*100))}.csv' ) z = hierarchy.linkage( distDF, 'average' ) ##there are a few clustering choices here. for UPGMA, a standard algorithm, use 'average' instead of 'ward' # plt.figure(figsize=(14,6),dpi=100) hierarchy.set_link_color_palette(['k']) dn = hierarchy.dendrogram(z, labels=distDF.index, above_threshold_color='#bbbbbb', orientation='left') plt.show()