import numpy as np
import scipy.spatial.distance as ssd
import scipy.cluster.hierarchy as sch
# Assumed import: `pdh` appears to be the PyDendroHeatMap package providing DendroHeatMap.
import PyDendroHeatMap as pdh


def random_distribution(n):
    # make up some data
    data = np.random.normal(scale=n, size=(n, n))
    # integer division (//) is required for indexing in Python 3
    data[0:n // 2, 0:n // 2] += 75
    data[n // 2:, n // 2:] = np.random.poisson(lam=n, size=data[n // 2:, n // 2:].shape)

    # cluster the rows
    row_dist = ssd.squareform(ssd.pdist(data))
    row_Z = sch.linkage(row_dist)
    row_idxing = sch.leaves_list(row_Z)
    row_labels = ['bar{}'.format(i) for i in range(n)]

    # cluster the columns
    col_dist = ssd.squareform(ssd.pdist(data.T))
    col_Z = sch.linkage(col_dist)
    col_idxing = sch.leaves_list(col_Z)

    # make the dendrogram
    col_labels = ['foo{}'.format(i) for i in range(n)]
    data = data[:, col_idxing][row_idxing, :]

    heatmap = pdh.DendroHeatMap(heat_map_data=data, left_dendrogram=row_Z,
                                top_dendrogram=col_Z,
                                heatmap_colors=("#ffeda0", "#feb24c", "#f03b20"),
                                window_size="auto", color_legend_displayed=False,
                                label_color="#777777")
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'An example heatmap'
    heatmap.show()  # heatmap.save("example.png")
def save_mat(c2map, filepath):
    mat = c2map['mat']
    fig = pylab.figure(figsize=(8, 8))

    # Compute and plot first dendrogram.
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Y = sch.linkage(mat, method='centroid')
    Z1 = sch.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Y = sch.linkage(mat, method='single')
    Z2 = sch.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    mat = mat[idx1, :]
    mat = mat[:, idx2]
    im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.savefig(filepath)
def getDistMatrixes(cls, distDict, distMeasure, linkageCriterion):
    """
    Find and return the correlation matrix, linkage matrix and distance matrix
    for the distance/correlation measure given with the distMeasure parameter.
    """
    from scipy.spatial.distance import squareform
    from numpy import ones, fill_diagonal
    from scipy.cluster.hierarchy import linkage

    if distMeasure == cls.CORR_PEARSON or distMeasure == cls.SIM_MCCONNAUGHEY:
        # As these measures generate values between -1 and 1, they need special handling.
        # Cluster distances, i.e. convert correlation into distance between 0 and 1
        triangularCorrMatrix = distDict[distMeasure]
        triangularDistMatrix = ones(len(triangularCorrMatrix)) - [(x + 1) / 2 for x in triangularCorrMatrix]
        linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

        # Make correlation matrix square
        correlationMatrix = squareform(triangularCorrMatrix)
        fill_diagonal(correlationMatrix, 1)
    else:
        # Cluster distances
        triangularDistMatrix = distDict[distMeasure]
        linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

        # Convert triangular distances into square correlation matrix
        squareDistMatrix = squareform(triangularDistMatrix)
        squareSize = len(squareDistMatrix)
        correlationMatrix = ones((squareSize, squareSize)) - squareDistMatrix

    return correlationMatrix, linkageMatrix, triangularDistMatrix
def draw_intensity(a, cmap=GREEN_CMAP, metric='euclidean', method='average', sort_x=True, sort_y=True):
    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    if sort_x is True:
        plt.sca(divider.append_axes("top", 0.5, pad=0))
        xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
        xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a[[a.columns[i] for i in xdendro['leaves']]]

    if sort_y is True:
        plt.sca(divider.append_axes("left", 1.0, pad=0))
        ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
        ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        # .ix is removed in modern pandas; .loc does the same label-based lookup here
        a = a.loc[[a.index[i] for i in ydendro['leaves']]]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none', cmap=cmap, vmin=0.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=90, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()
    plt.show()
def hierarchical_clustering(data, skill, method='single', metric='euclidean', dendrogram=True,
                            concepts=False, cluster_number=3, corr_as_vectors=False):
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    skills = data.get_skills_df()
    corr = compute_corr(data, merge_skills=concepts)
    print("Corr ({}) contains {} values, {} of them NaN".format(
        corr.shape, corr.size, corr.isnull().sum().sum()))
    corr[corr.isnull()] = 0

    if concepts:
        items = items[items["skill_lvl_" + str(level)] == pk]
        skill_ids = items[~items["skill_lvl_3"].isnull()]["skill_lvl_3"].unique()
        corr = pd.DataFrame(corr, index=skill_ids, columns=skill_ids)
        labels = list(skills.loc[corr.index]["name"])
    else:
        items = items[items["skill_lvl_" + str(level)] == pk]
        items = items[items["visualization"] != "pairing"]
        corr = pd.DataFrame(corr, index=items.index, columns=items.index)
        labels = ["{1} - {0}".format(item["name"], item["visualization"][0])
                  for id, item in list(items.iterrows())]

    if corr_as_vectors:
        Z = hr.linkage(corr, method=method, metric=metric)
    else:
        Z = hr.linkage(dst.squareform(1 - corr), method=method)
    Z[Z < 0] = 0

    if dendrogram:
        plt.title('{}: method: {}, metric: {}, as vectors: {}'.format(skill, method, metric, corr_as_vectors))
        plt.xlabel('items' if not concepts else "concepts")
        plt.ylabel('distance')
        hr.dendrogram(Z, leaf_rotation=90., leaf_font_size=10., labels=labels)

    return hr.fcluster(Z, cluster_number, "maxclust")
def compare_clusters(args):
    # .as_matrix() was removed from pandas; .to_numpy() is the replacement
    ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).to_numpy()
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))

    outfile = open(args['output'], "w")
    outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tMantel_P-value\t"
                  "Cophenetic_Pearson\tCophenetic_P-value\n")

    for i in args['all']:
        fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).to_numpy()
        check_symmetry(fst_df)
        mantel_coeff = 0.0
        p_value_mantel = 0.0
        cophenetic_pearson = 0.0
        p_value_cophenetic = 0.0
        n = 0
        try:
            mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df)
            linkage_fst = linkage(fst_df, 'average')
            c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
            cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst)
        except Exception as e:
            print("Error : %s" % str(e))
            mantel_coeff = "Failed"
            p_value_mantel = "Failed"
            cophenetic_pearson = "Failed"
            p_value_cophenetic = "Failed"

        outfile.write(i + "\t" + str(mantel_coeff) + "\t" + str(p_value_mantel) + "\t" +
                      str(cophenetic_pearson) + "\t" + str(p_value_cophenetic) + "\n")

    outfile.close()
def cluster_fps(self):
    clkg = hcluster.linkage(self.dm, method='average')
    coarse_r = hcluster.fcluster(clkg, 0.3, criterion='distance')
    self.coarse_r = coarse_r

    bcount = np.bincount(coarse_r)
    knum = len(np.nonzero(bcount > 1)[0])

    s = self.density_matrix.shape
    if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:
        (u, s, vt) = la.svds(self.sps_matrixs, k=knum)
        self.u = u
        print('============')
    else:
        self.result = self.coarse_r
        return (clkg, clkg)

    # rankA = npla.matrix_rank(self.sps_matrixs)
    # if rankA < 3:
    a = np.matrix(np.diag(s)) * np.matrix(vt)
    pd = dist.pdist(np.array(a.T), 'cosine')
    pd[np.abs(pd) < 1e-11] = 0
    lkg = hcluster.linkage(pd, method='average')
    self.lkg = lkg
    self.result = hcluster.fcluster(lkg, self.svd_cluster_thr, criterion='distance')
    # self.result = hcluster.fcluster(lkg, 1)
    # self.result = hcluster.fclusterdata(u, 0.7, metric='cosine', criterion='distance', method='average')
    return (lkg, clkg)
def main():
    D = 2  # so we can visualize it more easily
    s = 4  # separation so we can control how far apart the means are
    mu1 = np.array([0, 0])
    mu2 = np.array([s, s])
    mu3 = np.array([0, s])

    N = 900  # number of samples
    X = np.zeros((N, D))
    X[:300, :] = np.random.randn(300, D) + mu1
    X[300:600, :] = np.random.randn(300, D) + mu2
    X[600:, :] = np.random.randn(300, D) + mu3

    Z = linkage(X, 'ward')
    print("Z.shape:", Z.shape)
    # Z has the format [idx1, idx2, dist, sample_count]
    # therefore, its size will be (N-1, 4)
    plt.title("Ward")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'single')
    plt.title("Single")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'complete')
    plt.title("Complete")
    dendrogram(Z)
    plt.show()
def HierarchicalCluster(A):
    # see http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
    Corr = np.corrcoef(A.T)

    fig = plt.figure(figsize=(8, 8))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Y = hrc.linkage(Corr, method='centroid')
    Z1 = hrc.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Y = hrc.linkage(Corr, method='centroid')
    Z2 = hrc.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    Corr = Corr[idx1, :]
    Corr = Corr[:, idx2]

    im = axmatrix.matshow(Corr, aspect='auto', origin='lower')
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.show()
    fig.savefig('dendrogram.png')
def hcluster_cols(self, thresh):
    try:
        link = linkage(self.X.T, method='complete', metric='cosine')
        assignments = fcluster(link, thresh, 'distance')
    except Exception:
        # cosine distances can fail (e.g. on zero vectors); fall back to euclidean
        link = linkage(self.X.T, method='complete', metric='euclidean')
        assignments = fcluster(link, thresh, 'distance')

    col_ind = np.arange(len(self.crimes))
    # list() is needed in Python 3, where zip returns an iterator
    d = pd.DataFrame(list(zip(col_ind, assignments))).groupby(1)[0].aggregate(lambda x: tuple(x))

    df_new = pd.DataFrame(index=np.arange(len(self.names)))
    for i in d:
        cols = []
        for w in i:
            cols.append(w)
        if len(cols) > 1:
            df_new[str(self.crimes[cols])] = np.mean(self.X[:, cols], axis=1)
        else:
            df_new[str(self.crimes[cols[0]])] = self.X[:, cols[0]]

    # plt.figure(figsize=(10, 20))
    # dendro = dendrogram(link, color_threshold=thresh, leaf_font_size=13,
    #                     labels=self.crimes, orientation='left')
    # plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
    # plt.show()

    self.df = df_new
    self.crimes = df_new.columns.values
def starthcc(self):
    print(self.dm, self.lin)
    dataFrame = pd.DataFrame(self.tr, columns=['x', 'y'])
    from scipy.spatial.distance import pdist, squareform

    # not printed as pretty, but the values are correct
    distxy = squareform(pdist(dataFrame, metric=self.dm))
    # print(distxy)

    # the original branched on self.lin ("single"/"complete"/else), but every
    # branch was identical, so the dendrogram is drawn once for any linkage
    plt.figure()
    R = dendrogram(linkage(distxy, method=str(self.lin)))
    plt.xlabel('X units')
    plt.ylabel('Y units')
    plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14)
    plt.show()
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')

    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale,
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])

    # integer step (//) so the tick positions can index into pseudotimes below
    r = np.arange(10, data_array.shape[0], data_array.shape[0] // 10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')

    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
def _cluster_idx(df):
    """Sort indices by clusters."""
    dcol = pdist(df.T)
    drow = pdist(df)
    lcol = linkage(dcol)
    lrow = linkage(drow)
    cols = dendrogram(lcol, no_plot=True)['leaves']
    rows = dendrogram(lrow, no_plot=True)['leaves']
    return rows, cols
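# A minimal usage sketch for _cluster_idx (self-contained imports added here;
# the function above presumably relies on the same scipy names at module level):
# reorder a DataFrame so that similar rows and columns sit next to each other.
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

df = pd.DataFrame(np.random.rand(6, 4))
rows, cols = _cluster_idx(df)
df_sorted = df.iloc[rows, cols]  # rows and columns in dendrogram leaf order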
def check_linkage_q(self, method):
    # Tests linkage(Y, method) on the Q data set.
    Z = linkage(hierarchy_test_data.X, method)
    expectedZ = getattr(hierarchy_test_data, "linkage_X_" + method)
    assert_allclose(Z, expectedZ, atol=1e-06)

    y = scipy.spatial.distance.pdist(hierarchy_test_data.X, metric="euclidean")
    Z = linkage(y, method)
    assert_allclose(Z, expectedZ, atol=1e-06)
def plot_clustered_heatmap(df, genes_list, cancer, output_path, scale='binary'):
    # Build nxm matrix (n samples, m genes)
    X = df[genes_list].to_numpy().transpose()

    if scale == 'binary':
        Z = linkage(X, method='complete', metric='hamming')
        colorscale = [[0, "rgb(111, 168, 220)"], [1, "rgb(5, 10, 172)"]]
        colorbar = {'tick0': 0, 'dtick': 1}
    elif scale == 'logarithmic':
        Z = linkage(X, method='ward')
        X_max = X.max()
        colorscale = [[0, 'rgb(250, 250, 250)'],
                      [1. / X_max, 'rgb(200, 200, 200)'],
                      [5. / X_max, 'rgb(150, 150, 200)'],
                      [20. / X_max, 'rgb(100, 100, 200)'],
                      [100. / X_max, 'rgb(50, 50, 200)'],
                      [1., 'rgb(0, 0, 200)']]
        colorbar = {'tick0': 0, 'tickmode': 'array', 'tickvals': [0, 1, 5, 20, 100, X_max]}

    c, coph_dists = cophenet(Z, pdist(X))
    print("Cophenetic Correlation Coefficient:", c)

    # layout = go.Layout(yaxis=dict(title='%s germline mutations (ordered by samples somatic mutation load)' % cancer, zeroline=False))
    # fig = pylab.figure(figsize=(8, 8))
    # ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    # ax1.set_xticks([])
    # ax1.set_yticks([])
    # axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    den = dendrogram(Z, orientation='left')
    idx = den['leaves']
    X = X[idx, :]
    print("X shape:", X.shape)
    genes_ordered = [genes_list[i] for i in idx]
    logger.info("ordered genes: %s", str(genes_ordered))
    # im = axmatrix.matshow(X, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
    # axmatrix.set_xticks([])
    # axmatrix.set_yticks([])
    # # Plot colorbar.
    # axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    # pylab.colorbar(im, cax=axcolor)
    # fig.savefig(output_path)

    # Plotting the heatmap (without the hierarchy)
    heatmap_trace = go.Heatmap(z=X.tolist(), x=df.patient_id, y=genes_ordered,
                               showscale=True, colorscale=colorscale, colorbar=colorbar)
    mutation_load_trace = go.Bar(x=df.patient_id, y=df.somatic_mutations_count / 30.0)
    fig = tls.make_subplots(rows=29, cols=1,
                            specs=[[{'rowspan': 5, 'colspan': 1}]] + [[None]] * 4 +
                                  [[{'rowspan': 24, 'colspan': 1}]] + [[None]] * 23)
    fig.append_trace(mutation_load_trace, 1, 1)
    fig.append_trace(heatmap_trace, 6, 1)
    fig['layout']['xaxis1'].update(showticklabels=False)
    fig['layout']['xaxis1'].update(zeroline=False, showgrid=False)
    fig['layout']['yaxis1'].update(zeroline=False, showgrid=False,
                                   tickfont=dict(family='Arial', size=4))
    fig['layout']['xaxis2'].update(showticklabels=False)
    fig['layout']['xaxis2'].update(zeroline=False, showgrid=False)
    fig['layout']['yaxis2'].update(zeroline=False, showgrid=False,
                                   tickfont=dict(family='Arial', size=4))
    plot(fig, auto_open=False, filename="%s_%s_heatmap_clustered.html" % (output_path, cancer))
def refineEnsemble(ens, lower=.5, upper=10.):
    """Refine a PDB ensemble based on RMSD criteria."""
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform
    from collections import Counter

    ### calculate pairwise RMSDs ###
    RMSD = ens.getRMSDs(pairwise=True)
    # convert the RMSD table to the compressed form
    v = squareform(RMSD)

    ### apply upper threshold ###
    Z_upper = linkage(v, method='complete')
    labels = fcluster(Z_upper, upper, criterion='distance')
    most_common_label = Counter(labels).most_common(1)[0][0]
    I = np.where(labels == most_common_label)[0]

    ### apply lower threshold ###
    Z_lower = linkage(v, method='single')
    labels = fcluster(Z_lower, lower, criterion='distance')
    uniq_labels = np.unique(labels)
    clusters = []
    for label in uniq_labels:
        indices = np.where(labels == label)[0]
        clusters.append(indices)

    J = np.ones(len(clusters), dtype=int) * -1
    rmsd = None
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # find the conformations with the largest coverage
            # (the weight of the ref should be 1)
            weights = [ens[j].getWeights().sum() for j in cluster]
            js = np.where(weights == np.max(weights))[0]
            # in the case where there are multiple structures with the same weight,
            # the one with the smallest rmsd wrt ens._coords is selected.
            if len(js) > 1:
                # rmsd is not calculated unless necessary for the sake of efficiency
                rmsd = ens.getRMSDs() if rmsd is None else rmsd
                j = js[np.argmin(rmsd[js])]
            else:
                j = js[0]
            J[i] = cluster[j]
        else:
            J[i] = cluster[0]

    ### refine ensemble ###
    K = np.intersect1d(I, J)
    reens = ens[K]
    return reens
def clusterData(xdata, rowMethod=True, columnMethod=False, method='average', metric='euclidean'):
    """clusterData clusters the data either by row, by column, or both

    :param xdata: a data dictionary - the one to be transformed
    :type xdata: dict, must contain 'data', 'proteins', 'fractions'
    :param rowMethod: a boolean asking if you want to flip on the rows (proteins get clustered)
    :type rowMethod: bool
    :param columnMethod: a boolean asking if you want to flip on the columns (fractions get clustered)
    :type columnMethod: bool
    :param method: string defining the linkage type, defaults to 'average' - 'ward' might be a good option
    :type method: string
    :param metric: string defining the distance metric, defaults to 'euclidean'
    :type metric: string
    :returns: a data dictionary. 'data', 'proteins', 'fractions', 'fi', 'pi',
        'topDendro', 'rightDendro' are updated
    """
    xdat = xdata.copy()
    x = xdat['data']
    ind1 = xdat['proteins']
    ind2 = xdat['fractions']
    xt = x
    idx1 = None
    idx2 = None
    toReturn = xdat
    Y1 = None
    Y2 = None

    if rowMethod:
        d1 = ssd.pdist(x)
        D1 = ssd.squareform(d1)  # full matrix
        # gene-clustering metric - 'average', 'single', 'centroid', 'complete'
        Y1 = sch.linkage(D1, method=method, metric=metric)
        Z1 = sch.dendrogram(Y1, no_plot=True, orientation='right')
        idx1 = Z1['leaves']
        # apply the clustering for the gene-dendrograms to the actual matrix data
        xt = xt[idx1, :]  # xt is transformed x
        newIndex = []
        for i in idx1:
            newIndex.append(ind1[i])
        toReturn['proteins'] = newIndex
        toReturn['pi'] = idx1

    if columnMethod:
        d2 = ssd.pdist(x.T)
        D2 = ssd.squareform(d2)
        # array-clustering metric - 'average', 'single', 'centroid', 'complete'
        Y2 = sch.linkage(D2, method=method, metric=metric)
        Z2 = sch.dendrogram(Y2, no_plot=True)
        idx2 = Z2['leaves']
        # apply the clustering for the array-dendrograms to the actual matrix data
        xt = xt[:, idx2]
        newIndex = []
        for i in idx2:
            newIndex.append(ind2[i])
        toReturn['fractions'] = newIndex
        toReturn['fi'] = idx2

    toReturn['data'] = xt
    toReturn['topDendro'] = Y2
    toReturn['rightDendro'] = Y1
    return toReturn
def heatmap_v1(self, data_I, row_labels_I, column_labels_I):
    '''Generate a heatmap using pandas and scipy

    DEPRECATED: kept for compatibility with old io methods'''

    """dendrogram documentation:
    Output:
    'color_list': A list of color names. The k'th element represents the color of the k'th link.
    'icoord' and 'dcoord': Each of them is a list of lists. Let icoord = [I1, I2, ..., Ip]
        where Ik = [xk1, xk2, xk3, xk4] and dcoord = [D1, D2, ..., Dp]
        where Dk = [yk1, yk2, yk3, yk4], then the k'th link painted is
        (xk1, yk1) - (xk2, yk2) - (xk3, yk3) - (xk4, yk4).
    'ivl': A list of labels corresponding to the leaf nodes.
    'leaves': For each i, H[i] == j, cluster node j appears in position i in the
        left-to-right traversal of the leaves, where j < 2n-1 and i < n. If j is
        less than n, the i-th leaf node corresponds to an original observation.
        Otherwise, it corresponds to a non-singleton cluster."""

    # parse input into col_labels and row_labels
    # TODO: pandas is not needed for this.
    mets_data = pd.DataFrame(data=data_I, index=row_labels_I, columns=column_labels_I)
    mets_data = mets_data.dropna(how='all').fillna(0.)
    # mets_data = mets_data.replace([np.inf], 10.)
    # mets_data = mets_data.replace([-np.inf], -10.)
    col_labels = list(mets_data.columns)
    row_labels = list(mets_data.index)

    # heatmap data matrix
    heatmap_data = []
    for i, g in enumerate(mets_data.index):
        for j, c in enumerate(mets_data.columns):
            # .ix is removed in modern pandas; .loc does the same label lookup
            # heatmap_data.append({"col": j + 1, "row": i + 1, "value": mets_data.loc[g, c]})
            heatmap_data.append({"col": j, "row": i, "value": mets_data.loc[g, c]})

    # perform the clustering on both the rows and columns
    dm = mets_data
    D1 = squareform(pdist(dm, metric='euclidean'))
    D2 = squareform(pdist(dm.T, metric='euclidean'))

    Y = linkage(D1, method='single')
    Z1 = dendrogram(Y, labels=dm.index)

    Y = linkage(D2, method='single')
    Z2 = dendrogram(Y, labels=dm.columns)

    # parse the output
    hccol = Z2['leaves']  # no hclustering; same as heatmap_data['col']
    hcrow = Z1['leaves']  # no hclustering; same as heatmap_data['row']
    hccolicoord = Z2['icoord']  # no hclustering; same as heatmap_data['col']
    hcrowicoord = Z1['icoord']  # no hclustering; same as heatmap_data['row']
    hccoldcoord = Z2['dcoord']  # no hclustering; same as heatmap_data['col']
    hcrowdcoord = Z1['dcoord']  # no hclustering; same as heatmap_data['row']
    # hccol = [x + 1 for x in hccol]  # hccol index should match heatmap_data index
    # hcrow = [x + 1 for x in hcrow]

    return {'hcrow': hcrow, 'hccol': hccol, 'row_labels': row_labels,
            'col_labels': col_labels, 'heatmap_data': heatmap_data,
            'maxval': max([x['value'] for x in heatmap_data]),
            'minval': min([x['value'] for x in heatmap_data])}
def test_correspond_4_and_up(self):
    # Tests correspond(Z, y) on linkage and CDMs over observation sets of
    # different sizes. Correspondence should be false.
    for (i, j) in (list(zip(list(range(2, 4)), list(range(3, 5)))) +
                   list(zip(list(range(3, 5)), list(range(2, 4))))):
        y = np.random.rand(i * (i - 1) // 2)
        y2 = np.random.rand(j * (j - 1) // 2)
        Z = linkage(y)
        Z2 = linkage(y2)
        assert_equal(correspond(Z, y2), False)
        assert_equal(correspond(Z2, y), False)
def analyzeClusters(n_loops=1, cl=None, sp=None, shuffled=False, spShuff=False):
    results = {}
    n = n_loops
    bins = [i for i in drange(0.0, 1.0, 0.1)]
    total_hist = [0 for i in bins]

    data = win.getData(shuffle=shuffled, class_=cl, spec=sp)
    if spShuff is True:
        win.shuffleIt(data, mode=2)

    Z = hie.linkage(data, method='average', metric='correlation')
    D = hie.dendrogram(Z, orientation='left', no_plot=True)
    total_ys = [0 for d in D['dcoord']]
    total_z = [0 for d in Z[::-1, 2]]
    total_acc = [0 for d in np.diff(Z[::-1, 2], 2)]

    for ii in range(0, n):
        # for loop added to average shuffled results
        # data = win.getData(shuffle=True, class_='J')
        # labels = win.getStudents(class_=classes[0])
        # labels = [str(st.class_) + " " + str(st.spec) for st in labels]
        Z = hie.linkage(data, method='average', metric='correlation')
        D = hie.dendrogram(Z, orientation='left', no_plot=True)
        # print(data[40, :])
        # print(data[42, :])

        # freq method
        ys = [d[1] for d in D['dcoord']]
        total_ys = [a + b for a, b in zip(ys, total_ys)]
        hist, bins = np.histogram(ys, bins=bins)
        total_hist = [a + b for a, b in zip(hist, total_hist)]

        # elbow method (sort of)
        z = Z[::-1, 2]
        total_z = [a + b for a, b in zip(z, total_z)]

        # inv elbow
        acceleration = np.diff(Z[::-1, 2], 2)  # 2nd derivative of distances
        total_acc = [a + b for a, b in zip(acceleration, total_acc)]

        if ii < n - 1:  # don't get new data if there won't be another loop
            data = win.getData(shuffle=shuffled, class_=cl, spec=sp)

    total_hist = [a / n for a in total_hist]
    total_ys = [a / n for a in total_ys]
    total_z = [a / n for a in total_z]
    total_acc = [a / n for a in total_acc]

    results['bins'] = (bins[:-1] + bins[1:]) / 2
    results['hist'] = total_hist
    results['ys'] = total_ys
    results['z'] = total_z
    results['acc'] = total_acc
    return results
def heatmap_plot_zscore_bigneuron(df_zscore_features, df_all, output_dir, title=None):
    print("heatmap plot: bigneuron")

    # taiwan
    metric = 'nt_type'
    mtypes = np.unique(df_all[metric])
    print(mtypes)
    mtypes_pal = sns.color_palette("hls", len(mtypes))
    mtypes_lut = dict(zip(mtypes, mtypes_pal))
    mtypes_colors = df_all[metric].map(mtypes_lut)

    linkage = hierarchy.linkage(df_zscore_features, method='ward', metric='euclidean')

    data = df_zscore_features.transpose()
    row_linkage = hierarchy.linkage(data, method='ward', metric='euclidean')
    feature_order = hierarchy.leaves_list(row_linkage)
    # print(data.index)
    matchIndex = [data.index[x] for x in feature_order]
    # print(matchIndex)
    data = data.reindex(matchIndex)

    pl.figure()
    g = sns.clustermap(data, row_cluster=False, col_linkage=linkage, method='ward',
                       metric='euclidean', linewidths=0.0, col_colors=[mtypes_colors],
                       cmap=sns.cubehelix_palette(light=1, as_cmap=True), figsize=(40, 10))
    pl.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    pl.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    # g.ax_heatmap.set_xticklabels([])
    pl.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.95)  # !!!!!

    if title:
        pl.title(title)

    location = "best"
    num_cols = 1
    # Legend for row and col colors
    for label in mtypes:
        g.ax_row_dendrogram.bar(0, 0, color=mtypes_lut[label], label=label, linewidth=0.0)
    g.ax_row_dendrogram.legend(loc=location, ncol=num_cols, borderpad=0)

    filename = output_dir + '/zscore_feature_heatmap.png'
    pl.savefig(filename, dpi=300)
    # pl.show()
    print("save zscore matrix heatmap figure to: " + filename)
    pl.close()
    print("done clustering and heatmap plotting")
    return linkage
def is_distance_and_linkage_compatible(distance, linkage):
    is_linkage_method_OK(linkage)
    is_distance_metric_OK(distance)

    if distance == 'yule' and linkage != 'single':
        raise ConfigError("The distance metric 'yule' will only work with the linkage 'single' :/")

    try:
        hierarchy.linkage([(1, 0), (0, 1), (1, 1)], metric=distance, method=linkage)
    except Exception as exception:
        raise ConfigError("Someone is upset here: %s" % exception)
def test_correspond_2_and_up(self):
    # Tests correspond(Z, y) on linkage and CDMs over observation sets of
    # different sizes.
    for i in range(2, 4):
        y = np.random.rand(i * (i - 1) // 2)
        Z = linkage(y)
        self.assertTrue(correspond(Z, y))
    for i in range(4, 15, 3):
        y = np.random.rand(i * (i - 1) // 2)
        Z = linkage(y)
        self.assertTrue(correspond(Z, y))
def hierarchical_clustering(data, distance='correlation', method='ward'):
    """ Perform hierarchical clustering on a data or distance matrix.

    Parameters
    ----------
    data : array_like
        Data matrix to cluster, or precomputed distances.
    distance : str
        Distance metric to use. Passed as the `metric` keyword to
        `scipy.spatial.distance.pdist` if not equal to `'precomputed'`.
    method : str
        Linkage method, passed to `scipy.cluster.hierarchy.linkage`,
        default method is `"ward"`.

    Returns
    -------
    row_linkage : numpy.ndarray
        Row linkage matrix.
    col_linkage : numpy.ndarray
        Column linkage matrix.
    """
    symmetric = False
    if distance == 'precomputed':
        try:
            symmetric = np.allclose(data, data.T)
        except ValueError:
            symmetric = False
        if not symmetric:
            raise ValueError('precomputed distance not symmetric')
        row_dist = col_dist = data.copy()
    else:
        try:
            row_dist = scipy_dist.squareform(
                scipy_dist.pdist(data, metric=distance))
            col_dist = scipy_dist.squareform(
                scipy_dist.pdist(data.T, metric=distance))
        except ValueError:
            raise

    row_linkage = scipy_hc.linkage(row_dist, method=method)
    if symmetric:
        col_linkage = row_linkage
    else:
        col_linkage = scipy_hc.linkage(col_dist, method=method)

    return row_linkage, col_linkage
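# A minimal usage sketch for the hierarchical_clustering function above
# (assumes the module imports numpy as np, scipy.spatial.distance as
# scipy_dist and scipy.cluster.hierarchy as scipy_hc, as the body implies):
import numpy as np
import scipy.spatial.distance as scipy_dist

data = np.random.rand(20, 8)  # 20 observations, 8 features
row_Z, col_Z = hierarchical_clustering(data, distance='correlation')

# Or with a precomputed, symmetric distance matrix (rows == columns):
D = scipy_dist.squareform(scipy_dist.pdist(data))
row_Z2, col_Z2 = hierarchical_clustering(D, distance='precomputed')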
def test_correspond_4_and_up_2(self):
    # Tests correspond(Z, y) on linkage and CDMs over observation sets of
    # different sizes. Correspondence should be false.
    for (i, j) in (list(zip(list(range(2, 7)), list(range(16, 21)))) +
                   list(zip(list(range(2, 7)), list(range(16, 21))))):
        y = np.random.rand(i * (i - 1) // 2)
        y2 = np.random.rand(j * (j - 1) // 2)
        Z = linkage(y)
        Z2 = linkage(y2)
        self.assertTrue(correspond(Z, y2) == False)
        self.assertTrue(correspond(Z2, y) == False)
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    row_linkmat, col_linkmat = None, None
    if row:
        distmat = dist.pdist(df, metric)
        row_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]
    if column:
        df = df.T
        distmat = dist.pdist(df, metric)
        col_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(col_linkmat), :].T
    return df, row_linkmat, col_linkmat
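# A minimal usage sketch for the cluster() helper above (assumes
# scipy.spatial.distance is imported as dist and scipy.cluster.hierarchy as
# hier, matching the aliases used inside the function):
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier

frame = pd.DataFrame(np.random.rand(8, 5))
ordered, row_Z, col_Z = cluster(frame, metric="euclidean", method="average")
# `ordered` has rows and columns permuted into dendrogram leaf order;
# row_Z / col_Z are the linkage matrices, e.g. for drawing dendrograms later.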
def test_optimal_leaf_ordering():
    # test with the distance vector y
    Z = optimal_leaf_ordering(linkage(hierarchy_test_data.ytdist),
                              hierarchy_test_data.ytdist)
    expectedZ = hierarchy_test_data.linkage_ytdist_single_olo
    assert_allclose(Z, expectedZ, atol=1e-10)

    # test with the observation matrix X
    Z = optimal_leaf_ordering(linkage(hierarchy_test_data.X, 'ward'),
                              hierarchy_test_data.X)
    expectedZ = hierarchy_test_data.linkage_X_ward_olo
    assert_allclose(Z, expectedZ, atol=1e-06)
def heatmap_fc(self, labels=False, dendro=False):
    """Make heatmap with log2 fold change... function not ready yet

    :param labels:
    :param dendro:
    """
    fig = plt.figure(figsize=(7, 9))
    D = self.matrix.values

    if dendro:
        rectangle1 = (0, 0.2, 0.2, 0.69)
        ax1 = fig.add_axes(rectangle1)
        Y = sch.linkage(D, method='centroid')
        Z1 = sch.dendrogram(Y, orientation='right')
        ax1.set_xticks([])
        ax1.set_yticks([])

        # need to transpose the array so you can sort by RPKM
        Dt = np.transpose(D)

        # Compute and plot the top dendrogram.
        ax2 = fig.add_axes([0.4, 0.9, 0.4, 0.1])
        Y = sch.linkage(Dt, method='single')
        Z2 = sch.dendrogram(Y)
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Reorder the matrix by the dendrogram leaves. Z1/Z2 only exist when
        # dendro is True, so the reordering lives inside this branch.
        idx1 = Z1['leaves']
        idx2 = Z2['leaves']
        D = D[idx1, :]
        D = D[:, idx2]

    # Plot heatmap distance matrix.
    color = plt.cm.jet
    colormap = plt.get_cmap(color)
    axmatrix = fig.add_axes([0.4, 0.2, 0.4, 0.69])
    normal = mpl.colors.Normalize(vmin=np.nanmin(D), vmax=np.nanmax(D))
    im = axmatrix.pcolormesh(D, cmap=colormap, norm=normal, clip_on=True)

    if labels:
        if dendro:
            xlabels = list(self.matrix.columns[i].__str__() for i in idx2)
            ylabels = list(self.matrix.index[i].__str__() for i in idx1)
        else:
            ylabels = list(self.matrix.index)
            xlabels = list(self.matrix.columns)
        axmatrix.set_xticklabels(xlabels, rotation=90, minor=False)
        axmatrix.set_xticks(np.arange(len(xlabels)) + 0.5, minor=False)
        axmatrix.set_yticklabels(ylabels, fontsize='small', minor=False)
        axmatrix.set_yticks(np.arange(len(ylabels)) + 0.5, minor=False)

    plt.ylim(0, self.matrix.shape[0])
    axcolor = fig.add_axes([.85, 0.2, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)
    axcolor.set_title('FC')
def cluster(data):
    pairwise_dists = distance.squareform(distance.pdist(data))
    # cluster
    sch.set_link_color_palette(['black'])
    row_clusters = sch.linkage(pairwise_dists, method='complete')
    # rename row clusters
    # row_clusters = clusters
    # calculate pairwise distances for columns
    col_pairwise_dists = distance.squareform(distance.pdist(data.T))
    # cluster
    col_clusters = sch.linkage(col_pairwise_dists, method='complete')
    return row_clusters, col_clusters
def get_linkage(self, stat_linkage_method):
    # create the data model which is needed as input for the dendrogram.
    # only method 'ward' demands a redundant distance matrix, while the others
    # seem to give different results with a redundant matrix than with a flat
    # one; the latter seems to be correct.
    # see https://github.com/scipy/scipy/issues/2614 (not sure this is still an issue)
    if stat_linkage_method == "ward":
        z = sch.linkage(self, method='ward', metric='euclidean')
    else:
        # creating a flat representation of the dist matrix
        deltas_flat = ssd.squareform(self)
        z = sch.linkage(deltas_flat, method=stat_linkage_method, metric='euclidean')
    return z
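# A short, standalone illustration of the square-vs-condensed point the comment
# above makes (not part of the original class): scipy's linkage expects a
# condensed distance vector, and passing a square matrix makes it treat each
# row as an observation, which silently changes the result.
import numpy as np
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd

points = np.random.rand(5, 3)
square = ssd.squareform(ssd.pdist(points))  # 5x5 redundant matrix
condensed = ssd.squareform(square)          # length-10 condensed vector

Z_condensed = sch.linkage(condensed, method='average')  # correct usage
Z_square = sch.linkage(square, method='average')        # rows treated as observations
print(np.allclose(Z_condensed, Z_square))               # generally False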
def linkage_tree(X, connectivity=None, n_components=None, n_clusters=None,
                 linkage='complete', affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is useful
        to decrease computation time if the number of clusters is not small
        compared to the number of samples. In this case, the complete tree is
        not computed, thus the 'children' output is of limited use, and the
        'parents' output should rather be used. This option is valid only
        when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criteria to use. The linkage criterion determines which
        distance to use between sets of observation.
            - average uses the average of the distances of each observation
              of the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean".
        which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired distance (see metric.pairwise)

    return_distance : bool, default False
        whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf node
        and has children `children_[i - n_samples]`. Alternatively at the
        i-th iteration, children[i][0] and children[i][1] are merged to form
        node `n_samples + i`

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.

    distances : ndarray, shape (n_nodes,)
        Returned when return_distance is set to True.
        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.

    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given'
                         % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). '
                          'The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        # np.int is a deprecated alias; use the concrete np.intp instead
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity = _fix_connectivity(X, connectivity,
                                     n_components=n_components)
    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    # FIXME We compute all the distances, while we could have only computed
    # the "interesting" distances
    distances = paired_distances(X[connectivity.row],
                                 X[connectivity.col],
                                 metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop (xrange is Python 2 only; range works everywhere)
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_components, n_leaves, parent, distances
    return children, n_components, n_leaves, parent
import random

users = range(10000)
purchases = []
for p in range(100000):
    u = random.choice(users)
    p = random.choice(products)  # `products` must be defined earlier in the script
    purchases.append((u, p))

# NOTE: the lines below assume `purchases` is a DataFrame with at least five
# columns; as built above it is a list of (user, product) tuples, so the
# .iloc calls would need a pd.DataFrame(...) wrapper and real feature columns.
X = purchases.iloc[:, [3, 4]].values
y = purchases.iloc[:, 3].values

# sklearn.cross_validation was removed; model_selection is its replacement
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Products')
plt.ylabel('Users')
plt.show()

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)

plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='magenta', label='Cluster 5')
# with n_clusters=5 the labels are 0..4, so a sixth cluster never occurs:
# plt.scatter(X[y_hc == 5, 0], X[y_hc == 5, 1], s=100, c='cyan', label='Cluster 6')
plt.scatter(df.total_salaries, df.total_wins, s=60, c=labels)
# This one looks better, in my opinion, with 4 clusters, one of which is only NY
# Of K-means and DBSCAN, DBSCAN is better at identifying outliers

############
# Dendrogram
############

from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, fclusterdata

distanceMatrix = pdist(data)

# print dendrogram
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=1,
                  leaf_font_size=10,
                  labels=df.teamID.tolist())

# This gives us 7 clusters
# let's set the cutoff at 2 for 4 clusters
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=2,
                  leaf_font_size=10,
                  labels=df.teamID.tolist())

# get cluster assignments
assignments = fcluster(linkage(distanceMatrix, method='complete'), 2, 'distance')
for i in range(len(x)):
    X = x[i][0]
    Y = x[i][1]
    plt.scatter(X, Y)
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.title('The raw dataset')

Labels = range(1, 11)  # Labeling the points

# Let's plot the dendrogram for our data points; we must use the SciPy library
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

linked = linkage(x, 'single')  # Determine whether this is single, complete or average linkage clustering
labelList = range(1, 11)

plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.xlabel('point labels')
plt.ylabel('The distance and the cluster trees')
plt.show()

from sklearn.cluster import AgglomerativeClustering
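# The import above is left dangling in the source; a minimal continuation
# sketch (not from the original) would cluster the same points with
# scikit-learn and color the scatter plot by the resulting labels:
cluster = AgglomerativeClustering(n_clusters=2, linkage='single')
cluster_labels = cluster.fit_predict(x)
plt.scatter([p[0] for p in x], [p[1] for p in x], c=cluster_labels)
plt.show()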
'''We are going to continue the investigation into the sightings of legendary
Pokémon from the previous exercise. Remember that in the scatter plot of the
previous exercise, you identified two areas where Pokémon sightings were dense.
This means that the points seem to separate into two clusters. In this
exercise, you will form two clusters of the sightings using hierarchical
clustering. 'x' and 'y' are columns of X and Y coordinates of the locations of
sightings, stored in a Pandas data frame, df. The following are available for
use: matplotlib.pyplot as plt, seaborn as sns, and pandas as pd.'''

import pandas as pd

x = [9, 6, 2, 3, 1, 7, 1, 6, 1, 7, 23, 26, 25, 23, 21, 23, 23, 20, 30, 23]
y = [8, 4, 10, 6, 0, 4, 10, 10, 6, 1, 29, 25, 30, 29, 29, 30, 25, 27, 26, 30]
df = pd.DataFrame({'x': x, 'y': y})

# Import linkage and fcluster functions
from scipy.cluster.hierarchy import linkage, fcluster

# Use the linkage() function to compute distances
Z = linkage(df, 'ward')

# Generate cluster labels
df['cluster_labels'] = fcluster(Z, 2, criterion='maxclust')

# Plot the points with seaborn
import matplotlib.pyplot as plt
import seaborn as sns
sns.scatterplot(x='x', y='y', hue='cluster_labels', data=df)
plt.show()
        z = tmp[:, 2] + (rxyz[i, 2] * csize)
        tmp = np.column_stack([x, y, z])
        cls = np.vstack([cls, tmp])
    return cls

# Generate a cluster of clusters and distance matrix.
cls = clusters()
D = pdist(cls[:, 0:2])
D = squareform(D)

# Compute and plot first dendrogram.
fig = mpl.figure(figsize=(8, 8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y1 = hy.linkage(D, method='complete')
cutoff = 0.3 * np.max(Y1[:, 2])
Z1 = hy.dendrogram(Y1, orientation='right', color_threshold=cutoff)
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y2 = hy.linkage(D, method='average')
cutoff = 0.3 * np.max(Y2[:, 2])
Z2 = hy.dendrogram(Y2, color_threshold=cutoff)
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

# Plot distance matrix.
ax3 = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
        s2 = samples[j]
        # a[i][j] = a[j][i] = len(s2snps[s1].intersection(s2snps[s2]))
        a[i][j] = a[j][i] = len(s2snps[s1].symmetric_difference(s2snps[s2]))

np.savetxt(sys.stdout, a, delimiter="\t",
           header="\t".join(samples[:len(s2snps)]), fmt='%i')

sys.stderr.write("Plotting...\n")
D = a

# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8, 8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y = sch.linkage(D, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
# ax1.set_yticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y = sch.linkage(D, method='single')
Z2 = sch.dendrogram(Y)
# ax2.set_xticks([])
ax2.set_yticks([])
fig.savefig('dendrogram.svg')

# Plot distance matrix.
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
def sample_from_corrgan(model_loc, dim=10, n_samples=1):
    # pylint: disable=import-outside-toplevel, disable=too-many-locals
    """
    Samples correlation matrices from the pre-trained CorrGAN network.

    It is reproduced with modifications from the following paper:
    `Marti, G., 2020, May. CorrGAN: Sampling Realistic Financial Correlation Matrices Using
    Generative Adversarial Networks. In ICASSP 2020-2020 IEEE International Conference on
    Acoustics, Speech and Signal Processing (ICASSP) (pp. 8459-8463). IEEE.
    <https://arxiv.org/pdf/1910.09504.pdf>`_

    It loads the appropriate CorrGAN model for the required dimension. Generates a matrix output
    from this network. Symmetrizes this matrix and finds the nearest correlation matrix that is
    positive semi-definite. Finally, it maximizes the sum of the similarities between adjacent
    leaves to arrange it with hierarchical clustering.

    The CorrGAN network was trained on the correlation profiles of the S&P 500 stocks.
    Therefore the output retains these properties. In addition, the final output retains the
    following 6 stylized facts:

    1. Distribution of pairwise correlations is significantly shifted to the positive.
    2. Eigenvalues follow the Marchenko-Pastur distribution, but for a very large first
       eigenvalue (the market).
    3. Eigenvalues follow the Marchenko-Pastur distribution, but for a couple of other large
       eigenvalues (industries).
    4. Perron-Frobenius property (first eigenvector has positive entries).
    5. Hierarchical structure of correlations.
    6. Scale-free property of the corresponding Minimum Spanning Tree (MST).

    :param model_loc: (str) Location of folder containing CorrGAN models.
    :param dim: (int) Dimension of correlation matrix to sample. In the range [2, 200].
    :param n_samples: (int) Number of samples to generate.
    :return: (np.array) Sampled correlation matrices of shape (n_samples, dim, dim).
    """
    # Import here needed to prevent unnecessary imports in other parts of code.
    import tensorflow as tf

    # Validate dimension.
    if not (1 < dim <= 200):
        raise ValueError("Dimension not supported, {}".format(dim))

    # Resulting correlation matrices.
    nearest_corr_mats = []

    # Load generator model closest to the required dimension by looking at the models folder.
    dimension_from_folder = [int(f.split("_")[1][:-1]) for f in listdir(model_loc)
                             if not path.isfile(path.join(model_loc, f))]
    all_generator_dimensions = np.sort(dimension_from_folder)
    closest_dimension = next(filter(lambda i: i >= dim, all_generator_dimensions))

    # Load model.
    generator = tf.keras.models.load_model(
        "{}/generator_{}d".format(model_loc, closest_dimension), compile=False)

    # Sample from generator. Input dimension based on network.
    noise_dim = generator.layers[0].input_shape[1]
    noise = tf.random.normal([n_samples, noise_dim])
    generated_mat = generator(noise, training=False)

    # Get the indices of an upper triangular matrix.
    tri_rows, tri_cols = np.triu_indices(dim, k=1)

    # For each sample generated, make them strict correlation matrices
    # by projecting them on the nearest correlation matrix using Higham's
    # alternating projections method.
    for i in range(n_samples):
        # Grab only the required dimensions from generated matrix.
        corr_mat = np.array(generated_mat[i, :dim, :dim, 0])

        # Set diagonal to 1 and symmetrize.
        np.fill_diagonal(corr_mat, 1)
        corr_mat[tri_cols, tri_rows] = corr_mat[tri_rows, tri_cols]

        # Get nearest correlation matrix that is positive semi-definite.
        nearest_corr_mat = corr_nearest(corr_mat)

        # Set diagonal to 1 and symmetrize.
        np.fill_diagonal(nearest_corr_mat, 1)
        nearest_corr_mat[tri_cols, tri_rows] = nearest_corr_mat[tri_rows, tri_cols]

        # Arrange with hierarchical clustering by maximizing the sum of the
        # similarities between adjacent leaves.
        dist = 1 - nearest_corr_mat
        linkage_mat = hierarchy.linkage(dist[tri_rows, tri_cols], method="ward")
        optimal_leaves = hierarchy.optimal_leaf_ordering(linkage_mat, dist[tri_rows, tri_cols])
        optimal_ordering = hierarchy.leaves_list(optimal_leaves)
        ordered_corr = nearest_corr_mat[optimal_ordering, :][:, optimal_ordering]
        nearest_corr_mats.append(ordered_corr)

    return np.array(nearest_corr_mats)
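# A hypothetical invocation of sample_from_corrgan; the folder name and its
# "generator_<dim>d" layout are assumptions based on the loading code above:
# corr_mats = sample_from_corrgan(model_loc="corrgan_models", dim=50, n_samples=4)
# corr_mats.shape  # -> (4, 50, 50)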
        row += 1
    col += 1

joint_num = 0
Dis_all = np.zeros((motion_num, motion_num))
for dis_Mat in Dis_Mat_list:
    Dis_all += dis_Mat
    df_dis = pd.DataFrame(dis_Mat, columns=namelist, index=namelist)
    df_dis.to_csv("/home/kei/document/experiments/Master2/AJ_result/"
                  + OpenPoseJoint[joint_num] + "_dis.csv")
    joint_num += 1

Dis_all = pd.DataFrame(Dis_all, columns=namelist, index=namelist)
Dis_all.to_csv("/home/kei/document/experiments/Master2/AJ_result/Distance.csv")

Distance = Dis_all.values
print(Distance)
darray = distance.squareform(Distance)
result = linkage(darray, method="average")

plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams['font.size'] = 14  # set the font size
dendrogram(result, labels=namelist)
plt.ylabel("distance")
# plt.show()
# plt.savefig("/home/kei/document/experiments/Master/UJ_result/elder.png")
plt.cla()

NUM_CLUSTERS_RANGE = range(2, 24)
silhouette_coefficient = []
davies_bouldin_index = []

plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient')
plt.rcParams["ytick.direction"] = "in"
print("Read expression in") ase_rows, ase_cols = ase.shape ase = ase.ix[np.sum(np.isfinite(ase), axis=1) > .75 * ase_cols] ase = ase.ix[ase.index.intersection(all_expr.index)].dropna(axis='columns', how='all') all_expr = all_expr.ix[ase.index] all_expr_lognorm = np.log(all_expr + 1).divide( np.log(all_expr.max(axis=1) + 1), axis=0) print("Precalculating distances") metric = DistributionDifference.earth_mover_multi dist_mat = DistributionDifference.mp_pandas_pdist(ase + eps, metric) Z = hierarchy.linkage(dist_mat, method='weighted') make_treeview_files( "analysis/results/all_log_normed_" + is_sparse + metric.__name__, all_expr_lognorm, Z) make_treeview_files("analysis/results/all_" + is_sparse + metric.__name__, all_expr, Z) make_treeview_files("analysis/results/ase_" + is_sparse + metric.__name__, ase, Z) make_treeview_files( "analysis/results/all_maxnorm_" + is_sparse + metric.__name__, all_expr.divide(all_expr.max(axis=1) + 1, axis=0), Z)
                          col_colors=colors_list,
                          xticklabels=False, yticklabels=False)

# title
plotitle = (str(total_samples) + ' samples clustered by ' + str(len(att_IDS))
            + ' attractors' + ', genes = ' + str(total_genes))
att_heatmap.fig.suptitle(plotitle, fontsize=18)

# saves figure
# att_heatmap.plot  (no-op attribute access in the original)
plt.savefig('attractors_heatmap.png', format='png', dpi=300)
plt.clf()

# plots attractors dendrogram:
from scipy.cluster import hierarchy

# computes distance between samples
datts = hierarchy.linkage(U_attractors.T, metric='euclidean')

# plots the dendrogram
plt.title('Hierarchical Clustering Dendrogram')
plt.ylabel('attractors')
plt.xlabel('distance [Euclidean]')
hierarchy.set_link_color_palette(None)
hierarchy.dendrogram(datts, labels=U_attractors.columns, leaf_rotation=0, orientation='left')
plt.savefig('attractors_dendrogram.png', format='png', dpi=300)
plt.clf()

# plots attractor stacked bar plot for sample type content
vl_title = 'Attractor/type' + ', ARI = ' + str(ARI) + ', AMI =' + str(AMI)
att_content = samples_to_attractors[['type', 'attractor']]
def DrawSHC(samples, labels):
    plt.title("Customer Dendrograms")
    dend = shc.dendrogram(shc.linkage(samples, method='ward'))
    plt.show()
def makeDendro(flatDist, labels, meta):
    clusters = sciHi.linkage(flatDist, metric=distMetric, method='average')
    print("Linkage:")
    print(clusters)

    plt.subplot(20, 1, (1, 15))
    plt.rcParams['lines.linewidth'] = 0.6
    dendro = sciHi.dendrogram(clusters, labels=labels)
    for i in dendro:
        print(i, dendro[i])
    ax = plt.gca()
    plt.setp(ax.get_xticklabels(), visible=False)

    plt.subplot(20, 1, 16)
    genders = [genderToNum(meta.loc[meta['Sample_Name'] == ind, 'Delivery_Sex'].values[0])
               for ind in dendro['ivl']]
    plt.imshow([genders] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 17)
    smoking = [smokingToNum(meta.loc[meta['Sample_Name'] == ind, 'Patient_tobacco_now'].values[0])
               for ind in dendro['ivl']]
    plt.imshow([smoking] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 18)
    ga = [m.trunc(meta.loc[meta['Sample_Name'] == ind, 'Delivery_week_at_delivery'].values[0])
          for ind in dendro['ivl']]
    plt.imshow([ga] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 19)
    gd = [dgToNum(meta.loc[meta['Sample_Name'] == ind, 'DG'].values[0])
          for ind in dendro['ivl']]
    plt.imshow([gd] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 20)
    baby_weight = [weightToNum(meta.loc[meta['Sample_Name'] == ind,
                                        'SGA, AGA ou LGA (par rapport au poids)'].values[0])
                   for ind in dendro['ivl']]
    plt.imshow([baby_weight] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.savefig('%sdendro_test_%s.svg' % (savePath, distDataPath.split('/')[-1]), format='svg')
    # dendro = sciHi.dendrogram(clusters, truncate_mode='level', p=20)
    # plt.savefig('%sdendro_p20_%s.svg' % (savePath, distDataPath.split('/')[-1]), format='svg')
    plt.close()
    return dendro
def cluster_ssh(sla, lat, lon, nclusters, distthres=3000, returnall=False):
    # Remove All NaN Points
    ntime, nlat, nlon = sla.shape
    slars = sla.reshape(ntime, nlat * nlon)
    okdata, knan, okpts = proc.find_nan(slars, 0)
    npts = okdata.shape[1]

    # ---------------------------------------------
    # Calculate Correlation and Covariance Matrices
    # ---------------------------------------------
    srho = np.corrcoef(okdata.T, okdata.T)
    scov = np.cov(okdata.T, okdata.T)
    srho = srho[:npts, :npts]
    scov = scov[:npts, :npts]

    # --------------------------
    # Calculate Distance Matrix
    # --------------------------
    lonmesh, latmesh = np.meshgrid(lon, lat)
    coords = np.vstack([lonmesh.flatten(), latmesh.flatten()]).T
    coords = coords[okpts, :]
    coords1 = coords.copy()
    coords2 = np.zeros(coords1.shape)
    coords2[:, 0] = np.radians(coords1[:, 1])  # First point is latitude
    coords2[:, 1] = np.radians(coords1[:, 0])  # Second point is longitude
    sdist = haversine_distances(coords2, coords2) * 6371

    # --------------------------
    # Combine the Matrices
    # --------------------------
    a_fac = np.sqrt(-distthres / (2 * np.log(0.5)))  # Calculate so exp=0.5 when distance is 3000km
    expterm = np.exp(-sdist / (2 * a_fac ** 2))
    distance_matrix = 1 - expterm * srho

    # --------------------------
    # Do Clustering (scipy)
    # --------------------------
    cdist = squareform(distance_matrix, checks=False)
    linked = linkage(cdist, 'weighted')
    clusterout = fcluster(linked, nclusters, criterion='maxclust')

    # -------------------------
    # Calculate the uncertainty
    # -------------------------
    uncertout = np.zeros(clusterout.shape)
    for i in range(len(clusterout)):
        covpt = scov[i, :]
        cid = clusterout[i]
        covin = covpt[np.where(clusterout == cid)]
        covout = covpt[np.where(clusterout != cid)]
        uncertout[i] = np.mean(covin) / np.mean(covout)

    # Apply rules from Thompson and Merrifield (Do this later)
    # if uncert > 2, set to 2
    # if uncert < 0.5, set to 0
    # uncertout[uncertout > 2] = 2
    # uncertout[uncertout < 0.5] = 0

    # -----------------------
    # Replace into full array
    # -----------------------
    clustered = np.zeros(nlat * nlon) * np.nan
    clustered[okpts] = clusterout
    clustered = clustered.reshape(nlat, nlon)

    cluster_count = []
    for i in range(nclusters):
        cid = i + 1
        cnt = (clustered == cid).sum()
        cluster_count.append(cnt)
        print("Found %i points in cluster %i" % (cnt, cid))

    uncert = np.zeros(nlat * nlon) * np.nan
    uncert[okpts] = uncertout
    uncert = uncert.reshape(nlat, nlon)

    if returnall:
        return clustered, uncert, cluster_count, srho, scov, sdist, distance_matrix
    return clustered, uncert, cluster_count
        elif (truth[i] != truth[j]) and (predicted[i] != predicted[j]):
            disagree_same += 1
        count += 1
    return (agree_same + disagree_same) / float(count)


# Code Sample
import scipy.cluster.hierarchy as sch
import numpy as np
import pylab as pl

# Plot dendrogram and cut the tree to find resulting clusters
fig = pl.figure()
data = np.array([[1, 2, 3], [1, 1, 1], [5, 5, 5]])
datalable = ['first', 'second', 'third']
hClsMat = sch.linkage(data, method='complete')  # Complete clustering
sch.dendrogram(hClsMat, labels=datalable, leaf_rotation=45)
fig.savefig("thing.pdf")
resultingClusters = sch.fcluster(hClsMat, t=3, criterion='distance')
print(resultingClusters)

# Your code starts from here ....

# 1.
# Scaling min max
# STUDENT CODE TODO

# 2.
# K-means http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# STUDENT CODE TODO
def step5(max_d):
    global eventL, notCombineRDDL, resultEventL, resultRDDL, outputPath, specialNum
    # Vectorize the text
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=my_tokenizer,
                                 preprocessor=None,
                                 stop_words=['*'],
                                 max_features=10000)
    train_data_features = vectorizer.fit_transform(eventL)
    train_data_features = train_data_features.toarray()

    # Hierarchical clustering
    Z = linkage(train_data_features, 'complete', 'cityblock')
    # c, coph_dists = cophenet(Z, pdist(train_data_features))
    # print('The goodness of cluster result:', c)
    clusters = fcluster(Z, max_d, criterion='distance')

    # Initialize the RDD list and event list
    resultEventLL = []
    resultRDDLL = []
    numCombinedEvents = max(clusters)
    for i in range(numCombinedEvents):
        resultRDDLL.append([])
        resultEventLL.append([])

    # Put events/RDDs that belong to the same cluster into the same list
    currentEventNum = 0
    for clusterNum in clusters:
        resultRDDLL[clusterNum - 1].append(notCombineRDDL[currentEventNum])
        resultEventLL[clusterNum - 1].append(eventL[currentEventNum])
        currentEventNum += 1

    # Merge the events/RDDs in the same list
    for sameEventL in resultEventLL:
        if len(sameEventL) == 1:
            resultEventL.append(sameEventL[0])
        else:
            combinedEvent = sameEventL[0].strip().split()
            for currentEvent in sameEventL[1:]:
                combinedEvent = LCS(combinedEvent, currentEvent.strip().split())
            resultEventL.append(' '.join(combinedEvent))
    for sameRDDL in resultRDDLL:
        if len(sameRDDL) == 1:
            resultRDDL.append(sameRDDL[0])
        else:
            resultRDDL.append(sc.union(sameRDDL))
        resultRDDL[-1].map(lambda id_log: id_log[0]).saveAsTextFile(
            outputPath + str(len(resultRDDL) + specialNum))
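# The cophenetic check commented out in step5 can be run as below; the toy
# matrix stands in for train_data_features, and the cityblock metric matches
# the one passed to linkage. Values of c close to 1 mean the tree preserves
# the original pairwise distances well.
import numpy as np
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import pdist

X = np.random.rand(10, 5)
Z = linkage(X, 'complete', 'cityblock')
c, coph_dists = cophenet(Z, pdist(X, 'cityblock'))
print('The goodness of cluster result:', c)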
def batch_process_aggregate(folder_path: str, group_criteria: float) -> List[dict]:
    """
    Read all the labeled defects from ./rect.json, aggregate close ones and
    write the aggregated defect information to ./defects.json.
    :param folder_path: folder path of the raw IR images
    :param group_criteria: in meters; two defects closer than this are aggregated
    :return: list of information about defects
    """
    with open(join(folder_path, "exif.json"), "r") as f:
        exif = json.load(f)
    with open(join(folder_path, "rect.json"), "r") as f:
        rect_info = json.load(f)
    rects = list()
    for d in rect_info:
        for rect in d.get("rects"):
            rect.update({"height": d.get("height"),
                         "width": d.get("width"),
                         "image": d.get("image")})
            rects.append(rect)
    group_ids = set([x.get("panel_group_id") for x in rects])
    defect_num = 0
    defects = list()
    for group_id in group_ids:
        rects_match_id = [x for x in rects if x.get("panel_group_id") == group_id]
        if len(rects_match_id) == 1:
            cluster = [0]
        else:
            pixel_location_table = np.array([[x.get("easting"), x.get("northing")]
                                             for x in rects_match_id])
            linkage_matrix = linkage(pixel_location_table, method='single',
                                     metric='chebyshev')
            ctree = cut_tree(linkage_matrix, height=[group_criteria])
            cluster = np.array([x[0] for x in ctree])
        for i in range(len(rects_match_id)):
            rects_match_id[i].update(
                {"defectId": "DEF{:05d}".format(cluster[i] + defect_num)})
        defect_num += max(cluster) + 1
        defect_id_set = set([x.get("defectId") for x in rects_match_id])
        for defect_id in defect_id_set:
            defect = {"defectId": defect_id,
                      "panelGroupId": group_id,
                      "category": DefectCategory.UNCONFIRMED}
            rect_match_defect = [x for x in rects_match_id
                                 if x.get("defectId") == defect_id]
            easting = float(np.mean([x.get("easting") for x in rect_match_defect]))
            northing = float(np.mean([x.get("northing") for x in rect_match_defect]))
            # Mean severity over the rects belonging to this defect
            severity = float(np.mean([x.get("severity") for x in rect_match_defect]))
            utm_zone = rects_match_id[0].get("utm_zone")
            lat, lng = utm.to_latlon(easting, northing, utm_zone, northern=True)
            defect.update({"lat": lat, "lng": lng,
                           "utmEasting": easting, "utmNorthing": northing,
                           "utmZone": utm_zone, "severity": severity})
            defect.update({"rects": [x for x in rect_match_defect]})
            defects.append(defect)
    with open(join(folder_path, "defects.json"), "w") as f:
        json.dump(defects, f)
    return defects
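# A minimal illustration of the cut_tree call used above: cutting a
# single-linkage tree at a fixed height groups points whose chebyshev
# distance is below that height, which is how group_criteria acts as a
# merge radius in meters.
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree

pts = np.array([[0.0, 0.0], [0.5, 0.2], [10.0, 10.0]])
Z = linkage(pts, method='single', metric='chebyshev')
print(cut_tree(Z, height=[1.0]).flatten())  # first two points share a cluster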
def link_clusters(self, distances: numpy.ndarray, num: int) -> pandas.DataFrame:
    Z = hierarchy.linkage(distances, method=self.linkage_method,
                          optimal_ordering=True)
    return format_linkage_matrix(Z, num)
def evaluate_distance_matrix(distanceMatrix, trueClusters, clusteringType, **kwargs):
    # TODO: 1. clear blackList dependency
    #       2. "clustering type" is an unlucky name for betaCV and the like.
    trueClusterNum = len(np.unique(trueClusters))
    # distanceMatrixCopy = np.copy(distanceMatrix)
    if clusteringType == 'all' or 'betaCV' in clusteringType:
        res = beta_cv(distanceMatrix, trueClusters, blackList=None, ranks=False)
        print("Beta-CV = %f" % (res,))
    if clusteringType == 'all' or 'cIndex' in clusteringType:
        res = c_index(distanceMatrix, trueClusters, blackList=None)
        print("C-Index = %f" % (res,))
    if clusteringType == 'all' or 'silhouette' in clusteringType:
        print("Silhouette = %f" % (metrics.silhouette_score(
            distanceMatrix, trueClusters, metric='precomputed'),))
    if clusteringType == 'all' or 'hierarchical' in clusteringType:
        print("\nEvaluating **Hierarchical Clustering**")
        distArray = ssd.squareform(distanceMatrix)
        linkageFunction = kwargs.get('linkage', "complete")
        print("Linkage = " + linkageFunction)
        Z = hierarchy.linkage(distArray, method=linkageFunction)
        T = hierarchy.fcluster(Z, trueClusterNum, criterion="maxclust")
        if len(np.unique(T)) != trueClusterNum:
            print("!Clusters found: " + str(len(np.unique(T))))
        res = evaluate_unsup_clustering(trueClusters, T, None, verbose=True)
    if clusteringType == 'all' or 'affinity' in clusteringType:
        print("\nEvaluating **Affinity Propagation**")
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        cluster_centers_indices, labels = sklearn_cluster.affinity_propagation(
            affinities, copy=False, verbose=True)
        res = evaluate_unsup_clustering(trueClusters, labels,
                                        len(cluster_centers_indices),
                                        verbose=True)
    if clusteringType == 'all' or "dbscan" in clusteringType:
        print("\nEvaluating **DBScan Clustering**")
        # TODO: maybe adapt eps
        eps = np.percentile(distanceMatrix, 5)
        predictedLabels = sklearn_cluster.DBSCAN(
            eps, metric='precomputed').fit_predict(distanceMatrix)
        print("Predicted as Noise: " + str(np.sum(predictedLabels == -1)))
        res = evaluate_unsup_clustering(trueClusters, predictedLabels,
                                        len(np.unique(predictedLabels)),
                                        verbose=True)
    if clusteringType == 'all' or "spectral" in clusteringType:
        print("\nEvaluating **Spectral (with Normalized Laplacian) Clustering**")
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        # arpack was chosen for stability reasons.
        classifier = sklearn_cluster.SpectralClustering(
            n_clusters=trueClusterNum, affinity='precomputed',
            assign_labels='kmeans', eigen_solver='arpack')
        classifier.fit(affinities)
        res = evaluate_unsup_clustering(trueClusters, classifier.labels_,
                                        None, verbose=True)
    # assert(np.all(distanceMatrixCopy == distanceMatrix))
    return res
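# The distance-to-affinity conversion used twice above (for affinity
# propagation and spectral clustering), isolated as a helper: a Gaussian
# kernel whose bandwidth is the median pairwise distance.
import numpy as np

def distances_to_affinities(distance_matrix):
    sigma = np.median(distance_matrix)
    return np.exp(-(distance_matrix**2) / (2 * sigma**2))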
k = np.max([
    np.where(pacf(consommation.loc[:, colname]) < 0)[0][0]
    for colname, col in consommation.items()
])
DM_GCC = np.zeros((consommation.shape[1], consommation.shape[1]))
for i, j in itertools.combinations(range(consommation.shape[1]), 2):
    DM_GCC[i, j] = DM_GCC[j, i] = 1 - helpers.get_GCC(
        consommation.iloc[:, i], consommation.iloc[:, j], k)
DM_GCC = pd.DataFrame(DM_GCC, index=consommation.columns,
                      columns=consommation.columns)
# sns.clustermap(consommation, col_linkage=hcl.linkage(squareform(DM_GCC)))
plt.figure()
hcl.dendrogram(hcl.linkage(squareform(DM_GCC), method="average"))

# Count the clusters obtained at each distance threshold t
plt.figure()
plt.plot(
    np.arange(.1, 1.1, .1),
    np.array([
        np.unique(
            hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
                         t=t, criterion="distance")).shape[0]
        for t in np.arange(0.1, 1.1, 0.1)
    ]))
hcl.fcluster(hcl.linkage(squareform(DM_GCC), method="average"),
             t=0.4, criterion="distance")
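# A reminder of why DM_GCC goes through squareform first: linkage expects a
# condensed distance vector, not a square matrix. A tiny hedged example:
import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage

DM = np.array([[0.0, 0.3, 0.8],
               [0.3, 0.0, 0.6],
               [0.8, 0.6, 0.0]])
Z = linkage(squareform(DM), method="average")  # condensed form: 3 values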
# SciPy >> spatial.distance module >> pdist function
from scipy.spatial.distance import pdist, squareform

row_dist = pd.DataFrame(squareform(pdist(df, metric="euclidean")),
                        columns=labels, index=labels)
row_dist

# In[ ]:

# Agglomerative clustering >> scipy.cluster.hierarchy submodule >> linkage function
from scipy.cluster.hierarchy import linkage
help(linkage)

# In[ ]:

row_clusters = linkage(pdist(df, metric="euclidean"), method="complete")
pd.DataFrame(row_clusters,
             columns=["row label 1", "row label 2", "distance", "No."],
             index=["cluster of {}".format(i + 1)
                    for i in range(row_clusters.shape[0])])

# In[ ]:

from scipy.cluster.hierarchy import dendrogram

row_dendr = dendrogram(row_clusters, labels=labels)
plt.ylabel("Euclidean distance")
plt.tight_layout()
plt.show()
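# In[ ]:

# A caution implied by the cells above: passing the *square* row_dist to
# linkage would make SciPy treat it as an observation matrix, so the
# condensed pdist vector is the correct input.
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage

correct = linkage(pdist(df, metric="euclidean"), method="complete")
# wrong = linkage(row_dist, method="complete")  # silently mis-clustered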
labelList = range(len(x))

plt.figure(figsize=(10, 7))
dendrogram(centr,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram using the centroid method')
plt.show()

# =============================================================================
# WARNING! There are more clustering techniques
# =============================================================================
linked = linkage(df, 'single')
labelList = range(len(x))

plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram using single linkage')
plt.show()

# =============================================================================
# DATA with different units --- WITHOUT NORMALIZING
# =============================================================================
# We change the values of one axis
def hecheng(a, b):
    # Max-min composition of two fuzzy relation matrices
    m, N = a.shape
    n = b.shape[1]
    c = zeros((m, n))
    for i in range(m):
        for j in range(n):
            c[i, j] = max([min(a[i, k], b[k, j]) for k in range(N)])
    return c


a = array([[5, 5, 3, 2], [2, 3, 4, 5], [5, 5, 2, 3],
           [1, 5, 3, 1], [2, 4, 5, 1]])
d = array([[sum(abs(a[i] - a[j])) for i in range(5)] for j in range(5)])
r = 1 - 0.1 * d
print(r)
tr = hecheng(r, r)
# Iterate the composition until the transitive closure is reached
while abs(r - tr).sum() > 0.00001:
    r = tr
    tr = hecheng(r, r)
print('\n------------------------\n', tr)
d2 = 1 - tr  # convert back to distances for plotting
d2 = triu(d2, 1)
d2 = d2[d2 != 0]  # extract the nonzero upper-triangle entries of the matrix
z = sch.linkage(d2)
s = ['I', 'II', 'III', 'IV', 'V']
sch.dendrogram(z, labels=s)  # draw the cluster tree
plt.yticks([])  # hide the y axis
plt.show()
# Remove the x ticks, y ticks, x and y axis
plt.xticks([])
plt.yticks([])
# plt.axis('off')

# Display the plot of the original data before clustering
plt.scatter(X1[:, 0], X1[:, 1], marker='.')
plt.show()

dist_matrix = distance_matrix(X1, X1)
print(dist_matrix)
# Note: linkage expects a condensed distance vector; passing the square matrix
# triggers SciPy's "uncondensed distance matrix" warning, and
# squareform(dist_matrix) would avoid it.
Z = hierarchy.linkage(dist_matrix, 'complete')
dendro = hierarchy.dendrogram(Z)

filename = 'cars_clus.csv'

# Read the csv
pdf = pd.read_csv(filename)
print("Shape of dataset before cleaning: ", pdf.size)

numeric_cols = ['sales', 'resale', 'type', 'price', 'engine_s', 'horsepow',
                'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
                'mpg', 'lnsales']
pdf[numeric_cols] = pdf[numeric_cols].apply(pd.to_numeric, errors='coerce')
pdf = pdf.dropna()
pdf = pdf.reset_index(drop=True)
    print(' %g%% of total patterns' % (100 * len(inds) / len(ids_clusters)))
    for real_class in unique_y:
        clustered = (list(y[inds])).count(real_class)
        total = len(y)
        print(real_class, ":", (clustered / total) * 100)

# ### Hierarchical clustering using ward

# In[87]:

from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Hierarchical clustering, Ward linkage:
ward_cluster = linkage(NX, 'ward')
unique_y = np.unique(y)
ids_clusters = fcluster(ward_cluster,
                        5,  # number of final clusters
                        criterion='maxclust') - 1
for i in np.unique(ids_clusters):
    inds = (np.where(np.array(ids_clusters) == i))[0]
    print('\033[1m' + '- Cluster %d' % i + '\033[0m')
    print(' %g%% of total patterns' % (100 * len(inds) / len(ids_clusters)))
    for real_class in unique_y:
        clustered = (list(y[inds])).count(real_class)
        total = len(y)
        print(real_class, ":", (clustered / total) * 100)
    print()
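# In[ ]:

# The per-cluster class percentages printed above can also be read off a
# contingency table; a hedged alternative using pandas:
import pandas as pd

composition = pd.crosstab(ids_clusters, y, normalize='index') * 100
print(composition)  # rows: clusters, columns: true classes, values: %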
def hierarchical(data=None, k=0, linkage='average', metric='euclidean',
                 metric_args=None):
    """Perform clustering using hierarchical agglomerative algorithms.

    Parameters
    ----------
    data : array
        An m by n array of m data samples in an n-dimensional space.
    k : int, optional
        Number of clusters to extract; if 0 uses the life-time criterion.
    linkage : str, optional
        Linkage criterion; one of 'average', 'centroid', 'complete', 'median',
        'single', 'ward', or 'weighted'.
    metric : str, optional
        Distance metric (see 'biosppy.metrics').
    metric_args : dict, optional
        Additional keyword arguments to pass to the distance function.

    Returns
    -------
    clusters : dict
        Dictionary with the sample indices (rows from 'data') for each found
        cluster; outliers have key -1; clusters are assigned integer keys
        starting at 0.

    Raises
    ------
    TypeError
        If 'metric' is not a string.
    ValueError
        When the 'linkage' is unknown.
    ValueError
        When 'metric' is not 'euclidean' when using 'centroid', 'median',
        or 'ward' linkage.
    ValueError
        When 'k' is larger than the number of data samples.

    """

    # check inputs
    if data is None:
        raise TypeError("Please specify input data.")

    if linkage not in ['average', 'centroid', 'complete', 'median', 'single',
                       'ward', 'weighted']:
        raise ValueError("Unknown linkage criterion '%r'." % linkage)

    if not isinstance(metric, six.string_types):
        raise TypeError("Please specify the distance metric as a string.")

    N = len(data)
    if k > N:
        raise ValueError("Number of clusters 'k' is higher than the number"
                         " of input samples.")

    if metric_args is None:
        metric_args = {}

    if linkage in ['centroid', 'median', 'ward']:
        if metric != 'euclidean':
            raise ValueError("Linkage '{}' requires the distance metric to be"
                             " 'euclidean'.".format(linkage))
        Z = sch.linkage(data, method=linkage)
    else:
        # compute pairwise distances
        D = metrics.pdist(data, metric=metric, **metric_args)
        # build linkage
        Z = sch.linkage(D, method=linkage)

    if k < 0:
        k = 0

    # extract clusters
    if k == 0:
        # life-time criterion
        labels = _life_time(Z, N)
    else:
        labels = sch.fcluster(Z, k, 'maxclust')

    # get cluster indices
    clusters = _extract_clusters(labels)

    return utils.ReturnTuple((clusters,), ('clusters',))
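# A minimal usage sketch for hierarchical() above (toy data; leaving k at 0
# would select the life-time criterion instead of a fixed cluster count).
# Key access on the result assumes biosppy's ReturnTuple interface.
import numpy as np

toy = np.random.rand(20, 3)
out = hierarchical(data=toy, k=2, linkage='average', metric='euclidean')
print(out['clusters'])  # dict of sample indices per cluster; -1 for outliers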
modelo = AgglomerativeClustering(n_clusters=17)
grupos = modelo.fit_predict(generos_escalados)
grupos

tsne = TSNE()
visualizacao = tsne.fit_transform(generos_escalados)
visualizacao

sns.scatterplot(x=visualizacao[:, 0], y=visualizacao[:, 1], hue=grupos)

from scipy.cluster.hierarchy import dendrogram, linkage

modelo = KMeans(n_clusters=17)
modelo.fit(generos_escalados)
grupos = pd.DataFrame(modelo.cluster_centers_, columns=generos.columns)
grupos.transpose().plot.bar(subplots=True,
                            figsize=(25, 50),
                            sharex=False,
                            rot=0)

matriz_de_distancia = linkage(grupos)
matriz_de_distancia

dendrograma = dendrogram(matriz_de_distancia)
import pylab
from matplotlib import pyplot as plt
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier
import cv2
import numpy as np

NUM_CLUST = 6

distSqMat = np.loadtxt('/home/brinstongonsalves/Documents/PyCharm/CV/mat.txt')
link_mat = hier.linkage(distSqMat, 'single')

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram : Full')
hier.dendrogram(link_mat)
plt.savefig("dendrogram.jpg")
plt.clf()

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram : Truncated')
hier.dendrogram(link_mat, truncate_mode='lastp', p=NUM_CLUST)
plt.savefig("dendrogram1.jpg")
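# The 'lastp' truncation above keeps only the last p merges and collapses
# everything below into p leaves; show_contracted marks the collapsed
# subtrees. A hedged sketch on toy data:
import numpy as np
import scipy.cluster.hierarchy as hier

toy = np.random.rand(30, 4)
Z = hier.linkage(toy, 'single')
hier.dendrogram(Z, truncate_mode='lastp', p=6, show_contracted=True)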
def make_figure(df, pa):
    """Generates figure.

    Args:
        df (pandas.core.frame.DataFrame): Pandas DataFrame containing the input data.
        pa (dict): A dictionary of the style { "argument":"value"} as outputted by `figure_defaults`.

    Returns:
        A Plotly figure.
        A Pandas DataFrame with column clusters.
        A Pandas DataFrame with row clusters.
        A Pandas DataFrame as displayed in the figure.
    """
    # fig = go.Figure()
    # fig.update_layout(width=pa_["fig_width"], height=pa_["fig_height"])  # autosize=False,

    tmp = df.copy()
    tmp.index = tmp[pa["xvals"]].tolist()
    tmp = tmp[pa["yvals"]]

    if pa["add_constant"] != "":
        tmp = tmp + float(pa["add_constant"])

    if pa["log_transform_value"] == "log2":
        tmp = np.log2(tmp)
    elif pa["log_transform_value"] == "log10":
        tmp = np.log10(tmp)

    pa_ = {}
    checkboxes = ["row_cluster", "col_cluster", "xticklabels", "yticklabels",
                  "row_dendogram_dist", "col_dendogram_dist",
                  "reverse_color_scale"]  # "robust"
    for c in checkboxes:
        if (pa[c] == "on") | (pa[c] == ".on"):
            pa_[c] = True
        else:
            pa_[c] = False

    for v in ["col_color_threshold", "row_color_threshold", "upper_value",
              "center_value", "lower_value"]:
        if pa[v] == "":
            pa_[v] = None
        else:
            pa_[v] = float(pa[v])

    if pa_["reverse_color_scale"]:
        pa_["colorscale_value"] = pa["colorscale_value"] + "_r"
    else:
        pa_["colorscale_value"] = pa["colorscale_value"]

    selfdefined_cmap = True
    for value in ["lower_value", "center_value", "upper_value",
                  "lower_color", "center_color", "upper_color"]:
        if pa[value] == "":
            selfdefined_cmap = False
            break
    if selfdefined_cmap:
        range_diff = float(pa["upper_value"]) - float(pa["lower_value"])
        center = float(pa["center_value"]) - float(pa["lower_value"])
        center = center / range_diff
        color_continuous_scale = [[0, pa["lower_color"]],
                                  [center, pa["center_color"]],
                                  [1, pa["upper_color"]]]
        pa_["colorscale_value"] = color_continuous_scale

    if pa["zscore_value"] == "row":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=1, ddof=1),
                           columns=tmp.columns.tolist(),
                           index=tmp.index.tolist())
    elif pa["zscore_value"] == "columns":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=0, ddof=1),
                           columns=tmp.columns.tolist(),
                           index=tmp.index.tolist())

    if len(pa["findrow"]) > 0:
        rows_to_find = pa["findrow"]
        possible_rows = tmp.index.tolist()
        not_found = [s for s in rows_to_find if s not in possible_rows]
        if len(not_found) > 0:
            message = ("The following rows could not be found: %s. "
                       "Please check your entries for typos."
                       % (", ".join(not_found)))
            flash(message, 'error')

        rows_to_plot = [] + rows_to_find

        if (pa["findrowup"] != "") | (pa["findrowdown"] != ""):
            d = scs.distance.pdist(tmp, metric=pa["distance_value"])
            d = squareform(d)
            d = pd.DataFrame(d, columns=tmp.index.tolist(),
                             index=tmp.index.tolist())
            d = d[rows_to_find]
            for r in rows_to_find:
                dfrow = d[[r]]
                if pa["findrowtype_value"] == "percentile":
                    row_values = dfrow[r].tolist()
                    if pa["findrowup"] != "":
                        upperc = np.percentile(row_values, float(pa["findrowup"]))
                        upperc = dfrow[dfrow[r] >= upperc]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()
                    if pa["findrowdown"] != "":
                        downperc = np.percentile(row_values, float(pa["findrowdown"]))
                        downperc = dfrow[dfrow[r] <= downperc]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()
                if pa["findrowtype_value"] == "n rows":
                    dfrow = dfrow.sort_values(by=[r], ascending=True)
                    row_values = dfrow.index.tolist()
                    if pa["findrowdown"] != "":
                        rows_to_plot = rows_to_plot + row_values[:int(pa["findrowdown"])]
                    if pa["findrowup"] != "":
                        rows_to_plot = rows_to_plot + row_values[-int(pa["findrowup"]):]
                if pa["findrowtype_value"] == "absolute":
                    if pa["findrowup"] != "":
                        upperc = dfrow[dfrow[r] >= float(pa["findrowup"])]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()
                    if pa["findrowdown"] != "":
                        downperc = dfrow[dfrow[r] <= float(pa["findrowdown"])]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()

        rows_to_plot = list(set(rows_to_plot))
        tmp = tmp[tmp.index.isin(rows_to_plot)]

    data_array = tmp.values
    data_array_ = tmp.transpose().values
    labels = tmp.columns.tolist()
    rows = tmp.index.tolist()

    # Initialize the figure by creating the upper dendrogram
    if pa_["col_cluster"]:
        fig = ff.create_dendrogram(
            data_array_, orientation='bottom', labels=labels,
            color_threshold=pa_["col_color_threshold"],
            distfun=lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),
            linkagefun=lambda x: sch.linkage(x, pa["method_value"]))
        for i in range(len(fig['data'])):
            fig['data'][i]['yaxis'] = 'y2'
        dendro_leaves_y_labels = fig['layout']['xaxis']['ticktext']
        # dendro_leaves_y = [labels.index(i) for i in dendro_leaves_y_labels]
        # for data in dendro_up['data']:
        #     fig.add_trace(data)
        if pa_["col_color_threshold"]:
            d = scs.distance.pdist(data_array_, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])  # linkagefun(d)
            max_d = pa_["col_color_threshold"]
            clusters_cols = fcluster(Z, max_d, criterion='distance')
            clusters_cols = pd.DataFrame({"col": tmp.columns.tolist(),
                                          "cluster": list(clusters_cols)})
        else:
            clusters_cols = pd.DataFrame({"col": tmp.columns.tolist()})
    else:
        fig = go.Figure()
        dendro_leaves_y_labels = tmp.columns.tolist()
    dendro_leaves_y = [labels.index(i) for i in dendro_leaves_y_labels]

    # Create the side dendrogram
    if pa_["row_cluster"]:
        dendro_side = ff.create_dendrogram(
            data_array, orientation='right', labels=rows,
            color_threshold=pa_["row_color_threshold"],
            distfun=lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),
            linkagefun=lambda x: sch.linkage(x, pa["method_value"]))
        for i in range(len(dendro_side['data'])):
            dendro_side['data'][i]['xaxis'] = 'x2'
        dendro_leaves_x_labels = dendro_side['layout']['yaxis']['ticktext']
        # dendro_leaves_x = [rows.index(i) for i in dendro_leaves_x_labels]
        if pa_["row_color_threshold"]:
            d = scs.distance.pdist(data_array, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])  # linkagefun(d)
            max_d = pa_["row_color_threshold"]
            clusters_rows = fcluster(Z, max_d, criterion='distance')
            clusters_rows = pd.DataFrame({"col": tmp.index.tolist(),
                                          "cluster": list(clusters_rows)})
        else:
            clusters_rows = pd.DataFrame({"col": tmp.index.tolist()})
        # if pa_["col_cluster"]:
        # Add the side dendrogram data to the figure
        # print(dendro_side['data'][0])
        for data in dendro_side['data']:
            fig.add_trace(data)
        # else:
        #     fig = dendro_side
    else:
        dendro_leaves_x_labels = tmp.index.tolist()
    dendro_leaves_x = [rows.index(i) for i in dendro_leaves_x_labels]

    if pa["robust"] != "":
        vals = tmp.values.flatten()
        up = np.percentile(vals, 100 - float(pa["robust"]))
        down = np.percentile(vals, float(pa["robust"]))
        tmp[tmp > up] = up
        tmp[tmp < down] = down
        data_array = tmp.values

    # Create the heatmap
    heat_data = data_array
    heat_data = heat_data[dendro_leaves_x, :]
    heat_data = heat_data[:, dendro_leaves_y]
    heatmap = [
        go.Heatmap(x=dendro_leaves_x_labels,
                   y=dendro_leaves_y_labels,
                   z=heat_data,
                   zmax=pa_["upper_value"],
                   zmid=pa_["center_value"],
                   zmin=pa_["lower_value"],
                   colorscale=pa_['colorscale_value'],
                   colorbar={
                       "title": {"text": pa["color_bar_label"],
                                 "font": {"size": float(pa["color_bar_font_size"])}},
                       "lenmode": "pixels",
                       "len": float(pa["fig_height"]) / 4,
                       "xpad": float(pa["color_bar_horizontal_padding"]),
                       "tickfont": {"size": float(pa["color_bar_ticks_font_size"])}
                   })
    ]

    if pa_["col_cluster"]:
        heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
    else:
        heatmap[0]['x'] = dendro_leaves_y_labels

    if pa_["row_cluster"]:
        heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']
    else:
        fake_vals = []
        i = 0
        for f in range(len(dendro_leaves_x_labels)):
            fake_vals.append(i)
            i += 1
        # dendro_leaves_x_labels = tuple(fake_vals)
        heatmap[0]['y'] = tuple(fake_vals)  # dendro_leaves_x_labels

    # Add the heatmap data to the figure
    # if (pa_["col_cluster"]) | (pa_["row_cluster"]):
    for data in heatmap:
        fig.add_trace(data)
    # else:
    #     fig = go.Figure(data=heatmap[0])

    # Edit the layout
    fig.update_layout({
        'width': float(pa["fig_width"]),
        'height': float(pa["fig_height"]),
        'showlegend': False,
        'hovermode': 'closest',
        "yaxis": {"mirror": "allticks",
                  'side': 'right',
                  'showticklabels': pa_["xticklabels"],
                  'ticktext': dendro_leaves_x_labels},
        "xaxis": {"mirror": "allticks",
                  'side': 'right',
                  'showticklabels': pa_["yticklabels"],
                  'ticktext': dendro_leaves_y_labels}
    })

    # Edit xaxis
    fig.update_layout(xaxis={
        'domain': [float(pa["row_dendogram_ratio"]), 1],
        'mirror': False,
        'showgrid': False,
        'showline': False,
        'zeroline': False,
        'showticklabels': pa_["yticklabels"],
        "tickfont": {"size": float(pa["yaxis_font_size"])},
        'ticks': "",
        'ticktext': dendro_leaves_y_labels})

    # Edit xaxis2
    if pa_["row_cluster"]:
        fig.update_layout(xaxis2={
            'domain': [0, float(pa["row_dendogram_ratio"])],
            'mirror': False,
            'showgrid': False,
            'showline': False,
            'zeroline': False,
            'showticklabels': pa_["row_dendogram_dist"],
            'ticks': ""})

    # Edit yaxis
    fig.update_layout(yaxis={
        'domain': [0, 1 - float(pa["col_dendogram_ratio"])],
        'mirror': False,
        'showgrid': False,
        'showline': False,
        'zeroline': False,
        'showticklabels': pa_["xticklabels"],
        "tickfont": {"size": float(pa["xaxis_font_size"])},
        'ticks': "",
        'tickvals': heatmap[0]['y'],
        'ticktext': dendro_leaves_x_labels})
    # 'tickvals': dendro_side['layout']['yaxis']['tickvals'],

    # Edit yaxis2
    if pa_["col_cluster"]:
        fig.update_layout(yaxis2={
            'domain': [1 - float(pa["col_dendogram_ratio"]), 1],
            'mirror': False,
            'showgrid': False,
            'showline': False,
            'zeroline': False,
            'showticklabels': pa_["col_dendogram_dist"],
            'ticks': ""})

    fig.update_layout(template='plotly_white')
    fig.update_layout(title={"text": pa["title"],
                             "yanchor": "top",
                             "font": {"size": float(pa["title_size_value"])}})

    cols = list(fig['layout']['xaxis']['ticktext'])
    rows = list(fig['layout']['yaxis']['ticktext'])

    df_ = pd.DataFrame({"i": range(len(rows))}, index=rows)
    df_ = df_.sort_values(by=["i"], ascending=False)
    df_ = df_.drop(["i"], axis=1)
    df_ = pd.merge(df_, tmp, how="left", left_index=True, right_index=True)
    df_ = df_[cols]

    clusters_cols_ = pd.DataFrame({"col": cols})
    if pa_["col_cluster"]:
        clusters_cols = pd.merge(clusters_cols_, clusters_cols,
                                 on=["col"], how="left")
    else:
        clusters_cols = clusters_cols_

    clusters_rows_ = pd.DataFrame({"col": df_.index.tolist()})
    if pa_["row_cluster"]:
        clusters_rows = pd.merge(clusters_rows_, clusters_rows,
                                 on=["col"], how="left")
    else:
        clusters_rows = clusters_rows_

    df_.reset_index(inplace=True, drop=False)
    cols = df_.columns.tolist()
    cols[0] = "rows"
    df_.columns = cols

    return fig, clusters_cols, clusters_rows, df_
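# A hedged usage sketch for make_figure; `df` and the style dictionary `pa`
# are whatever the surrounding app passes in (see figure_defaults), so the
# calls below are illustrative only.
fig, clusters_cols, clusters_rows, df_ = make_figure(df, pa)
fig.write_html("heatmap.html")  # Plotly figure
clusters_rows.to_csv("row_clusters.tsv", sep="\t", index=False)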
rcParams['font.sans-serif'] = ['Linux Biolinum', 'Tahoma', 'DejaVu Sans',
                               'Lucida Grande', 'Verdana']

for i, path in enumerate(paths):
    if path.name != 'tf_doc_symbol_matrix.npy':
        continue
    feature_matrix = np.load(path)
    methods = ['ward']
    for method in methods:
        # Should consider using L2 distance here for the word-to-vec model
        # rather than euclidean
        links = hierarchy.linkage(feature_matrix, method=method,
                                  optimal_ordering=True)
        fig = plt.figure(figsize=((3.33 * 2) + 0.33, 3.25), dpi=220)
        hierarchy.dendrogram(
            links,
            leaf_label_func=get_leaf_label,
            orientation='left',
            leaf_rotation=0.,  # rotates the leaf labels
            color_threshold=0.4,
            above_threshold_color='xkcd:light grey',
        )
        # plt.title('Hierarchical Clustering of MSWE Document Representations', x=0.2, y=1.04)
        plt.xlabel('Euclidean Distance Between Clusters',