def compare_clusters(args):
    # .values replaces the deprecated DataFrame.as_matrix()
    ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).values
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))
    outfile = open(args['output'], "w")
    outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tMantel_P-value\t"
                  "Cophenetic_Pearson\tCophenetic_P-value\n")
    for i in args['all']:
        fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).values
        check_symmetry(fst_df)
        mantel_coeff = 0.0
        p_value_mantel = 0.0
        cophenetic_pearson = 0.0
        p_value_cophenetic = 0.0
        n = 0
        try:
            mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df)
            linkage_fst = linkage(fst_df, 'average')
            c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
            cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst)
        except Exception as e:
            print("Error : %s" % str(e))
            mantel_coeff = "Failed"
            p_value_mantel = "Failed"
            cophenetic_pearson = "Failed"
            p_value_cophenetic = "Failed"
        outfile.write(i + "\t" + str(mantel_coeff) + "\t" + str(p_value_mantel) + "\t" +
                      str(cophenetic_pearson) + "\t" + str(p_value_cophenetic) + "\n")
    outfile.close()
def measure_cluster_accuracy(hier, data):
    """Generate a score for hierarchical clusters.

    The closer the value is to 1, the better the clustering preserves
    the original pairwise distances.
    """
    score, coph_dists = cophenet(hier, pdist(data))
    print('\nCophenet distance ==>', round(score, 2))
def cengci(data):
    """Hierarchical ('cengci') clustering with Ward linkage."""
    X = data
    distMatrix = pdist(X)
    Z = linkage(X, 'ward')
    c, coph_dists = cophenet(Z, pdist(X))
    print(c)
    dendrogram(Z)
def Hierarchical_cluster_part(csvFile):
    df = pd.read_csv(csvFile)
    data = df.values  # .values replaces the deprecated as_matrix()
    data = data[:, 1:]

    # generate the linkage matrix
    Z = linkage(data, 'ward')
    c, coph_dists = cophenet(Z, pdist(data))
    print(c)

    ## Plotting a Dendrogram
    # calculate full dendrogram
    plt.figure(figsize=(140, 60))
    plt.title('Hierarchical Clustering Dendrogram (part)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=2.,  # font size for the x axis labels
    )
    # fancy_dendrogram(
    #     Z,
    #     truncate_mode='lastp',  # show only the last p merged clusters
    #     p=18,                   # show only the last p merged clusters
    #     leaf_rotation=90.,      # rotates the x axis labels
    #     leaf_font_size=8.,      # font size for the x axis labels
    #     show_leaf_counts=True,  # numbers in brackets are counts
    #     show_contracted=True,   # to get a distribution impression in truncated branches
    #     max_d=6000,             # max_d as in max_distance
    # )
    plt.savefig('/Users/CeciliaLee/Dropbox/Intren/HKIA/2/Dendrogram_Tree(part).png')
    plt.show()

    return c, Z
def hierarchical_clustering(
        df: Union[pd.DataFrame, np.ndarray],
        method: str = "ward") -> Union[HierCluster, None, ValueError]:
    """Hierarchical cluster of a dataframe.

    Return clustering created using scipy from a given dataframe of
    correlations, using the HierCluster class available in prestools.classes.

    See Also:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html

    Args:
        df: input dataframe of correlations
        method: method to use to cluster the data ('ward', 'single',
            'complete', 'average', 'weighted', 'centroid', 'median')
            (default: 'ward')

    Returns:
        cl: instance of prestools.classes.HierCluster()
    """
    if method not in ["ward", "single", "complete", "average", "weighted",
                      "centroid", "median"]:
        return ValueError("Method not valid!")

    if df.shape == (0, 0) or df.shape == (1, 1):
        return

    cl = HierCluster()
    cl.linkage = sch.linkage(df, method=method)
    cl.pair_dist = ssd.pdist(df)
    cl.coph_dist, cl.coph_matr = sch.cophenet(cl.linkage, cl.pair_dist)
    return cl
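# A minimal usage sketch for hierarchical_clustering() above (hypothetical input:
# a random symmetric "correlation" dataframe; assumes prestools and its HierCluster
# class are importable as the snippet expects).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
raw = rng.uniform(-1, 1, size=(8, 8))
corr_df = pd.DataFrame((raw + raw.T) / 2)  # symmetric stand-in for a correlation matrix
cl = hierarchical_clustering(corr_df, method="average")
if isinstance(cl, HierCluster):
    print("cophenetic correlation coefficient:", cl.coph_dist)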
def _get_doc_clusters(self, paint=False): start_time = time.time() self.logger.info("start make doc cluster...") full_d2v = np.empty(shape=(len(self.lda_d2v), self.topic_conf['num_topics'])) for i, v in enumerate(self.lda_d2v): v = matutils.unitvec(matutils.sparse2full( v, self.topic_conf['num_topics']), norm='l1') full_d2v[i] = v dist_matrix = sch.distance.pdist(full_d2v, 'euclidean') link_matrix = sch.linkage(dist_matrix, method='average') cophenet, cophenet_dist = sch.cophenet(link_matrix, dist_matrix) self.logger.info("doc cluster cophenet is [%s]" % cophenet) self.num_doc_clusters = len( self.lda_d2v) // self.cluster_conf['num_clusters_factor'] self.logger.info("doc cluster number is [%d]" % self.num_doc_clusters) sch_d2c = None if self.num_doc_clusters < 2: self.logger.error("too small doc cluster number") else: sch_d2c = sch.fcluster(link_matrix, t=self.num_doc_clusters, criterion='maxclust') with open(self.d2c_file, "w") as fo: fo.write("\n".join(map(str, sch_d2c))) if paint: self._paint(full_d2v, sch_d2c, link_matrix) self.logger.info("end make doc cluster cost %ds" % (time.time() - start_time)) return sch_d2c
def create_linkage(vecs, metric="cosine", order=True):
    link = linkfun(vecs, metric, order)
    c, coph_dists = cophenet(link, pdist(vecs, metric))
    print("Cophenet Distance between linkage and original vecs: " + str(c))
    return link
def test_linkage_cophenet_tdist_Z(self):
    # Tests cophenet(Z) on tdist data set.
    expectedM = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295,
                          295, 138, 219, 295, 295])
    Z = hierarchy_test_data.linkage_ytdist_single
    M = cophenet(Z)
    assert_allclose(M, expectedM, atol=1e-10)
def copheneticCorrelationCoeff(self):
    from scipy.cluster.hierarchy import cophenet
    from scipy.spatial.distance import pdist
    coeff, coph_dists = cophenet(self.Z, pdist(self.X))
    return coeff
def dendrogram(self, X, metric='Euclidean', linkage='ward', x_label='Patterns'):
    """Generate hierarchical dendrogram.

    Keyword arguments:
    metric -- distance metric; default value = 'Euclidean'
    linkage -- default value = 'ward'
    """
    X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
    Z = sch.linkage(X, linkage)
    # Cophenetic correlation coefficient of the clustering: correlates the actual
    # pairwise distances of all samples to those implied by the hierarchical clustering.
    # The closer the value is to 1, the better the clustering preserves the original distances.
    c, coph_dists = sch.cophenet(Z, pdist(X, metric))
    print('Cophenetic correlation coefficient of clustering (the closer to 1, the better):', c)

    # calculate full dendrogram
    fig = plt.figure(figsize=(20, 10))
    ax = fig.add_subplot(111)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel(x_label)
    plt.ylabel(metric + ' Distance')
    sch.dendrogram(
        Z,
        leaf_rotation=90,   # rotates the x axis labels
        leaf_font_size=25,  # font size for the x axis labels
        labels=self.labels,
    )
    ax.set_ylim(bottom=-0.5)
    ax.tick_params(labelsize=25)
def cophenetic(M):
    """Calculate the cophenetic correlation coefficient to assess the quality of clustering."""
    Z = linkage(M, method='average')
    c, cophe_dist = cophenet(Z, pdist(M))
    return c
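# Hedged usage sketch for cophenetic() above: score a small synthetic data set
# (names and data are illustrative only).
import numpy as np

rng = np.random.default_rng(0)
M = rng.normal(size=(30, 4))  # 30 observations, 4 features
print("cophenetic correlation coefficient:", cophenetic(M))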
def _hierarchical_cluster_consensus_matrix(consensus_matrix, force_diagonal=True, method='ward'): """ Hierarchical cluster consensus_matrix and compute cophenetic correlation coefficient. Convert consensus_matrix into distance matrix. Hierarchical cluster the distance matrix. And compute the cophenetic correlation coefficient. :param consensus_matrix: DataFrame; :param force_diagonal: bool; :param method: str; method parameter for scipy.cluster.hierarchy.linkage :return: ndarray float; linkage (Z) and cophenetic correlation coefficient """ # Convert consensus matrix into distance matrix distance_matrix = 1 - consensus_matrix if force_diagonal: for i in range(distance_matrix.shape[0]): distance_matrix.iloc[i, i] = 0 # Cluster consensus matrix to assign the final label hierarchical_clustering = linkage(consensus_matrix, method=method) # Compute cophenetic correlation coefficient cophenetic_correlation_coefficient = pearsonr( pdist(distance_matrix), cophenet(hierarchical_clustering))[0] return hierarchical_clustering, cophenetic_correlation_coefficient
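# Hedged usage sketch for _hierarchical_cluster_consensus_matrix() above, built on a
# synthetic symmetric consensus matrix (all names here are illustrative only).
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
raw = rng.random((10, 10))
consensus = pd.DataFrame((raw + raw.T) / 2)  # symmetric values in [0, 1]
Z, ccc = _hierarchical_cluster_consensus_matrix(consensus)
print("cophenetic correlation coefficient:", ccc)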
def get_zx(start_time, method="single", fname="", Zf=False, **kwas): ''' :param start_time: int indicating the earliest query the window should include :param method: the linkage method to be used :param fname: string to be appended to end of plot file name :param Zf: boolean -> if True, will load from file (see code for file name). NOTE: it will save the most recent version you calculated. Make sure the right version of the file exists before setting Zf to true :param **kwas: keyword arguments for vv.get_svl() :return: linkage, dendrogram's output, svl computes and plots dendrogram with respect to distance between clients ''' if Zf is False: kwas['start_time'] = start_time X, fmt, _, ccache = vv.get_svl(**kwas) logger.warning("svl len: "+str(len(X))) dm = np.zeros((len(X) * (len(X) - 1)) // 2, dtype=np.double) k = 0 for i in xrange(0, len(X)-1): for j in xrange(i + 1, len(X)): dm[k] = 1.0 - ccache[X[i]][X[j]] k = k + 1 ccache.dump() Z = linkage(dm, method) df.pickleout(plotsdir+'pickles/'+'Z_'+method+fname+'.pickle', (Z, dm, X)) logger.warning('dumped Z to ' \ +plotsdir+'pickles/'+'Z_'+method+fname+'.pickle') else: Z, dm, X = df.picklein(plotsdir+'pickles/'+'Z_'+method+fname+'.pickle') logger.warning('loaded Z from '+plotsdir+'pickles/'+'Z_'+method+fname+'.pickle') c, coph_dists = cophenet(Z, dm) return Z, X
def computeLinkage( self, printDendogram = False ): # generate two clusters: a with 100 points, b with 50: #np.random.seed(4711) # for repeatability of this tutorial #a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,]) #b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,]) #X = np.concatenate((a, b),) self.X = array( self.buildingAverages.values() ) #print X # 150 samples with 2 dimensions #plt.scatter(X[:,0], X[:,1]) #plt.show() # generate the linkage matrix self.Z = linkage(self.X, 'ward') c, coph_dists = cophenet(self.Z, pdist(self.X)) if (printDendogram): # calculate full dendrogram plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram (truncated)') plt.xlabel('Dendogram of Dartmouth campus buildings clusters') plt.ylabel('distance') dendrogram( self.Z, #truncate_mode='lastp', # show only the last p merged clusters #p=20, # show only the last p merged clusters show_leaf_counts=True, # otherwise numbers in brackets are counts leaf_rotation=90., leaf_font_size=12., show_contracted=True, # to get a distribution impression in truncated branches ) plt.show() return self.Z
def get_optimal_hc_params(mouse_day):
    """
    Returns a list of 2: [method, dist]
    method: {'ward', 'average', 'complete'}
    dist: {'cityblock', 'euclidean', 'chebychev'}

    Parameters
    ----------
    mouse_day: a 170 * M numpy array,
        column 0 : strain, column 1: mouse,
        other columns corresponding to feature avg/std of a mouse over 16 days

    Returns
    -------
    method_distance: list
        [method, dist]
    """
    methods = ['ward', 'average', 'complete']
    dists = ['cityblock', 'euclidean', 'chebychev']

    method_dists = [(methods[i], dists[j]) for i in range(len(methods))
                    for j in range(len(dists))]
    method_dists = [(method, dist) for method, dist in method_dists
                    if method != 'ward' or dist == 'euclidean']

    cs = []
    for method, dist in method_dists:
        Z = linkage(mouse_day[:, 2:], method=method, metric=dist)
        c, coph_dists = cophenet(Z, pdist(mouse_day[:, 2:]))
        cs.append(c)

    # determine the distance method
    method, dist = method_dists[np.argmax(cs)]
    return [method, dist]
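# Hypothetical call to get_optimal_hc_params() above: the first two columns stand
# in for strain/mouse ids, the remaining ones for features, as in the docstring.
import numpy as np

rng = np.random.default_rng(0)
mouse_day = np.hstack([np.zeros((170, 2)), rng.random((170, 10))])
method, dist = get_optimal_hc_params(mouse_day)
print("best (method, metric) by cophenetic correlation:", method, dist)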
def main(): # fetch distance matrix from specified input file distMatFile = sys.argv[1] nameList,Dij_sq,N=fetchDistMat(distMatFile) # in scipy most routines operate on 'condensed' # distance matrices, i.e. upper triagonal matrices # the function square contained in the scipy.spatial # submodule might be used in order to switch from # full square to condensed matrices and vice versa Dij_cd = ssd.squareform(Dij_sq) # hierarchical clustering where the distance between # two coordinates is the distance of the cluster # averages # cluster Result = 'top down view' of the hierarchical # clustering clusterResult = sch.linkage(Dij_cd, method='average') # returns cophenetic distances # corr = cophenetic correlation # Cij_cd = condensed cophenetic distance matrix corr,Cij_cd = sch.cophenet(clusterResult,Dij_cd) Cij_sq = ssd.squareform(Cij_cd) # print dendrogram on top of cophenetic distance # matrix to standard outstream droPyt_distMat_dendrogram_sciPy(Cij_sq,clusterResult,N)
def bestCOCLUSTER(df): df = df.T # from scipy.cluster.hierarchy import distance from scipy.spatial import distance linkmethod = [ 'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward' ] paraDF = pd.DataFrame(columns=['method', 'CCC'], index=linkmethod) paraDF.loc[:, 'method'] = linkmethod for iter_m in linkmethod: Y = distance.pdist(np.asarray(df)) print(Y.shape) Z = hierarchy.linkage(Y, method=iter_m) c, coph_dists = hierarchy.cophenet(Z, Y) paraDF.loc[iter_m, 'CCC'] = c paraDF.sort_values(by='CCC', ascending=False, inplace=True) # print(paraDF) row_linkage = hierarchy.linkage(distance.pdist(np.asarray(df)), method=paraDF.iloc[0, 0]) col_linkage = hierarchy.linkage(distance.pdist(np.asarray(df).T), method=paraDF.iloc[0, 0]) # print(paraDF.iloc[0,0]) sns.clustermap(df, row_linkage=row_linkage, col_linkage=col_linkage, figsize=(13, 13)) plt.show() return hierarchy.linkage(distance.pdist(np.asarray(df)))
def make_dendogram(Amatrix,formation_stats,t=80,method='ward',dendogram=True): # calculate dendogram from matrix of distances between formation observations # returns a list that indicates the cluster for each formation observation based on the horizontal distance threshold 't' d = sch.distance.squareform(Amatrix) L = sch.linkage( d, method=method) c = sch.cophenet(L) rho_p = stats.pearsonr(c,d)[0] rho_s = stats.spearmanr(c,d).correlation print ("Cophenetic distance = %1.2f (spearman), %1.2f (pearson)" % (rho_s,rho_p)) if dendogram: fig,ax = plt.subplots(figsize=(25, 10)) #dn = sch.dendrogram(L) dn = fancy_dendrogram(L,truncate_mode='lastp',p=200,leaf_rotation=90.,leaf_font_size=12.,show_contracted=True,annotate_above=10) # group clusters fcl = sch.fcluster(L,t=t,criterion='distance',depth=2) # now map formations to clusters ctypes = [] tmat = ['A','D','A','D'] count = 0 for f in formation_stats: #teams = (f[0][0:3],f[0][3:],f[0][3:],f[0][0:3]) teams = ('H','A','A','H') for i in [1,2,3,4]: for j in range(f[i]): ctypes.append( (count,f[0],teams[i-1],tmat[i-1], fcl[count]) ) count += 1 ctypes = sorted(ctypes, key = lambda x: x[4] ) return ctypes
def calculate_cophenetic_correlation(connmat):
    Y = 1 - connmat
    Z = linkage(squareform(Y), method='average')
    c, d = cophenet(Z, squareform(Y))
    # print(c)
    # print(d)
    return (c, d)
def get_cophenetic_scipy(A, k, n_iter, alg, start): """ Returns the cophenetic correlation coefficient for NMF (specified by alg) with k metagenes A : data-set to decompose k (int): number of metagenes n_iter (int): number of different decompositionn to average alg (string) : Which variant of SNMF to perform. options are: 'base' : (simultaneous NMF) 'sorth_W' : (simultaneous NMF with semi-orthogonal W) 'sorth_H : (simultaneous NMF with semi-orthogonal H) 'norm_sorth_W' : (simultaneous NMF with semi-orthogonal W where columns of W normalized every iteration) 'norm_sorth_H' : (simultaneous NMF with semi-orthogonal H where rows of H normalized every iteration) 'aff_sorth_W' : (simultaneous affine NMF with semi-orthogonal W where columns of W normalized every iteration) 'aff_sorth_H' : (simultaneous affine NMF with semi-orthogonal H where rows of H normalized every iteration) start (string): How to initialize matrices. options are: 'rand' : random initialization 'sorth_W' : semi-orthogonal W 'sorth_H' : semi-orthogonal H returns (float): cophenetic correlation coefficient for simultaeneous NMF with k metagenes """ Cb = get_avg_con_mats(A, k, n_iter, alg,start) lmat = linkage(Cb, method='average') return cophenet(lmat, pdist(Cb))[0]
def agglomerative(embeds,names,viz=True): l = linkage(embeds, method='complete', metric='seuclidean') if viz: plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.ylabel('word') plt.xlabel('distance') dendrogram( l, leaf_rotation=90., # rotates the x axis labels leaf_font_size=0., # font size for the x axis labels orientation='top', ) plt.show() minimal_dendrogram( l, truncate_mode='lastp', p=12, leaf_rotation=90., leaf_font_size=12., show_contracted=True, annotate_above=10, city='Sydney' ) plt.show() corr, coph_dists = cophenet(l, pdist(embeds)) print('\nCophenetic correlation:', corr,'\n') return l
def clus_agglome(dat, meth, order): # Generate the linkage matrix using the (Ward, ) algorithm Z = linkage(dat['x'].values, method=meth) #'ward', 'complete', single' # Generate the dendrogram (and save) #plt.ioff() # Turn interactive plotting off f = plt.figure(figsize=(12, 5)) plt.title('Hierarchical Clustering Dendrogram with link=' + meth) plt.ylabel('Distance in the space four dimensions') dendrogram( Z, leaf_rotation=90., # rotates the x axis labels leaf_font_size=8., ) # font size for the x axis labels plt.savefig('dendogram_' + meth + '.png') plt.show() plt.close(f) # ============ # Create cluster objects clus_obj = AgglomerativeClustering(n_clusters=3, linkage=meth) # Assign the elements to groups group = clus_obj.fit_predict(dat['x']) # Evaluate the success ratio of the clustering d = pd.crosstab(dat['vari'].variety, group, margins=True, margins_name="Total") d = d.reindex(order) truth = sum(np.diag(d)) success = 100 * truth / (np.shape(dat['x'])[0]) # Check the Cophenetic Correlation Coefficient to assess quality of clusters: c, coph_dists = cophenet(Z, pdist(dat['x'])) # Let's plot our clusters plt.figure() plt.subplot(121) plt.scatter(dat['x'].s_length, dat['x'].s_width, s=10, c=vari.variety_num) plt.title("Real groups") plt.xticks(()) plt.yticks(()) plt.subplot(122) plt.scatter(dat['x'].s_length, dat['x'].s_width, s=10, c=group) # predicted plt.title('Predicted groups ' + meth) plt.xticks(()) plt.yticks(()) plt.show() resul = dict() resul['accuracy'] = success resul['cophe'] = c return resul
def test_linkage_cophenet_tdist_Z_Y(self):
    # Tests cophenet(Z, Y) on tdist data set.
    Z = hierarchy_test_data.linkage_ytdist_single
    (c, M) = cophenet(Z, hierarchy_test_data.ytdist)
    expectedM = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295,
                          295, 138, 219, 295, 295])
    expectedc = 0.639931296433393415057366837573
    assert_allclose(c, expectedc, atol=1e-10)
    assert_allclose(M, expectedM, atol=1e-10)
def coph_cor(A, idx=None):
    avec = np.array([A[i, j] for i in range(A.shape[0] - 1)
                     for j in range(i + 1, A.shape[1])])
    Y = 1 - avec
    Z = linkage(Y, method='average')
    return cophenet(Z, Y)[0]
def nmf_sigs(x, k_start, k_end, trial_lower, trial_upper, trial_tol, N_iter, dist_metric, hierarchy_method): Z1 = x.group_cluster model_d = {} for k in range(k_start, k_end + 1): print('k = {}'.format(k)) num_trials = 0 corr_avg = [] corr_std = 0.0 current_std = 0.0 prev_std = 0.0 std_diff = [] error_avg = [] while num_trials <= trial_lower or np.mean( std_diff) > trial_tol and num_trials < trial_upper: prev_std = current_std model = JNMF_model(x.storage) model.init_wh(k=k) for j in range(N_iter): model.mult_update() error_avg.append(model.error()) Z2 = linkage(minmax_scale(pdist(model.w, dist_metric)), method=hierarchy_method) corr = cophenet_corr(cophenet(Z1), cophenet(Z2)) corr_avg.append(corr) current_std = np.std(corr_avg) std_diff.append(abs(prev_std - current_std)) if len(std_diff) > trial_lower: std_diff = std_diff[1:] num_trials += 1 try: model_d[k].append((corr, model.error(), model)) except: model_d[k] = [(corr, model.error(), model)] print('num trials = {}'.format(num_trials)) print('avg corr = {}'.format(np.mean(corr_avg))) print('avg error = {}'.format(np.mean(error_avg))) print() return model_d
def visualize(filepath, ceiling=1000, ward=None): """Render dendrograms of rhyme clustering Parameters: filepath (str): path to XML file with poem, required ceiling (int): maximum number of stanzas to return (useful for sampling long poems), defaults to high value ward (boolean): show Ward dendrogram separately (improves legibility of long stanzas), defaults to None Return: No return; prints text and renders dendrograms directly """ df = explore(filepath, ceiling, ward) stanzas = df.groupby(level=[0, 1]) i = 0 for id, lines in stanzas: if i < 11: print( pd.concat([ lines["Text"].str.replace(r"<[^>]+?>", ""), lines[["RhymeWord", "RhymeZone"]] ], axis=1)) # diagnostic data = lines.copy().filter( regex=r"^token\d_") # only one-hot features labelList = list(range(1, len(lines) + 1)) # labels are line numbers within stanza data.loc[:, "LineNo"] = [2 * n / len(labelList) for n in labelList ] # scale to avoid tyranny of proximity complete = linkage(data, method="complete") complete_c, complete_coph_dist = cophenet(complete, pdist(data)) ward = linkage(data, method="ward") ward_c, ward_coph_dists = cophenet(ward, pdist(data)) plt.figure(figsize=(12, 4)) plt.subplot(1, 2, 1) plt.title("Complete: " + str(complete_c)) dendrogram(complete, labels=labelList) plt.subplot(1, 2, 2) plt.title("Ward: " + str(ward_c)) dendrogram(ward, labels=labelList) i += 1 plt.show()
def FormCluster(X):
    Z = linkage(X, 'single')
    c1, coph_dists = cophenet(Z, pdist(X))
    cl1.append(c1)
    Z = linkage(X, 'complete')
    c2, coph_dists = cophenet(Z, pdist(X))
    cl2.append(c2)
    Z = linkage(X, 'average')
    c3, coph_dists = cophenet(Z, pdist(X))
    cl3.append(c3)
    Z = linkage(X, 'weighted')
    c4, coph_dists = cophenet(Z, pdist(X))
    cl4.append(c4)
    Z = linkage(X, 'centroid')
    c5, coph_dists = cophenet(Z, pdist(X))
    cl5.append(c5)
    Z = linkage(X, 'median')
    c6, coph_dists = cophenet(Z, pdist(X))
    cl6.append(c6)
    Z = linkage(X, 'ward')
    c7, coph_dists = cophenet(Z, pdist(X))
    cl7.append(c7)
def get_linkage(id):
    batch = configs[16 * id:16 * (id + 1) - 1]
    X = [x[1:] for x in batch]
    labels = [x[0] for x in batch]
    Z = linkage(X, 'average')
    c, coph_dists = cophenet(Z, pdist(X))
    print(c)
    return Z, X, labels
def plot_dendrogram(self):
    # Get linkage matrix
    Z = linkage(self.neurons.T, "ward")
    c, coph_dists = cophenet(Z, pdist(self.neurons.T))
    plt.figure()
    dendrogram(Z)
    print("Plotted dendrogram with cophenetic distance of {:.2f}".format(c))
    plt.show(block=False)
def plot_clustered_heatmap(df, genes_list, cancer, output_path, scale='binary'): # Build nxm matrix (n samples, m genes) X = df[genes_list].as_matrix().transpose() if scale == 'binary': Z = linkage(X, method='complete', metric='hamming') colorscale = [[0, "rgb(111, 168, 220)"], [1, "rgb(5, 10, 172)"]] colorbar = {'tick0': 0,'dtick': 1} elif scale == 'logarithmic': Z = linkage(X, method='ward') X_max = X.max() colorscale = [[0, 'rgb(250, 250, 250)'], [1./X_max, 'rgb(200, 200, 200)'], [5./X_max, 'rgb(150, 150, 200)'], [20./X_max, 'rgb(100, 100, 200)'], [100./X_max, 'rgb(50, 50, 200)'], [1., 'rgb(0, 0, 200)']] colorbar = {'tick0': 0, 'tickmode': 'array', 'tickvals': [0, 1, 5, 20, 100, X_max]} c, coph_dists = cophenet(Z, pdist(X)) print "Cophenetic Correlation Coefficient:", c #layout = go.Layout(yaxis=dict(title='%s germline mutations (ordered by samples somatic mutation load)'% cancer, zeroline=False)) # fig = pylab.figure(figsize=(8,8)) # ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) # ax1.set_xticks([]) # ax1.set_yticks([]) # axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) den = dendrogram(Z, orientation='left') idx = den['leaves'] X = X[idx,:] print "X shape:", X.shape genes_ordered = [genes_list[i] for i in idx] logger.info("ordered genes: %s", str(genes_ordered)) # im = axmatrix.matshow(X, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu) # axmatrix.set_xticks([]) # axmatrix.set_yticks([]) # # Plot colorbar. # axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) # pylab.colorbar(im, cax=axcolor) # fig.savefig(output_path) # Plotting the heatmap (without the hirarchy) heatmap_trace = go.Heatmap(z=X.tolist(), x=df.patient_id, y=genes_ordered, showscale=True, colorscale=colorscale, colorbar=colorbar) mutation_load_trace = go.Bar(x=df.patient_id, y=df.somatic_mutations_count/30.0) fig = tls.make_subplots(rows=29, cols=1, specs=[[{'rowspan':5, 'colspan' : 1}]] + [[None]] * 4 + [[{'rowspan' : 24, 'colspan' : 1}]] + [[None]] * 23) fig.append_trace(mutation_load_trace, 1, 1) fig.append_trace(heatmap_trace, 6, 1) fig['layout']['xaxis1'].update(showticklabels = False) fig['layout']['xaxis1'].update(zeroline = False, showgrid=False) fig['layout']['yaxis1'].update(zeroline = False, showgrid = False, tickfont=dict(family='Arial', size=4)) fig['layout']['xaxis2'].update(showticklabels = False) fig['layout']['xaxis2'].update(zeroline = False, showgrid=False) fig['layout']['yaxis2'].update(zeroline = False, showgrid = False, tickfont=dict(family='Arial', size=4)) plot(fig, auto_open=False, filename="%s_%s_heatmap_clustered.html" % (output_path, cancer))
def hycluster(X, link, metr, datatype): # generate the linkage matrix Z = linkage(X, link) cuttree = cut_tree(Z, n_clusters=[2, 10]) #print('cut tree shape', cuttree.shape) #print('Full cuttree', cuttree) global clus2, clus10 for i in cuttree: clus2.append(i[0]) clus10.append(i[1]) c, coph_dists = cophenet(Z, X) print('Cophenet:', metr, c) titl = 'Hierarchical Clustering ' + datatype + ',' + link + ',' + metr # calculate full dendrogram #plt.figure(figsize=(15, 8)) plt.figure() def fancy_dendrogram(*args, **kwargs): max_d = kwargs.pop('max_d', None) if max_d and 'color_threshold' not in kwargs: kwargs['color_threshold'] = max_d annotate_above = kwargs.pop('annotate_above', 0) ptitle = kwargs.pop('plttitle', 0) ddata = dendrogram(*args, **kwargs) if not kwargs.get('no_plot', False): plt.title(ptitle) plt.xlabel('sample index or (cluster size)') plt.ylabel('distance') for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']): x = 0.5 * sum(i[1:3]) y = d[1] if y > annotate_above: plt.plot(x, y, 'o', c=c) plt.annotate("%.3g" % y, (x, y), xytext=(0, -5), textcoords='offset points', va='top', ha='center') if max_d: plt.axhline(y=max_d, c='k') return ddata fancy_dendrogram( Z, #truncate_mode='lastp', #p=12, leaf_rotation=90., leaf_font_size=12., show_contracted=True, annotate_above=10, # useful in small plots so annotations don't overlap plttitle=titl)
def buildtree(featuresvector, method):
    '''Creates tree from peptide features and returns root node'''
    featuresvector = removeprevlabel(featuresvector)
    x_scaled, _ = scale(featuresvector)
    print('Building linkage matrix using {} algorithm ...'.format(method))
    linkage_matrix = linkage(x_scaled, method)
    coph, _ = cophenet(linkage_matrix, pdist(x_scaled))
    print('Cophenet parameter (values close to 1 are good): {}'.format(coph))
    return to_tree(linkage_matrix), linkage_matrix
def cal_cophenetic(C):
    """Calculate the cophenetic correlation coefficient."""
    print("=== calculate cophenetic correlation coefficient ===")
    X = C  # original data (1000 observations)
    # Z = linkage(X)
    Z = fc.linkage_vector(X)   # clustering
    orign_dists = fc.pdist(X)  # matrix of original distances between observations
    cophe_dists = cophenet(Z)  # matrix of cophenetic distances between observations
    corr_coef = np.corrcoef(orign_dists, cophe_dists)[0, 1]
    return corr_coef
def compare_clusters(args):
    # .values replaces the deprecated DataFrame.as_matrix()
    ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).values
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))
    outfile = open(args['output'], "w")
    outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tMantel_P-value\t"
                  "Cophenetic_Pearson\tCophenetic_P-value\n")
    for i in args['all']:
        fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).values
        check_symmetry(fst_df)
        mantel_coeff = 0.0
        p_value_mantel = 0.0
        cophenetic_pearson = 0.0
        p_value_cophenetic = 0.0
        n = 0
        try:
            # mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df)
            mantel_coeff, p_value_mantel, n = mantel_test(ref_df, fst_df)
            linkage_fst = linkage(fst_df, 'average')
            c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
            cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst)
        except Exception as e:
            print("Error : %s" % str(e))
            mantel_coeff = "Failed"
            p_value_mantel = "Failed"
            cophenetic_pearson = "Failed"
            p_value_cophenetic = "Failed"
        outfile.write(i + "\t" + str(mantel_coeff) + "\t" + str(p_value_mantel) + "\t" +
                      str(cophenetic_pearson) + "\t" + str(p_value_cophenetic) + "\n")
    outfile.close()
def cophenetic_best(condensedD, methods=('single', 'complete', 'average', 'weighted')):
    # Which hierarchical clustering method is best, according to the cophenetic correlation?
    # 'centroid', 'median' and 'ward' do not make sense with dice, since the distance matrix
    # needs to be Euclidean. In fact, they require the original matrix and not the distance
    # matrix (so change the API if ever considering them).
    results = {}
    for method in methods:
        Z = linkage(condensedD, method=method)
        cophenetic_correlation, _ = hierarchy.cophenet(Z, condensedD)
        results[method] = cophenetic_correlation
    results = pd.Series(results)
    return results.sort_values(ascending=False), results.idxmax()
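# Sketch of calling cophenetic_best() above on a condensed Dice distance matrix
# built from synthetic boolean data (illustrative names only).
import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
X = rng.random((20, 6)) > 0.5  # boolean feature matrix
X[:, 0] = True                 # guard against all-False rows for the Dice metric
condensedD = pdist(X, metric='dice')
ranking, best = cophenetic_best(condensedD)
print(ranking)
print("best linkage method:", best)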
def create_dendrogram(dist):
    global points
    distances = linkage(points, dist)
    c, coph_dists = cophenet(distances, pdist(points))
    plt.figure(figsize=(25, 10))
    plt.title('Dendrogram')
    plt.xlabel('Points')
    plt.ylabel('Distance')
    dend = dendrogram(distances, show_contracted=True)
    plt.show()
    dend2 = dendrogram(distances, show_contracted=True, truncate_mode='lastp', p=3)
    plt.show()
    clusters = fcluster(distances, 3, criterion='maxclust')
    return c, clusters
def evaluate_cluster_w(TopicData, LinkageMatrix, GroundTruth): # check the correlation coefficient CorrCoeff, coph_dists = cophenet(LinkageMatrix, pdist(TopicData)) ## check several cluster evaluation metrics Threshold = 2 FlatClusterNumbers = fcluster(LinkageMatrix, Threshold) #print(GroundTruth) #print(FlatClusterNumbers) ARI = metrics.adjusted_rand_score(GroundTruth, FlatClusterNumbers) Homog = metrics.homogeneity_score(GroundTruth, FlatClusterNumbers) Compl = metrics.completeness_score(GroundTruth, FlatClusterNumbers) VMeasure = metrics.v_measure_score(GroundTruth, FlatClusterNumbers) print("Evaluation metrics with threshold "+str(Threshold)) print("CorrCoeff:", CorrCoeff) print("adjustedRI:", ARI) print("Homogeneity:", Homog) print("Completeness:", Compl) print("V-Measure:", VMeasure) return CorrCoeff, ARI, Homog, Compl, VMeasure
def _make_cluster_variants(gps, samples, max_k, variants=None): res = {} # TODO: performance: don't recompute hierarchical clusterings for diff max_k # TODO: include sklearn hierarchical clustering # model = AgglomerativeClustering() # model.fit(samples) # print('clustering results:') # print('labels:') # print(model.labels_) # print('n_leaves:') # print(model.n_leaves_) # print('n_components:') # print(model.n_components_) # print('children:') # print(model.children_) # TODO: include non hierarchical variants # a difference of 0.2 in cosine similarity is allowed to merge clusters # model = AffinityPropagation() # model.fit(samples) # labels = model.labels_ # core_samples_mask = np.zeros_like(labels, dtype=bool) # core_samples_mask[model.core_sample_indices_] = True metrics = ['euclidean', 'cityblock', 'cosine'] methods = [ 'single', 'complete', 'weighted', 'average', 'centroid', 'median', 'ward', ] for scale in ['', 'scaled_']: ssamples = samples if scale: ssamples = StandardScaler().fit_transform(samples) for metric in metrics: cdist = pdist(ssamples, metric) if metric == 'cosine': # see https://github.com/scipy/scipy/issues/5208 np.clip(cdist, 0, 1, out=cdist) for method in methods: name = '%s%s_%s' % (scale, metric, method) logger.debug('computing clustering %s', name) try: if variants and name not in variants: # could skip earlier but would make code more complex continue if method in ['ward', 'centroid', 'median']: # method needs raw feature vectors in euclidean space if metric == 'euclidean': cluster_hierarchy = linkage(ssamples, method=method) else: continue elif method not in [ 'single', 'complete', 'weighted', 'average']: # method needs raw inputs, recompute: if metric == 'cosine': # see: https://github.com/scipy/scipy/issues/5208 continue cluster_hierarchy = linkage( ssamples, method=method, metric=metric) else: cluster_hierarchy = linkage(cdist, method=method) c, coph_dists = cophenet(cluster_hierarchy, cdist) res[name] = HierarchicalCluster( name, gps, samples, max_k, cluster_hierarchy, c) logger.info('clustering %s computed with c: %0.3f', name, c) except ValueError: logger.warning( 'The following exception occurred during clustering ' 'with variant %s:\nException:', name, exc_info=1, # appends exception to message ) logger.info('computed %d clustering variants', len(res)) return res
#for c, i, target_name in zip("rgb", [0, 1, 2], target_names): plt.legend() plt.title('Phoneme PCA') plt.xlabel('First Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[0], decimals = 3))+')') plt.ylabel('Second Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[1], decimals = 3))+')') fig.savefig('pca.png', bbox_inches='tight') fig.savefig('pca.pdf', bbox_inches='tight') #print phoneme #print label label = list(label) #X = np.asarray(phoneme) # generate the linkage matrix Z = sch.linkage(X, 'ward') c, coph_dists = sch.cophenet(Z, pdist(X, 'euclidean')) # c, coph_dists = sch.cophenet(Z, pdist(X)) # Cophenetic Correlation Coefficient of clustering. # This compares (correlates) the actual pairwise distances of all your samples to those implied by the hierarchical clustering. # The closer the value is to 1, the better the clustering preserves the original distances. print label, type(label[0]) print c # calculate full dendrogram fig = plt.figure(figsize=(15, 5)) ax = fig.add_subplot(111) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('International Phonetic Alphabet Phoneme') plt.ylabel('Distance') sch.dendrogram( Z,
def distance_patients_from_consensus_file( result_folder, distance_patients, ppi_data, mut_type, influence_weight, simplification, alpha, tol, keep_singletons, ngh_max, min_mutation, max_mutation, n_components, n_permutations, lambd, tol_nmf, linkage_method): consensus_directory = result_folder+'consensus_clustering/' consensus_mut_type_directory = consensus_directory + mut_type + '/' hierarchical_directory = result_folder+'hierarchical_clustering/' os.makedirs(hierarchical_directory, exist_ok=True) hierarchical_mut_type_directory = hierarchical_directory + mut_type + '/' os.makedirs(hierarchical_mut_type_directory, exist_ok=True) if lambd > 0: consensus_factorization_directory = ( consensus_mut_type_directory + 'gnmf/') hierarchical_factorization_directory = ( hierarchical_mut_type_directory + 'gnmf/') else: consensus_factorization_directory = ( consensus_mut_type_directory + 'nmf/') hierarchical_factorization_directory = ( hierarchical_mut_type_directory + 'nmf/') os.makedirs(hierarchical_factorization_directory, exist_ok=True) hierarchical_clustering_file = ( hierarchical_factorization_directory + 'hierarchical_clustering_Patients_weight={}_simp={}_alpha={}_tol={}_singletons={}_ngh={}_minMut={}_maxMut={}_comp={}_permut={}_lambd={}_tolNMF={}_method={}.mat' .format(influence_weight, simplification, alpha, tol, keep_singletons, ngh_max, min_mutation, max_mutation, n_components, n_permutations, lambd, tol_nmf, linkage_method)) existance_same_param = os.path.exists(hierarchical_clustering_file) if existance_same_param: print(' **** Same parameters file of hierarchical clustering already exists') else: # print(type(distance_patients), distance_patients.shape) # hierarchical clustering on distance matrix (here: distance_patients) Z = linkage(distance_patients, method=linkage_method) # Plot setting matplotlib.rcParams.update({'font.size': 14}) fig = plt.figure(figsize=(20, 20)) fig.suptitle( 'Hierarchical clustering\n\nPatients', fontsize=30, x=0.13, y=0.95) # Compute and plot dendrogram ax_dendro = fig.add_axes([0, 0.71, 0.6, 0.15]) P = dendrogram(Z, count_sort='ascending', no_labels=True) ax_dendro.set_xticks([]) ax_dendro.set_yticks([]) # Plot distance matrix. ax_matrix = fig.add_axes([0, 0.1, 0.6, 0.6]) idx = np.array(P['leaves']) D = distance_patients[idx, :][:, idx] im = ax_matrix.imshow(D, interpolation='nearest', cmap=cm.viridis) ax_matrix.set_xticks([]) ax_matrix.set_yticks([]) # Plot colorbar. 
ax_color = fig.add_axes([0.62, 0.1, 0.02, 0.6]) ax_color.set_xticks([]) plt.colorbar(im, cax=ax_color) # forms flat clusters from Z # given k -> maxclust clust_nb = fcluster(Z, n_components, criterion='maxclust') # cophenetic correlation distance coph_dist, coph_matrix = cophenet(Z, pdist(distance_patients)) print(' cophenetic correlation distance = ', coph_dist) ax_dendro.set_title( 'network = {}\nalpha = {}\nmutation type = {}\ninfluence weight = {}\nsimplification = {}\ncomponent number = {}\nlambda = {}\nmethod = {}\ncophenetic corr = {}\n' .format(ppi_data, alpha, mut_type, influence_weight, simplification, n_components, lambd, linkage_method, format(coph_dist, '.2f')), loc='right') plot_name = "similarity_matrix_Patients" + ( '_alpha={}_tol={}_singletons={}_ngh={}_minMut={}_maxMut={}_comp={}_permut={}_lambd={}_tolNMF={}_method={}' .format(alpha, tol, keep_singletons, ngh_max, min_mutation, max_mutation, n_components, n_permutations, lambd, tol_nmf, linkage_method)) plt.savefig('{}{}.pdf'.format(hierarchical_factorization_directory, plot_name), bbox_inches='tight') plt.savefig('{}{}.svg'.format(hierarchical_factorization_directory, plot_name), bbox_inches='tight') # start = time.time() savemat(hierarchical_clustering_file, {'Z_linkage_matrix': Z, 'dendrogram_data_dictionary': P, 'dendrogram_index': idx, 'flat_cluster_number': clust_nb, 'cophenetic_correlation_distance': coph_dist, 'cophenetic_correlation_matrix': coph_matrix}, do_compression=True)
ax.annotate(txt, (X_r[i, 0], X_r[i, 1]), horizontalalignment='center', verticalalignment='top',size = 14) #for c, i, target_name in zip("rgb", [0, 1, 2], target_names): plt.legend() plt.title('Phoneme PCA') plt.xlabel('First Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[0], decimals = 3))+')') plt.ylabel('Second Principal Component (explained variance ratio = '+str(np.around(pca.explained_variance_ratio_[1], decimals = 3))+')') #plt.show() #fig.savefig('pca.jpg', bbox_inches='tight') #fig.savefig('pca.pdf', bbox_inches='tight') # generate the linkage matrix X = X_r print '\n\nX = ', X Z = sch.linkage(X, 'ward') c, coph_dists = sch.cophenet(Z, pdist(X, 'jaccard')) # c, coph_dists = sch.cophenet(Z, pdist(X)) # Cophenetic Correlation Coefficient of clustering. # This compares (correlates) the actual pairwise distances of all your samples to those implied by the hierarchical clustering. # The closer the value is to 1, the better the clustering preserves the original distances. print label, type(label[0]) print c # calculate full dendrogram fig = plt.figure(figsize=(20, 10)) ax = fig.add_subplot(111) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('Photos') plt.ylabel('Distance') sch.dendrogram( Z,
def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels): """ Generate silhoutte score based on hierarchical clustering. Args: dendroMatrix: list, occurance of words in different files distance_metric: string, style of distance metric in the dendrogram linkage_method: string, style of linkage method in the dendrogram labels: list, file names Returns: silhouetteScore: string, containing the result of silhouette score silhouetteAnnotation: string, annotation of the silhouette score score: float, silhouette score inconsistentMax: float, upper bound of threshold to calculate silhouette score if using Inconsistent criterion maxclustMax: integer, upper bound of threshold to calculate silhouette score if using Maxclust criterion distanceMax: float, upper bound of threshold to calculate silhouette score if using Distance criterion distanceMin: float, lower bound of threshold to calculate silhouette score if using Distance criterion monocritMax: float, upper bound of threshold to calculate silhouette score if using Monocrit criterion monocritMin: float, lower bound of threshold to calculate silhouette score if using Monocrit criterion threshold: float/integer/string, threshold (t) value that users entered, equals to 'N/A' if users leave the field blank """ activeFiles = len(labels) - 1 if (activeFiles > 2): # since "number of lables should be more than 2 and less than n_samples - 1" Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric) Z = hierarchy.linkage(Y, method=linkage_method) monocrit = None # 'maxclust' range maxclustMax = len(labels) - 1 # 'incosistent' range R = hierarchy.inconsistent(Z, 2) inconsistentMax = R[-1][-1] slen = len('%.*f' % (2, inconsistentMax)) inconsistentMax = float(str(inconsistentMax)[:slen]) # 'distance' range d = hierarchy.cophenet(Z) distanceMax = d.max() slen = len('%.*f' % (2, distanceMax)) distanceMax = float(str(distanceMax)[:slen]) distanceMin = d.min() + 0.01 slen = len('%.*f' % (2, distanceMin)) distanceMin = float(str(distanceMin)[:slen]) # 'monocrit' range MR = hierarchy.maxRstat(Z, R, 0) monocritMax = MR.max() slen = len('%.*f' % (2, monocritMax)) monocritMax = float(str(monocritMax)[:slen]) monocritMin = MR.min() + 0.01 slen = len('%.*f' % (2, monocritMin)) monocritMin = float(str(monocritMin)[:slen]) threshold = request.form['threshold'] if threshold == '': threshold = str(threshold) else: threshold = float(threshold) if request.form['criterion'] == 'maxclust': criterion = 'maxclust' if (threshold == '') or (threshold > maxclustMax): threshold = len(labels) - 1 else: threshold = round(float(threshold)) elif request.form['criterion'] == 'distance': criterion = 'distance' if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin): threshold = distanceMax elif request.form['criterion'] == 'inconsistent': criterion = 'inconsistent' if (threshold == '') or (threshold > inconsistentMax): threshold = inconsistentMax elif request.form['criterion'] == 'monocrit': criterion = 'monocrit' monocrit = MR if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin): threshold = monocritMax scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit) if len(set(scoreLabel)) <= 1: # this means all the files are divided into only 1 or less cluster silhouetteScore = "Silhouette Score: invalid for only 1 cluster." 
silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster" score = 'invalid for only 1 cluster' inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A' else: score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed') score = round(score, constants.ROUND_DIGIT) inequality = '≤'.decode('utf-8') silhouetteScore = "Silhouette Score: " + str( score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)" silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar." else: silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files." silhouetteAnnotation = "" score = 'invalid for less than or equal to 2 files.' threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A' return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
# We stick stick with average-linkage # Maybe we should check how original medoid order affects the clustering print('Clustering the medoids') D = dicedist_metric(medoids_df) # And this is as cool as for spitting back a pandas dataframe condensedD = squareform(D) # Clustering time is irrelevant, report the qualities all at once cophenetic_ranking, best_method = cophenetic_best(condensedD) print('Cophenetic ranking\n%s\nbest: %s' % (cophenetic_ranking, best_method)) linkage_method = 'average' print('Linkage: %s' % linkage_method) # --- Perform the linkage calculation Z = linkage(condensedD, method=linkage_method) cophenetic_correlation, _ = hierarchy.cophenet(Z, condensedD) # --- Save clustering to a json file for web-ingestion fn = get_hierarchy_file_prefix(dataset=dataset, region=neuropil, cluster_type=cluster_type) def save_hierarchy_json(): # To keep the json small we should probably reduce the digits we save # http://stackoverflow.com/questions/1447287/format-floats-with-standard-json-module # And of course, remove spaces, and maybe, use much shorter keys... # So make a function out of this, with parameter "small", and coordinate with the js world print('Saving json') tree = hierarchy2dictionary(Z, dendrogram=False, base=1) hdict = { 'dataset': dataset,
iiTD_ordered = OrderedDict(sorted(iiTD.items())) tfidfV = TfidfVectorizer(stop_words='english',vocabulary=featureIndex,sublinear_tf=True) tfs = tfidfV.fit_transform(iiTD_ordered.values()) #---(doc#,feature#) print('TD matrix dimensions :', tfs.shape) #---SVD! svd = TruncatedSVD(n_components=MedianUniqueKeys,algorithm="arpack") #CAN ALSO TRY: some% of #of Features instead of fixed n_components OR change algo arpack/randomized svd.fit(tfs) Sigma=svd.transform(tfs) print('Reduced Dimensions of TD Matrix', Sigma.shape) #---clustering! Clustering_Order = linkage(Sigma,method='ward', metric='euclidean') #can also provide a consistency constraint by making a graph of activities linked by category values. See http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html c, coph_dists = cophenet(Clustering_Order, pdist(Sigma)) print(str(c)) #print(Clustering_Order) plt.figure(figsize=(25, 10)) plt.title('Clustering Dendrogram') plt.xlabel('ID_space') plt.ylabel('Distance') dendrogram( Clustering_Order, #truncate_mode='lastp', #p=20, #show_leaf_counts=False, leaf_rotation=90., leaf_font_size=10., #show_contracted=True, )
# load the data lang = pd.read_csv('https://raw.githubusercontent.com/generalassembly-studio/dsi-course-materials/master/curriculum/04-lessons/week-07/3.2-lesson/assets/datasets/lang.csv?token=ANUte4ku6wHT_-2xOgUxMM_08YUJ0RB6ks5XWWISwA%3D%3D') lang.head() # scatter to guess clusters plt.scatter(lang['country'], lang['english']) plt.show() # Now, let's convert our data to a matrix to pass to the clustering algorithm - the matrix makes it easier for our algorithm to compute distance: X = lang.as_matrix(columns=None) # We'll implement the actual clustering algorithm using the ward method: Z = linkage(X, 'ward') # We can calculate the cophenetic correlation coefficient to see how well our algorithm has measured the distances between the points: c, coph_dists = cophenet(Z, pdist(X)) # let's 'c' how it did c # now let's make our dendrogram plt.title('Dendrogram') plt.xlabel('Index Numbers') plt.ylabel('Distance') dendrogram( Z, leaf_rotation=90., leaf_font_size=8., ) plt.show()
#data # In[ ]: #clusterInfo = linkage(data, 'ward') # c=0.62 #clusterInfo = linkage(data, 'centroid') # c=0.89 #clusterInfo = linkage(data, 'weighted') # c=0.86 #clusterInfo = linkage(data, 'average') # c=0.91 #clusterInfo = linkage(data, 'complete') # c=0.90 #clusterInfo = linkage(data, 'single') # c=0.78 # Cophenet correlation coefficient measures # how faithfully a dendrogram preserves pairwise # distance between the original data points: (c, coph_dists) = cophenet(clusterInfo, pdist(data)) c # In[ ]: pandas.DataFrame(clusterInfo[:20],columns=['feature1', 'feature2', 'distance', 'clusterSize']) # In[ ]: # calculate full dendrogram plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('Survey Question') plt.ylabel('Distance')
print("Similarity matrix according to cos distances") print((np.matrix(cosDistsFeatures)+np.matrix(cosDistsCirc[:][:]))/2) print("Similarity matrix according to jaccard distances") print((np.matrix(jsfea)+np.matrix(jsCirc))/2) #Kmeands CLUSTER------------------------ concatted=np.concatenate((np.array(circlepeople), np.array(feats)), axis=1) for num in range(2,10): print("k=") print(num) codebook, distortion = kmeans(concatted, num) code, dist = vq(concatted, codebook) print(code) #centroids, labels = kmeans([ys2,circles,feats], 3) #Hiearchical clustering Z = linkage(concatted, 'ward') c, coph_dists = cophenet(Z, pdist(concatted)) plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('sample index') plt.ylabel('distance') dendrogram( Z, leaf_rotation=90., # rotates the x axis labels leaf_font_size=8., # font size for the x axis labels ) plt.show()
for i in range(0,len(tokens_txt)): a0.append(np.sum([Counter(tokens_txt[i])[x] for x in tokens_lsi[0]])) topic1=norm(a0) threshold=0.3 [print(topic1[i],documents[i]) for i in np.where(topic1>threshold)[0]] lsi.print_topics(1) from scipy.cluster.hierarchy import dendrogram, linkage P = linkage(matrix3, 'ward') from scipy.cluster.hierarchy import cophenet from scipy.spatial.distance import pdist corr, coph_distances = cophenet(P, pdist(matrix3)) corr plt.figure(figsize=(9,4)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('PARAGRAPH') plt.ylabel('DISTANCE') dendrogram(P, leaf_rotation=0., leaf_font_size=12.,) plt.show() model00=[] for i in range(0,len(sentences)): tokens = word_tokenize(str(sentences[i]))
distance = scipy.spatial.distance.pdist(X) single_hierarchy = scipy.cluster.hierarchy.single(distance) # max_d = 1.05 # prediction = fcluster(single_hierarchy, max_d, criterion='distance') # prediction_single_hier = prediction # Want to use the cophenetic distance matrix for each heirarchical algorithm # to: # 1: compare then against the dististance matrix # 2: compare against themselves single_cophenet = cophenet(single_hierarchy) from scipy.stats import pearsonr distance_metrics = [distance, single_cophenet] comparisons = np.zeros((len(distance_metrics), len(distance_metrics))) for i, j in itertools.product(np.arange(len(distance_metrics)), np.arange(len(distance_metrics))): comparisons[i, j] = pearsonr(distance_metrics[i], distance_metrics[j])[0] ###### # to get all through the different data types:
#Happened Correctly # In[43]: #Clustering based on Cosine Metric and Average linkage Method import matplotlib.pyplot as plt from scipy.spatial.distance import pdist, squareform from scipy.cluster.hierarchy import linkage, dendrogram from scipy.cluster.hierarchy import cophenet checkx=diabetic_patients_binary_ddup[1:] X=pdist(checkx.ix[:,0:len(diabetic_patients_binary_ddup.columns)-1],metric='cosine') Z = linkage(X,method='average') c,cd=cophenet(Z,X) c # In[44]: ##Code to make dendogram # plt.title('Hierarchical Clustering Dendrogram (truncated)') # plt.xlabel('Patient Group') # plt.ylabel(' Cosine distance') # dendrogram( # Z, # truncate_mode='level', # show only the last p merged clusters # p=100, # show only the last p merged clusters # show_leaf_counts=True, # otherwise numbers in brackets are counts
subsetProp[i, 0] = density[index]
subsetProp[i, 1] = temperature[index]
subsetProp[i, 2] = snII[index]

# Generate the linkage matrix
# Use the Ward variance minimization algorithm
time1 = time.time()
z = sh.linkage(subsetLoc, 'ward')
time2 = time.time()
print('Duration of linkage = {0:f} seconds'.format(time2 - time1))

# Determine how well the clustering preserves the original distances
c, coph_dist = sh.cophenet(z, sd.pdist(subsetLoc))
print(c)

# Create the fancy dendrogram and save
fancy_dendrogram(z, truncate_mode='lastp', p=12, leaf_rotation=90,
                 leaf_font_size=12, show_contracted=True, annotate_above=10)
figname = '{0:s}_{1:s}_{2:s}_abscells_dendrogram.png'.format(ion, galID, expn)
plt.savefig(figname, bbox_inches='tight')
plt.cla()
plt.clf()

k = num_clusters(z)
print('Number of clusters = {0:d}'.format(k))
def main(): all_rpkms = {"names": [], "rpkms": []} srna_rpkms = {"names": [], "rpkms": []} gene_rpkms = {"names": [], "rpkms": []} gff_f = open(args.gff_file, "r") genes = [] for entry in Gff3Parser().entries(gff_f): if entry.feature != "source": genes.append(entry) libs = {"TSB_OD_0.2": [], "TSB_OD_0.5": [], "TSB_OD_1": [], "TSB_t0": [], "TSB_t1": [], "TSB_t2": [], "TSB_ON": [], "pMEM_OD_0.2": [], "pMEM_OD_0.5": [], "pMEM_OD_1": [], "pMEM_t0": [], "pMEM_t1": [], "pMEM_t2": [], "pMEM_ON": []} fh = open(args.input_file, "r") for row in csv.reader(fh, delimiter='\t'): if (not row[0].startswith("Orientation")) and ( row[0] == "sense"): gene_name = get_name(row) rpkm_row = [float(row[10]), float(row[11]), float(row[12]), float(row[13]), float(row[14]), float(row[15]), float(row[16]), float(row[17]), float(row[18]), float(row[19]), float(row[20]), float(row[21]), float(row[22]), float(row[23])] if row[3] == "CDS": all_rpkms["names"].append(gene_name) gene_rpkms["names"].append(gene_name) all_rpkms["rpkms"].append(rpkm_row) gene_rpkms["rpkms"].append(rpkm_row) elif row[3] == "sRNA": all_rpkms["names"].append(gene_name) srna_rpkms["names"].append(gene_name) all_rpkms["rpkms"].append(rpkm_row) srna_rpkms["rpkms"].append(rpkm_row) data = np.array(all_rpkms["rpkms"]) Z = linkage(data, method='ward', metric='euclidean') c, coph_dists = cophenet(Z, pdist(data)) clusters = fcluster(Z, args.max_d, criterion='distance') nums = {} names = {} c_genes = {} index = 0 for c in clusters: if c not in nums.keys(): nums[c] = 1 names[c] = [all_rpkms["names"][index]] c_genes[c] = [all_rpkms["rpkms"][index]] else: nums[c] += 1 names[c].append(all_rpkms["names"][index]) c_genes[c].append(all_rpkms["rpkms"][index]) index += 1 print(nums) # x = np.arange(14) # labels = ["TSB_OD_0.2", "TSB_OD_0.5", "TSB_OD_1", "TSB_t0", "TSB_t1", "TSB_t2", "TSB_ON", # "pMEM_OD_0.2", "pMEM_OD_0.5", "pMEM_OD_1", "pMEM_t0", "pMEM_t1", "pMEM_t2", "pMEM_ON"] # color_list = list(six.iteritems(colors.cnames)) # for index, gene_list in c_genes.items(): # plt.figure(figsize=(12.5, 8)) # srna_detect = False # srna_num = 1 # color_num = 0 # for i in range(len(gene_list)): # if "sRNA" in names[index][i]: # srna_detect = True # if ":" in names[index][i]: # srna_name = names[index][i].split(":")[-1] # else: # srna_name = "novel_" + str(srna_num) # srna_num += 1 # if ("grey" not in color_list[color_num][0]) and ( # "gray" not in color_list[color_num][0]) and ( # "white" not in color_list[color_num][0]) and ( # "snow" not in color_list[color_num][0]) and ( # color_list[color_num][0] != "w"): # plt.plot(x,gene_list[i], color=color_list[color_num][0], label=srna_name) # color_num += 1 # else: # plt.plot(x,gene_list[i],color='lightgrey') ## plt.axhline(y=0, linewidth=2, color='red') # plt.ylabel("log2 fold change", fontsize=10) # plt.xticks(x,labels,rotation=45, fontsize=8) # if srna_detect: # plt.legend(loc=9, bbox_to_anchor=(1.065, 1), fontsize=8) # plt.savefig("test_" + str(index) + ".png") for index, gene_names in names.items(): print(index) for name in gene_names: for gene in genes: if ("locus_tag" in gene.attributes.keys()): if name == gene.attributes["locus_tag"]: print(gene.info) elif ("sRNA_hit" in gene.attributes.keys()): infos = name.split("|") if (infos[0] == gene.attributes["Name"]) and ( infos[1] == str(gene.start)) and ( infos[2] == str(gene.end)) and ( infos[3] == gene.strand): print(gene.info) plt.style.use('ggplot') plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('Genes') plt.ylabel('distance') 
fancy_dendrogram( Z, # truncate_mode='lastp', # p=12, leaf_rotation=90., leaf_font_size=12., # show_contracted=True, # annotate_above=10, no_labels=True, show_leaf_counts=False, max_d=args.max_d, # plot a horizontal cut-off line ) plt.savefig("hierarchical_tree.png")