def make_tree(X, C, method='single'):
    """Build a ``Tree`` from a hierarchical clustering.

    Args:
        X: Input passed to ``ward`` when ``method == 'ward'``.
        C: Input passed to ``single`` / ``average`` for the other methods.
        method: One of ``'single'``, ``'ward'`` or ``'average'``.

    Returns:
        ``Tree`` whose root is ``construct_node`` applied to the cluster tree.

    Raises:
        ValueError: If ``method`` is not a supported name.
    """
    if method == 'single':
        linkage_matrix = single(C)
    elif method == 'ward':
        linkage_matrix = ward(X)
    elif method == 'average':
        linkage_matrix = average(C)
    else:
        # An unknown method used to fall through and crash later with an
        # UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            "unknown linkage method %r; expected 'single', 'ward' or 'average'"
            % (method,))
    return Tree(root=construct_node(to_tree(linkage_matrix)))
def load_linkages(self):
    """Load the stored X/Y linkage matrices and build their cluster trees.

    When ``X_linkage.npy`` is absent from ``self.input_dir`` only
    ``self.name`` is set (to ``'AllA'``). Otherwise ``self.name`` becomes
    ``'HAllA'`` and ``X_linkage``, ``Y_linkage``, ``X_tree`` and ``Y_tree``
    are populated from the two ``.npy`` files.
    """
    x_path = join(self.input_dir, 'X_linkage.npy')
    if isfile(x_path):
        self.name = 'HAllA'
        self.X_linkage = np.load(x_path)
        self.Y_linkage = np.load(join(self.input_dir, 'Y_linkage.npy'))
        self.X_tree = sch.to_tree(self.X_linkage)
        self.Y_tree = sch.to_tree(self.Y_linkage)
    else:
        # Only the X file is probed; its absence selects the 'AllA' mode.
        self.name = 'AllA'
def plot_leaf_ordering(X, method, metric):
    """Cluster the rows and columns of ``X`` and show a reordered heatmap.

    Rows (``X``) and columns (``X.T``) are clustered separately with
    ``hierarchy.linkage``. Each tree is passed through ``optimal_scores`` and
    ``order_tree``, and the resulting pre-order leaf sequences index the
    heatmap. Both dendrograms are drawn next to the heatmap and the figure
    is shown with ``plt.show()``.

    Args:
        X: 2-D array; indexed with ``np.ix_`` and transposed with ``.T``.
        method: Linkage method forwarded to ``hierarchy.linkage``.
        metric: Distance metric forwarded to ``pdist`` and ``linkage``.
    """
    dists = distance.squareform(distance.pdist(X, metric=metric))
    dists2 = distance.squareform(distance.pdist(X.T, metric=metric))
    Z = hierarchy.linkage(X, method=method, metric=metric)
    Z2 = hierarchy.linkage(X.T, method=method, metric=metric)
    t, rd = hierarchy.to_tree(Z, True)
    t2, rd2 = hierarchy.to_tree(Z2, True)

    M = optimal_scores(Z, rd, dists)
    order_tree(Z, rd, M)
    M2 = optimal_scores(Z2, rd2, dists2)
    order_tree(Z2, rd2, M2)

    rr = t.pre_order()
    rr2 = t2.pre_order()

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec

    fig = plt.figure(figsize=(8, 8))
    gs = GridSpec(2, 2, top=0.95, bottom=0.05, left=0.05, right=0.95,
                  hspace=0.01, wspace=0.01,
                  width_ratios=(1, 3), height_ratios=(1, 3))
    ax01 = fig.add_subplot(gs[0, 1])
    ax10 = fig.add_subplot(gs[1, 0])
    ax11 = fig.add_subplot(gs[1, 1])

    hierarchy.dendrogram(Z2, ax=ax01)
    ax01.set_axis_off()
    hierarchy.dendrogram(Z, orientation='right', ax=ax10)
    ax10.set_axis_off()

    ax11.matshow(X[np.ix_(rr, rr2)], cmap="Blues", aspect="auto")
    # tick_params expects booleans; the strings 'off'/'on' are both truthy
    # on current matplotlib and would leave every tick switched on.
    ax11.tick_params(top=False, bottom=False, right=False)
    ax11.tick_params(labeltop=False, labelleft=False, labelright=True)
    ax11.set_xticks(np.arange(len(rr2)))
    ax11.set_xticklabels(rr2, fontsize=5.0)
    ax11.set_yticks(np.arange(len(rr)))
    ax11.set_yticklabels(rr, fontsize=5.0)
    plt.show()
def classify_by_scores(M, threshold, loci, return_file_names=None):
    """Group leaves of an average-linkage tree by a distance threshold.

    Starting from each not-yet-assigned leaf (in ``leaves_list`` order), the
    tree is climbed while the parent's merge distance is below ``threshold``;
    all leaves under the node reached form one group.

    Args:
        M: Square distance matrix; condensed with ``ssd.squareform``.
        threshold: Merge distance below which nodes are joined.
        loci: Sequence indexed by leaf id; items expose ``file_name``. Only
            read when ``return_file_names`` is truthy.
        return_file_names: When truthy, return file base names instead of
            leaf ids.

    Returns:
        ``(singles, clusters)``: the members of one-element groups, and the
        multi-element groups sorted by decreasing size.
    """
    condensed = ssd.squareform(M)
    Z = linkage(condensed, method='average')
    root = clone_graph(to_tree(Z))
    id2node = {node.id: node for node in get_nodes(root)}
    leaf_ids = leaves_list(Z)

    pool = []
    i = 0
    n_leaves = len(leaf_ids)
    # Run until every leaf is consumed. The previous `i >= n_leaves - 1`
    # exit silently dropped a trailing one-leaf group.
    while i < n_leaves:
        cur_node = id2node[leaf_ids[i]]
        # Stop climbing at the root instead of dereferencing a missing parent.
        parent = getattr(cur_node, 'parent', None)
        while parent is not None and parent.dist < threshold:
            cur_node = parent
            parent = getattr(cur_node, 'parent', None)
        cur_leaf_ids = list(get_leaves(cur_node))
        pool.append(cur_leaf_ids)
        # NOTE(review): assumes the leaves under cur_node are exactly the
        # next entries of leaf_ids - confirm against get_leaves.
        i += len(cur_leaf_ids)

    clusters = sorted((group for group in pool if len(group) > 1),
                      key=len, reverse=True)
    singles = [group[0] for group in pool if len(group) == 1]

    if return_file_names:
        clusters_fn = [
            [os.path.basename(loci[leaf].file_name) for leaf in cluster]
            for cluster in clusters
        ]
        singles_fn = [os.path.basename(loci[leaf].file_name)
                      for leaf in singles]
        return singles_fn, clusters_fn
    return singles, clusters
def linkage_to_newick(dataframe, output_file):
    """Cluster the sample columns of ``dataframe`` and export a Newick tree.

    Thanks to https://github.com/biocore/scikit-bio/issues/1579

    Args:
        dataframe: pandas DataFrame with ``Position``, ``N`` and ``Samples``
            columns plus one column per sample.
        output_file: Path the Newick string is written to.

    Returns:
        The Newick formatted tree string (same text as written to the file).
    """
    # Index by integer 'Position' and drop the three non-sample columns so
    # that only one column per sample remains.
    dataframe_only_samples = dataframe.set_index(
        dataframe['Position'].astype(int)
    ).drop(['Position', 'N', 'Samples'], axis=1)
    labelList = dataframe_only_samples.columns.tolist()
    Z = shc.linkage(dataframe_only_samples.T, method='average')
    tree = shc.to_tree(Z, False)

    def buildNewick(node, newick, parentdist, leaf_names):
        if node.is_leaf():
            return "%s:%f%s" % (leaf_names[node.id],
                                parentdist - node.dist, newick)
        if len(newick) > 0:
            newick = f"):{(parentdist - node.dist)/2}{newick}"
        else:
            newick = ");"
        newick = buildNewick(node.get_left(), newick, node.dist, leaf_names)
        newick = buildNewick(node.get_right(), ",%s" % (newick),
                             node.dist, leaf_names)
        return "(%s" % (newick)

    # Build the string once and reuse it for both the file and the return
    # value (the tree was previously traversed twice).
    newick_str = buildNewick(tree, "", tree.dist, labelList)
    with open(output_file, 'w') as f:
        f.write(newick_str)
    return newick_str
def mkClustaloNewickTree(fastaFileName, verbose=False):
    """
    Build a newick string for the sequences stored in a fasta file.

    A distance matrix is obtained from mkClustaloDistanceMatrix, flattened
    with aux.mkFlatDistMat, clustered with average linkage (UPGMA), passed
    through scaleHClus and finally serialised with getNewick.

    input:
    - fastaFileName -- file with sequences
    - verbose -- forwarded to mkClustaloDistanceMatrix

    output:
    - newick_str -- the newick string representing the tree
    """
    matrix, header = mkClustaloDistanceMatrix(fastaFileName, verbose=verbose)
    condensed = aux.mkFlatDistMat(matrix)
    ## average linkage, i.e. UPGMA (cf. MhcCluster)
    raw_linkage = sclush.linkage(condensed, method='average')
    ## NOTE(review): scaleHClus presumably rescales the tree depth to 1
    scaled_linkage = scaleHClus(raw_linkage)
    root = sclush.to_tree(scaled_linkage, False)
    return getNewick(root, "", root.dist, header)
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    """Cluster ``dataset`` and return only the linkage matrix.

    Duplicate of the dandelion clustering routine, kept solely because this
    variant returns the linkage matrix. F-scores are still computed and
    stored in the ``params`` dict obtained from the documents processor.
    """
    processor = dp.DocumentsProcessor(dataset)
    if not gamma:
        tfidf_matrix, f_score_dict, params = \
            processor.get_data_with_dandelion()
    else:
        tfidf_matrix, f_score_dict, params = \
            processor.get_data_with_dandelion(gamma=gamma, filter=filter)

    # Project with a truncated SVD followed by row normalisation.
    reducer = TruncatedSVD(tfidf_matrix.shape[0])
    pipeline = make_pipeline(reducer, Normalizer(copy=False))
    tfidf_matrix = pipeline.fit_transform(tfidf_matrix)

    linkage_matrix = hr.average(tfidf_matrix)
    _, node_list = hr.to_tree(linkage_matrix, rd=True)

    # One entry per internal node: node id -> collected leaf nodes.
    clusters = {
        node.get_id(): collect_leaf_nodes(node, [])
        for node in node_list
        if not node.is_leaf()
    }

    scores = f_score(clusters, f_score_dict)
    printed = print_f_score_dict(scores)
    params['avg_f_score'] = average_f_score(scores, tfidf_matrix.shape[0])
    params['all_fscore'] = printed
    return linkage_matrix
def CreateHeiarchicalTree(cluster_set, params_set, interval, print_outs=False):
    """Build a hierarchical tree over clusters from pairwise ``Cdist`` values.

    Args:
        cluster_set: Sequence of clusters; needs at least two entries.
        params_set: Sequence of parameters, parallel to ``cluster_set``.
        interval: Forwarded unchanged to ``Cdist``.
        print_outs: When true, progress messages are printed.

    Returns:
        ``(Root, node_list, leaf_index_cluster_dict)`` where the first two
        come from ``to_tree(..., rd=True)`` and the dict maps each leaf index
        to its ``(cluster, params)`` pair.

    Raises:
        Exception: If fewer than two clusters are supplied.
    """
    L = len(cluster_set)
    if L <= 1:
        raise Exception("Cannot generate a tree from a single cluster")

    Dmatrix = zeros((L, L))
    leaf_index_cluster_dict = {}
    # print(...) with one argument behaves the same on Python 2 and 3.
    if print_outs:
        print("calculating function distances")
    for i in range(L):
        p1 = params_set[i]
        leaf_index_cluster_dict[i] = (cluster_set[i], p1)
        for j in range(L):
            Dmatrix[i, j] = Cdist(p1, params_set[j], interval)

    if print_outs:
        print("converting to square form")
    Dmatrix_c = squareform(Dmatrix)

    if print_outs:
        print("hclustering")
    linkageMatrix = hcluster.linkage(Dmatrix_c)

    if print_outs:
        print("creating tree")
    Root, node_list = to_tree(linkageMatrix, rd=True)
    return Root, node_list, leaf_index_cluster_dict
def plot_dendrogram(Z, dendogram_file_name):
    """Draw the dendrogram of linkage ``Z`` and save it as pdf or png.

    Args:
        Z: Linkage matrix accepted by ``to_tree`` and ``fancy_dendrogram``.
        dendogram_file_name: Output path; must end with ``pdf`` or ``png``.

    Returns:
        The cut threshold used for the plot: one third of the root distance.

    Raises:
        NotImplementedError: If the file name has an unsupported ending.
    """
    # Validate the format before any plotting work. The old code raised
    # `NotImplemented(...)`, which is not an exception and itself failed
    # with a TypeError.
    if dendogram_file_name.endswith('pdf'):
        file_format = 'pdf'
    elif dendogram_file_name.endswith('png'):
        file_format = 'png'
    else:
        raise NotImplementedError('File format has to be either png or pdf')

    root = to_tree(Z)
    threshold = root.dist / 3.0
    all_leaves = get_leaves(root)

    plt.figure(figsize=(30, 30))
    try:
        title = 'Hierarchical Clustering Dendrogram( %d leaves)' % len(all_leaves)
        xlabel = 'loci'
        ylabel = 'distance'
        fancy_dendrogram(
            Z,
            leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=4.,  # font size for the x axis labels
            annotate_above=10,
            max_d=threshold,
            title=title,
            xlabel=xlabel,
            ylabel=ylabel)
        plt.savefig(dendogram_file_name, format=file_format)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close()
    return threshold
def reconstruct_scipy_linkage(df_mutation_table, path_out_newick):
    """Cluster the mutation table columns and write a distance-free Newick.

    Args:
        df_mutation_table: pandas DataFrame with a ``root`` column (dropped)
            and one column per leaf; columns become the clustered items.
        path_out_newick: Path the Newick string plus a newline is written to.

    Returns:
        The root node returned by ``hierarchy.to_tree``.
    """
    from scipy.cluster import hierarchy

    # get newick without distance
    def getNewick(node, newick, parentdist, leaf_names):
        if node.is_leaf():
            return "%s%s" % (leaf_names[node.id], newick)
        if len(newick) > 0:
            newick = ")%s" % (newick)
        else:
            newick = ");"
        newick = getNewick(node.get_left(), newick, node.dist, leaf_names)
        newick = getNewick(node.get_right(), ",%s" % (newick),
                           node.dist, leaf_names)
        return "(%s" % (newick)

    tdf = df_mutation_table.drop(columns='root').transpose()
    # Use the locally imported module; `scipy.cluster.hierarchy.linkage`
    # only worked if a global `scipy` import happened to exist.
    link = hierarchy.linkage(tdf)
    tree = hierarchy.to_tree(link, False)
    newick_str = getNewick(tree, "", tree.dist, tdf.index)
    with open(path_out_newick, 'wt') as fout:
        fout.write(newick_str)
        fout.write('\n')
    return tree
def _process_block():
    """Fill the d3 dictionary from the cluster tree and return it.

    The tree built from ``linkage`` is added to ``bcluster_dendro`` via
    ``_add_node``; the last entry of its ``"children"`` list is then passed
    to ``_label_tree``.
    """
    cluster_root = to_tree(linkage, rd=False)
    _add_node(cluster_root, bcluster_dendro)
    # label starting from the last element of the children list
    newest_child = bcluster_dendro["children"][-1]
    _label_tree(newest_child)
    return bcluster_dendro
def ward_dynamicTreeCut(rmsd_mat, tau=5):
    """Cut the Ward dendrogram of ``rmsd_mat`` and report the assignments.

    Args:
        rmsd_mat: Matrix whose first dimension gives the item count; passed
            to ``get_ward_dendrogram``.
        tau: Forwarded to ``dynamicTreeCut``.

    Returns:
        Whatever ``report_assingments`` yields for the cut points and tree.
    """
    item_count = rmsd_mat.shape[0]
    root = to_tree(get_ward_dendrogram(rmsd_mat))
    cut_points = dynamicTreeCut(root, item_count, tau)
    return report_assingments(cut_points, root)
def __get_column_dendrogram__(self):
    """Describe the column clustering as a dict of nodes keyed by node id.

    Returns:
        ``{"nodes": {node_id: info}}``. Leaves carry ``count`` (1) and
        ``distance`` (0). Inner nodes carry ``count``, ``distance`` (rounded
        to 3 decimals), ``left_child`` and ``right_child``. Every node
        except the root also gets a ``parent`` id.
    """
    # rd=True also returns the flat list of all nodes of the tree
    root, nodes = hcluster.to_tree(self.cluster_object.column_clustering,
                                   rd=True)
    node_id2node = {}

    for node in nodes:
        # (a leftover debug print of the builtin `id` was removed here)
        if node.count == 1:
            # leaf node
            node_id2node[node.id] = {"count": 1, "distance": 0}
        else:
            node_id2node[node.id] = {
                "count": node.count,
                "distance": round(node.dist, 3),
                "left_child": node.get_left().id,
                "right_child": node.get_right().id,
            }

    # second pass: tell every child which node is its parent
    for parent_id, info in node_id2node.items():
        if info["count"] != 1:
            node_id2node[info["left_child"]]["parent"] = parent_id
            node_id2node[info["right_child"]]["parent"] = parent_id

    return {"nodes": dict(node_id2node)}
def hierarchical_clustering_to_dendrogram(clustering):
    """Converts an array representing a clustering to a dendrogram.

    Args:
        clustering (ndarray): A hierarchical clustering matrix, in the form
            returned by scipy.hierarchical.linkage.

    Returns:
        (networkx.DiGraph): A dendrogram. Each node in the dendrogram has the
            'distance' attribute, which is the threshold at which its children
            are merged in the clustering.
    """
    from collections import deque

    root = _hierarchy.to_tree(clustering)
    tree = _nx.DiGraph()
    tree.add_node(root.id, distance=root.dist)

    # Always start with a (possibly empty) queue; it used to be unbound when
    # the root had no children.
    queue = deque(
        (root, child) for child in (root.left, root.right)
        if child is not None
    )
    # Breadth-first walk; deque gives O(1) pops from the left end.
    while queue:
        parent, child = queue.popleft()
        # Create the node with its attribute directly: `DiGraph.node` no
        # longer exists in current networkx releases.
        tree.add_node(child.id, distance=float(child.dist))
        tree.add_edge(parent.id, child.id)
        if child.left is not None:
            queue.append((child, child.left))
        if child.right is not None:
            queue.append((child, child.right))
    return tree
def to_dict(self, correlation_matrix, linkage_matrix):
    """Summarise every merge of ``linkage_matrix`` as a dict.

    Args:
        correlation_matrix: Unused; kept for interface compatibility.
        linkage_matrix: Linkage matrix accepted by ``hierarchy.to_tree``.

    Returns:
        Dict keyed by 1-based cluster number (row index of the merge + 1).
        Each value holds ``'datasets'`` (sorted 1-based leaf ids under the
        cluster) and ``'height'`` (the merge distance from column 2).
    """
    from scipy.cluster import hierarchy

    tree = hierarchy.to_tree(linkage_matrix, rd=False)
    n_merges = len(linkage_matrix)

    d = {}
    # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
    # https://gist.github.com/mdml/7537455

    # Explicit stack instead of recursion so deep trees cannot hit the
    # interpreter's recursion limit; visiting order stays pre-order.
    stack = [tree]
    while stack:
        node = stack.pop()
        if node.is_leaf():
            continue
        # Inner node ids start at (number of leaves) == n_merges + 1.
        cluster_id = node.get_id() - n_merges - 1
        row = linkage_matrix[cluster_id]
        d[cluster_id + 1] = {
            'datasets': [i + 1 for i in sorted(node.pre_order())],
            'height': row[2],
        }
        # Push right first so that the left child is handled first.
        if node.right is not None:
            stack.append(node.right)
        if node.left is not None:
            stack.append(node.left)
    return d
def plot_dendrogram(Z, dendogram_file_name):
    """Plot the dendrogram for linkage ``Z`` and store it as png or pdf.

    Args:
        Z: Linkage matrix accepted by ``to_tree`` and ``fancy_dendrogram``.
        dendogram_file_name: Output path; must end with ``pdf`` or ``png``.

    Returns:
        The threshold passed as ``max_d``: the root distance divided by 3.

    Raises:
        NotImplementedError: If the file name ends with neither suffix.
    """
    # Work out the output format first so an unsupported name fails before
    # a figure is created. `NotImplemented` is a constant, not an exception
    # class, so raising it produced a TypeError.
    file_format = None
    for candidate in ('pdf', 'png'):
        if dendogram_file_name.endswith(candidate):
            file_format = candidate
            break
    if file_format is None:
        raise NotImplementedError('File format has to be either png or pdf')

    root = to_tree(Z)
    threshold = root.dist / 3.0
    all_leaves = get_leaves(root)

    plt.figure(figsize=(30, 30))
    try:
        fancy_dendrogram(
            Z,
            leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=4.,  # font size for the x axis labels
            annotate_above=10,
            max_d=threshold,
            title='Hierarchical Clustering Dendrogram( %d leaves)' % len(all_leaves),
            xlabel='loci',
            ylabel='distance'
        )
        plt.savefig(dendogram_file_name, format=file_format)
    finally:
        # Close the figure on every path so failures do not leak it.
        plt.close()
    return threshold
def to_dict(self, linkage_matrix):
    """Summarise every merge of ``linkage_matrix`` as a dict.

    Args:
        linkage_matrix: Linkage matrix accepted by ``hierarchy.to_tree``.

    Returns:
        Dict keyed by 1-based cluster number (row index of the merge + 1).
        Each value holds ``'datasets'`` (sorted 1-based leaf ids under the
        cluster) and ``'height'`` (the merge distance from column 2).
    """
    from scipy.cluster import hierarchy

    tree = hierarchy.to_tree(linkage_matrix, rd=False)
    n_merges = len(linkage_matrix)

    d = {}
    # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
    # https://gist.github.com/mdml/7537455

    # Iterative pre-order walk; avoids the recursion limit on deep trees.
    pending = [tree]
    while pending:
        node = pending.pop()
        if node.is_leaf():
            continue
        # Inner node ids start right after the leaves (n_merges + 1 leaves).
        cluster_id = node.get_id() - n_merges - 1
        d[cluster_id + 1] = {
            'datasets': [i + 1 for i in sorted(node.pre_order())],
            'height': linkage_matrix[cluster_id][2],
        }
        # Right goes on the stack first so the left subtree is handled first.
        for child in (node.right, node.left):
            if child is not None:
                pending.append(child)
    return d
def optMDL(df):
    """Pick the cluster count (1..10) with the smallest description length.

    For each candidate count the flat clustering's leader nodes are looked
    up in the tree, and the description length is computed as
    ``binning(leafDict) + n_cluster * log2(df.shape[0])``.

    Args:
        df: Input passed to ``getDist``; ``df.shape[0]`` is the item count.

    Returns:
        ``(optK, minMDL, DList)``: the best cluster count, its description
        length, and the description lengths of all candidates in order.
    """
    Z = getDist(df)
    tree = sc.to_tree(Z, rd=True)[1]
    # Start from +inf rather than a magic 1000000, so a valid optimum is
    # still found when every description length exceeds that constant.
    minMDL = float('inf')
    optK = 0
    DList = []
    bits_per_cluster = np.log2(df.shape[0])  # loop invariant
    for n_cluster in range(1, 11):
        N = fcluster(Z, n_cluster, criterion='maxclust')
        L, M = sc.leaders(Z, N)
        leaders = list(L)
        print(leaders)
        leader_ids = set(leaders)  # O(1) membership in the node loop
        leafDict = {}
        for node in tree:
            key = node.get_id()
            if key not in leader_ids:
                continue
            if node.get_count() > 1:
                leafDict[key] = getleafdict(node)
            else:
                leafDict[key] = {key: 0}
        desLength = binning(leafDict) + n_cluster * bits_per_cluster
        DList.append(desLength)
        if desLength < minMDL:
            minMDL = desLength
            optK = n_cluster
    return optK, minMDL, DList
def test_mirac_wrong_args(self):
    """MIRAC must reject invalid arguments with ValueError."""
    x = np.zeros((10, 10))
    # One invalid keyword set per case. The exact duplicate of the min_cl_n
    # check and the unused `excinfo` bindings were dropped.
    invalid_kwargs = (
        {'min_cl_n': -0.1},             # wrong min_cl_n
        {'cl_mdl_scale_factor': -0.1},  # wrong cl_mdl_scale_factor
        {'encode_type': '1'},           # wrong encode type
        {'encode_type': 1},
        {'dim_reduct_method': 'NONN'},
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            cluster.MIRAC(x, metric='euclidean', **kwargs)

    # hac tree n_leaves different from n_samples
    z = sch.linkage([[0], [5], [6], [8], [9], [12]], method='single',
                    optimal_ordering=True)
    hct = eda.HClustTree(sch.to_tree(z))
    with pytest.raises(ValueError):
        cluster.MIRAC(x, metric='euclidean', hac_tree=hct)
def get_motiftrees(motifs, buckettable, method="ward", metric="euclidean",
                   outputdir="MotifTree/"):
    """Write the motif-filtered bucket table and a Newick tree of the motifs.

    Creates ``outputdir`` (``os.mkdir`` raises if it already exists), writes
    ``Buckettable_Motifs.tsv`` and ``Tree_Motifs.txt`` into it.

    Args:
        motifs: DataFrame with ``scans``, ``motif`` and ``motif_*`` columns.
        buckettable: DataFrame with a ``#OTU ID`` column.
        method: Linkage method forwarded to scipy's ``linkage``.
        metric: Distance metric forwarded to scipy's ``linkage``.
        outputdir: Directory that receives both output files.
    """
    os.mkdir(outputdir)

    # select only features present in tree
    shared_ids = set(buckettable['#OTU ID']) & set(motifs['scans'])
    bt_sel = buckettable[buckettable['#OTU ID'].isin(list(shared_ids))]
    # os.path.join keeps the files inside outputdir even when the caller
    # omits the trailing slash.
    bt_sel.to_csv(os.path.join(outputdir, 'Buckettable_Motifs.tsv'),
                  sep='\t', index=False)

    # remove features, which do not contain any motif
    motifs = motifs[pd.notnull(motifs['motif'])]
    motifs = motifs.loc[:, (motifs != 0).any(axis=0)]
    motifs.index = motifs['scans']
    motifs = motifs.filter(like='motif_')  # select all motif columns

    Z = scipy.cluster.hierarchy.linkage(motifs, method=method, metric=metric)
    leaf_names = motifs.index  # leaves are labelled by scan id
    tree = hierarchy.to_tree(Z, False)

    # Context manager closes the file even when getNewick raises.
    with open(os.path.join(outputdir, 'Tree_Motifs.txt'), 'w') as f:
        f.write(getNewick(tree, "", tree.dist, leaf_names))
def objectlm_covariance(matrix, savepath, metric="cosine"):
    """Return a HierarchicalObservation built from a cached covariance.

    Both the average-linkage matrix and the covariance are cached as ``.npy``
    files below ``savepath``; each one is computed and saved only when its
    file does not exist yet.

    Args:
        matrix: Observations clustered with ``linkage``; ``matrix.shape[0]``
            bounds the covariance slice that is kept.
        savepath: Cache directory (a trailing "/" is appended if missing).
        metric: Distance metric forwarded to ``linkage``.
    """
    if not savepath.endswith("/"):
        savepath += "/"
    linkage_file = savepath + "__linkage_average.npy"
    covariance_file = savepath + "__covariance__.npy"

    if not os.path.exists(linkage_file):
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        Z = linkage(matrix, method='average', metric=metric)
        np.save(linkage_file, Z)
    else:
        Z = np.load(linkage_file)

    if not os.path.exists(covariance_file):
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        # degree - adjacency, plus a 1/sigma^2 term on the diagonal
        laplacian = np.diag(deg_mat) - adj_mat + 1 / (sigma**2) * np.eye(
            len(deg_mat))
        n_obs = matrix.shape[0]
        Cov = np.linalg.inv(laplacian)[:n_obs, :n_obs]
        np.save(covariance_file, Cov)
    else:
        Cov = np.load(covariance_file)

    return HierarchicalObservation(Cov)
def __init__(self, num_topics, metric='jensenshannon', method='ward',
             unique_scale=True, topn=None):
    """
    Load an LDA model and store linkage matrix `Z` and `nodelist`.

    args:
        num_topics (int): Selects LDA model.
        metric (str): Metric passed to scipy.spatial.distance.pdist
        method (str): Method passed to scipy.cluster.hierarchy
        unique_scale (bool): Divide each word column by its column sum
        topn (int, optional): keep only the topn largest values per topic
            (don't use)
    """
    self.num_topics = num_topics
    self.metric = metric
    self.method = method
    self.scale = 200

    model_dir = os.path.join(params().paths['lda'],
                             'lda_model_' + str(self.num_topics))
    self.lda_model = gensim.models.LdaMulticore.load(
        os.path.join(model_dir, 'trained_lda'))

    topics = self.lda_model.get_topics()
    if unique_scale:
        topics = topics / (topics.sum(axis=0))
    if topn:
        # sort each row ascending in place, reverse it, keep the first topn
        topics.sort(axis=1)
        topics = np.flip(topics, axis=1)[:, 0:topn]

    condensed = pdist(topics, metric=self.metric)
    self.Z = hierarchy.linkage(condensed, method=self.method)
    _root, self.nodelist = hierarchy.to_tree(self.Z, rd=True)
def check_leaves_list_iris(self, method):
    """Check that leaves_list(Z) matches the tree's pre-order on Iris.

    Args:
        method: Linkage method name forwarded to ``linkage``.
    """
    # Tests leaves_list(Z) on the Iris data set
    X = eo['iris']
    # (an unused pdist(X) computation was removed)
    Z = linkage(X, method)
    node = to_tree(Z)
    assert_equal(node.pre_order(), leaves_list(Z))
def linkage_matrix_to_dict(linkage_matrix):
    """Summarise every merge of ``linkage_matrix`` as an ordered dict.

    Returns:
        ``OrderedDict`` sorted by 1-based cluster number. Each value holds
        ``"datasets"`` (sorted 1-based leaf ids under the cluster) and
        ``"height"`` (the merge distance from column 2 of the matrix).
    """
    root = hierarchy.to_tree(linkage_matrix, rd=False)
    merge_count = len(linkage_matrix)

    # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
    # https://gist.github.com/mdml/7537455
    summary = {}
    pending = [root]
    while pending:
        current = pending.pop()
        if current.is_leaf():
            continue
        merge_index = current.get_id() - merge_count - 1
        members = sorted(current.pre_order())
        summary[merge_index + 1] = {
            "datasets": [leaf + 1 for leaf in members],
            "height": linkage_matrix[merge_index][2],
        }
        # Right child is stacked first so the left one is processed first.
        if current.right:
            pending.append(current.right)
        if current.left:
            pending.append(current.left)

    return OrderedDict(sorted(summary.items()))
def guide_tree_from_sequences(sequences, metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA (average linkage) guide tree for sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The object produced by ``to_tree`` for the linkage matrix.
    NOTE(review): previously documented as skbio.TreeNode - confirm which
    ``to_tree`` is in scope.

    """
    distance_matrix = DistanceMatrix.from_iterable(
        sequences, metric=metric, key='id')
    linkage_matrix = average(distance_matrix.condensed_form())
    tree = to_tree(linkage_matrix)
    if display_tree:
        dendrogram(linkage_matrix,
                   labels=distance_matrix.ids,
                   orientation='right',
                   link_color_func=lambda x: 'black')
    return tree
def tfidf_covariance(texts, savepath):
    """Return a HierarchicalObservation from a cached tf-idf covariance.

    Both the average-linkage matrix of the tf-idf vectors and the covariance
    are cached as ``.npy`` files below ``savepath``; each one is computed and
    saved only when its file does not exist yet.

    Args:
        texts: Sized collection of token sequences; each is joined with
            spaces before vectorising, and ``len(texts)`` bounds the
            covariance slice that is kept.
        savepath: Cache directory (a trailing "/" is appended if missing).
    """
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    linkage_path = savepath + "__linkage_average.npy"
    covariance_path = savepath + "__covariance__.npy"

    if os.path.exists(linkage_path):
        Z = np.load(linkage_path)
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        # `input` must be 'filename', 'file' or 'content'. The builtin `str`
        # used before only worked by falling through to content mode and is
        # rejected by parameter validation in recent scikit-learn.
        vectorizer = TfidfVectorizer(input='content',
                                     strip_accents='ascii',
                                     analyzer='word',
                                     max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        # toarray() yields a plain ndarray; todense() returns np.matrix.
        Z = linkage(y.toarray(), method='average', metric='euclidean')
        np.save(linkage_path, Z)

    if os.path.exists(covariance_path):
        Cov = np.load(covariance_path)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        # 1.0 keeps this a true division on Python 2 as well.
        laplacian = np.diag(deg_mat) - adj_mat + 1.0 / (sigma**2) * np.eye(
            len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(covariance_path, Cov)

    return HierarchicalObservation(Cov)
def __init__(self, flat_cluster, cluster = None, curve_list = None):
    """Locate the tree node matching a cluster and record its two children.

    Args:
        flat_cluster: FlatClusters object; supplies the CoAnalysis object.
        cluster: Optional Cluster object; when given, its curve indexes are
            used and ``curve_list`` is ignored.
        curve_list: Curve indexes used when ``cluster`` is None.
    """
    from scipy.cluster.hierarchy import to_tree
    from numpy import asarray, sort

    self.flat = flat_cluster  # FlatClusters object
    self.co_analysis = self.flat.get_co_analysis()  # CoAnalysis object
    self.cluster = cluster  # Cluster object
    # Identity test: `== None` would go through any custom __eq__.
    if cluster is not None:
        self.curve_list = cluster.list_curve_indexes()
    else:
        self.curve_list = curve_list

    self.Z = self.co_analysis.get_hierarchical_cluster()
    root = to_tree(self.Z)  # root of entire cluster!
    curves = asarray(self.curve_list)  # list of curves in this cluster

    # Get the cluster node that corresponds to the curves in the cluster above
    self.cluster_node = get_cluster_node(root, root.left, root.right, curves)
    self.id = self.cluster_node.get_id()

    # Get the right and left cluster nodes
    self.left = self.cluster_node.left
    self.right = self.cluster_node.right

    # Get the left and right cluster lists
    self.left_list = sort(any_pre_order(root, self.left))
    self.right_list = sort(any_pre_order(root, self.right))
def scipy_algo(dataset, abstract=False):
    """Cluster the tf-idf matrix of ``dataset`` and return the mean f-score.

    Args:
        dataset: Passed to ``dp.DocumentsProcessor``.
        abstract: Forwarded to ``DocumentsProcessor.get_data``.

    Returns:
        The value of ``average_f_score`` over all internal tree nodes.
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)

    # The LSA projection is disabled in this variant, so the never-fitted
    # SVD pipeline objects were dropped; the raw matrix is clustered.
    # print(...) with one argument behaves the same on Python 2 and 3.
    print('starting clustering after lsa: found %s document and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())
    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}
    for node in t[1]:
        if not node.is_leaf():
            clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    print_f_score_dict(f)
    avg_f_score = average_f_score(f, tfidf_matrix.shape[0])
    print('average f_score: %s' % avg_f_score)
    return avg_f_score
def get_clusters_hac(data: DataFrame, dist_metric: str, height: int = None, show_dendrogram: bool = False, show_chart: bool = False):
    """
    Cluster phrase vectors with Hierarchical Agglomerative Clustering (HAC)

    ``data.vec`` holds the vectors. When ``height`` is None the cut height
    comes from ``get_optimal_height``.

    Returns (cluster_assignments, linkage_matrix, max_h, height)
    """
    vectors = list(data.vec)

    # Pairwise distances; cosine is derived from the similarity matrix
    if dist_metric != "cosine":
        dist = pairwise_distances(vectors, metric=dist_metric)
    else:
        dist = 1 - cosine_similarity(vectors)
    # NOTE(review): ward() receives the square distance matrix as-is -
    # confirm it should not be condensed first.
    linkage_matrix = ward(dist)

    # The tallest useful cut is one above the height of the tree
    tree_root = hierarchy.to_tree(linkage_matrix)
    max_h = get_tree_height(tree_root) + 1

    # Fall back to the optimal height when the caller gave none
    if height is None:
        height = get_optimal_height(data, linkage_matrix, max_h, show_chart)

    cluster_assignments = get_cluster_assignments_hac(linkage_matrix, height)

    # Optionally display the clustering dendrogram
    if show_dendrogram:
        dendrogram(linkage_matrix)
        plt.show()

    return cluster_assignments, linkage_matrix, max_h, height
def guide_tree_from_sequences(sequences, distance_fn=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA (average linkage) guide tree for sequences

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be represented in the resulting guide tree.
    distance_fn : function
        Passed to ``sequences.distances`` to obtain the distance matrix.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The object produced by ``to_tree`` for the linkage matrix.
    NOTE(review): previously documented as skbio.TreeNode - confirm which
    ``to_tree`` is in scope.

    """
    distance_matrix = sequences.distances(distance_fn)
    linkage_matrix = average(distance_matrix.condensed_form())
    tree = to_tree(linkage_matrix)
    if display_tree:
        dendrogram(linkage_matrix,
                   labels=distance_matrix.ids,
                   orientation='right',
                   link_color_func=lambda x: 'black')
    return tree
def tfidf_covariance(texts, savepath):
    """Build (or load) the tf-idf tree covariance and wrap it.

    The average-linkage matrix and the covariance are each cached in a
    ``.npy`` file below ``savepath`` and only recomputed when that file is
    missing.

    Args:
        texts: Sized collection of token sequences; each is space-joined
            before vectorising, and ``len(texts)`` bounds the covariance
            slice that is kept.
        savepath: Cache directory (a trailing "/" is appended if missing).

    Returns:
        ``HierarchicalObservation(Cov)``.
    """
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    linkage_file = savepath + "__linkage_average.npy"
    covariance_file = savepath + "__covariance__.npy"

    if not os.path.exists(linkage_file):
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        # 'content' is the documented value for in-memory strings; passing
        # the builtin `str` is not a valid option and fails parameter
        # validation in recent scikit-learn.
        vectorizer = TfidfVectorizer(input='content', strip_accents='ascii',
                                     analyzer='word', max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        # Cluster a plain ndarray rather than the np.matrix from todense().
        Z = linkage(y.toarray(), method='average', metric='euclidean')
        np.save(linkage_file, Z)
    else:
        Z = np.load(linkage_file)

    if not os.path.exists(covariance_file):
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        # Float literal avoids integer division (-> 0) on Python 2.
        laplacian = (np.diag(deg_mat) - adj_mat
                     + 1.0 / (sigma**2) * np.eye(len(deg_mat)))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(covariance_file, Cov)
    else:
        Cov = np.load(covariance_file)

    return HierarchicalObservation(Cov)
def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True):
    """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can
    cut the dendrogram in a distance to decrease the row size of the heatmap to specified count.
    When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean).
    When the metadata are nominal (text values) the most frequent is the result after compression.
    By setting write_data to False the data features won't be present in the resulting format."""
    self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)}

    self.compress = compress
    self.compressed_value = compressed_value
    self.compress_cluster_threshold = 0
    if self.compress and self.compress >= 0:
        self.compress_cluster_threshold = self.__get_distance_threshold__(compress)
        print("Distance threshold for compression:", self.compress_cluster_threshold)
        if self.compress_cluster_threshold >= 0:
            self.__compress_data__()
    else:
        # falsy or negative request: compression is switched off
        self.compress = False

    if self.header:
        # an empty list is stored when the data features are not written
        self.dendrogram["data"]["feature_names"] = (
            list(self.header) if write_data else [])

    if self.axis == "both" and len(self.cluster_object.column_clustering):
        # __get_column_dendrogram__ builds the column tree itself; the
        # extra to_tree() call whose result was discarded has been removed.
        self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__()
def __get_column_dendrogram__(self):
    """Describe the column clustering as a dict of nodes keyed by node id.

    Leaves carry ``count`` (1) and ``distance`` (0). Inner nodes carry
    ``count``, ``distance`` (rounded to 3 decimals), ``left_child`` and
    ``right_child``. Every node except the root also gets a ``parent`` id.
    The result is ``{"nodes": {node_id: info}}``.
    """
    _root, all_nodes = hcluster.to_tree(
        self.cluster_object.column_clustering, rd=True)

    described = {}
    for tree_node in all_nodes:
        if tree_node.count != 1:
            described[tree_node.id] = {
                "count": tree_node.count,
                "distance": round(tree_node.dist, 3),
                "left_child": tree_node.get_left().id,
                "right_child": tree_node.get_right().id,
            }
        else:
            described[tree_node.id] = {"count": 1, "distance": 0}

    # link every child back to the inner node that owns it
    for owner_id, info in described.items():
        if info["count"] == 1:
            continue
        for side in ("left_child", "right_child"):
            described[info[side]]["parent"] = owner_id

    result = {"nodes": {}}
    result["nodes"].update(described)
    return result
def make_interactive_tree(matrix=None,labels=None):
    '''make interactive tree builds the nested dict used to render an
    interactive tree from a ward clustering of the data frame rows.

    :param matrix: pandas DataFrame to cluster; anything else logs a
                   warning and yields None.
    :param labels: leaf labels; defaults to the data frame index.
    :returns: dict with "children" and "name" keys, or None.
    '''
    from scipy.cluster.hierarchy import linkage, to_tree

    d3 = None
    if isinstance(matrix, pandas.DataFrame):
        Z = linkage(matrix, 'ward')  # clusters
        T = to_tree(Z, rd=False)

        # `is None` rather than `== None`: comparing an array of labels
        # with == is element-wise and cannot be used as a condition.
        if labels is None:
            labels = matrix.index.tolist()
        lookup = dict(zip(range(len(labels)), labels))

        # The unplotted dendrogram that used to be computed here was never
        # read, so it (and the unused cophenet/pdist imports) was removed.
        d3 = dict(children=[], name="root")
        add_node(T, d3)
        label_tree(d3["children"][0], lookup)
    else:
        bot.warning('Please provide data as pandas Data Frame.')
    return d3
def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True):
    """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can
    cut the dendrogram in a distance to decrease the row size of the heatmap to specified count.
    When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean).
    When the metadata are nominal (text values) the most frequent is the result after compression.
    By setting write_data to False the data features won't be present in the resulting format."""
    self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)}

    self.compress = compress
    self.compressed_value = compressed_value
    self.compress_cluster_treshold = 0
    if self.compress and self.compress >= 0:
        self.compress_cluster_treshold = self.__get_distance_treshold__(compress)
        print("Distance treshold for compression:", self.compress_cluster_treshold)
        if self.compress_cluster_treshold >= 0:
            self.__compress_data__()
    else:
        # falsy or negative request: compression is switched off
        self.compress = False

    if self.header:
        # an empty list is stored when the data features are not written
        self.dendrogram["data"]["feature_names"] = (
            list(self.header) if write_data else [])

    if self.axis == "both" and len(self.cluster_object.column_clustering):
        # The column tree is built inside __get_column_dendrogram__; the
        # to_tree() call whose result was never used has been removed.
        self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__()
def guide_tree_from_sequences(sequences, metric=kmer_distance,
                              display_tree = False):
    """ Build a UPGMA (average linkage) guide tree for sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The object produced by ``to_tree`` for the linkage matrix.
    NOTE(review): previously documented as skbio.TreeNode - confirm which
    ``to_tree`` is in scope.

    """
    pairwise = DistanceMatrix.from_iterable(sequences, metric=metric, key='id')
    upgma_linkage = average(pairwise.condensed_form())
    result_tree = to_tree(upgma_linkage)
    if not display_tree:
        return result_tree
    dendrogram(upgma_linkage, labels=pairwise.ids, orientation='right',
               link_color_func=lambda x: 'black')
    return result_tree
def clusters_to_json(clusters, labels):
    """Dump the cluster tree as d3 JSON and write the flattened leaf names.

    Writes the nested dict to ``OUT_JSON_FILE`` and one line per entry of
    the leaf-name list to ``leafNames_list.txt`` under ``MODEL_DIR``.

    Args:
        clusters: Linkage matrix accepted by ``hcl.to_tree``.
        labels: Leaf labels; position ``i`` names leaf id ``i``.
    """
    T = hcl.to_tree(clusters, rd=False)

    # Create dictionary for labeling nodes by their IDs
    id2name = dict(zip(range(len(labels)), labels))

    # Initialize nested dictionary for d3, then recursively iterate through tree
    d3Dendro = dict(children=[], name="Root1")
    add_node(T, d3Dendro)

    leafNames_list = []
    sys.setrecursionlimit(15000)
    label_tree(d3Dendro["children"][0], id2name, leafNames_list)

    # Output to JSON; the context manager closes the handle that the old
    # inline open() call leaked.
    with open(OUT_JSON_FILE, "w") as json_file:
        json.dump(d3Dendro, json_file, sort_keys=True, indent=4)

    def join_list(line_list, joined_list):
        # An int entry is an index into leafNames_list and is expanded
        # recursively; anything else is taken as a word.
        for word in line_list:
            if type(word) == int:
                join_list(leafNames_list[word], joined_list)
            else:
                joined_list.append(str(word))

    # NOTE(review): the "\\" separator is Windows-specific; left unchanged
    # so the output location stays the same.
    with open(MODEL_DIR+"\\leafNames_list.txt", 'w') as fp:
        for indx, line in enumerate(leafNames_list):
            joined_list = []
            join_list(line, joined_list)
            fp.write(str(indx)+"\t"+'--'.join(joined_list)+"\n")
def cluster_sequences(sequences, minsize=5):
    """
    Group the given sequences into clusters of similar sequences.

    Return a triple: a pandas.DataFrame of the distance matrix, the linkage
    result, and a list mapping each sequence index to its cluster id. A zero
    in that list means the sequence belongs to no cluster.
    """
    matrix = distances(sequences)
    condensed = distance.squareform(matrix)
    linkage = hierarchy.linkage(condensed, method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    root = hierarchy.to_tree(linkage)
    previous_dist = linkage[:, 2].max()  # highest distance
    assignment = [0] * len(sequences)
    next_cluster = 1
    for inner_node in inner_nodes(root):
        splits_here = (
            inner_node.dist > 0
            and previous_dist / inner_node.dist < 0.8
            and inner_node.left.count >= minsize
            and inner_node.right.count >= minsize
        )
        if splits_here:
            for seq_index in collect_ids(inner_node.left):
                # Do not overwrite previously assigned ids
                if assignment[seq_index] == 0:
                    assignment[seq_index] = next_cluster
            next_cluster += 1
        previous_dist = inner_node.dist
    # The rightmost subtree is left unprocessed by the loop above. Per the
    # original authors it never held true novel sequences in their
    # experiments, so it is omitted.
    return pd.DataFrame(matrix), linkage, assignment
def _build_graph(self):
    """Build a directed graph mirroring the cluster tree of ``self.z``.

    Sets ``root_``, ``root_id_``, ``graph_``, ``classes_`` and ``paths_``.
    Every node visited by ``self._walk`` gets the attributes ``model``,
    ``flat_classes``, ``left`` and ``right``; ``paths_`` maps each non-root
    node to the shortest path leading to it from the root.
    """
    self.root_ = to_tree(self.z)
    self.root_id_ = str(self.root_.id)
    self.graph_ = nx.DiGraph()
    self.graph_.add_node(self.root_id_)

    def name_of(cluster_node):
        # Use the mapped label when there is one, otherwise the raw node id.
        return str(self.labels_dict.get(cluster_node.id, cluster_node.id))

    for node in self._walk(self.root_):
        label = name_of(node)
        attrs = self.graph_.nodes[label]
        attrs['model'] = None
        attrs['flat_classes'] = list(
            map(self.labels_dict.get, node.pre_order()))
        attrs['left'] = None
        attrs['right'] = None
        # Same treatment for both children, left first.
        for side, child in (('left', node.left), ('right', node.right)):
            if child:
                child_label = name_of(child)
                self.graph_.add_node(child_label)
                self.graph_.add_edge(label, child_label)
                attrs[side] = child_label

    self.classes_ = [name for name in self.graph_.nodes()
                     if name != self.root_id_]
    self.paths_ = {}
    for class_ in self.classes_:
        self.paths_[class_] = nx.shortest_path(
            self.graph_, self.root_id_, class_)
def test_Q_subtree_pre_order(self):
    # pre_order() on the root must equal the left sub-tree's pre_order()
    # followed by the right sub-tree's, i.e. it also works on sub-trees.
    root = to_tree(linkage(hierarchy_test_data.Q_X, 'single'))
    whole = root.pre_order()
    left_part = root.get_left().pre_order()
    right_part = root.get_right().pre_order()
    assert_equal(whole, left_part + right_part)
def cluster_alchemy(dataset, gamma=None, filter=False):
    """Cluster a dataset with average linkage and report f-scores.

    Args:
        dataset: passed straight to ``dp.DocumentsProcessor``.
        gamma: forwarded to ``get_data_with_alchemy`` together with ``filter``
            when truthy; any falsy value (including 0) makes the processor
            run with its own defaults and ``filter`` is then ignored.
        filter: forwarded to ``get_data_with_alchemy`` when ``gamma`` is set.

    Returns:
        The ``params`` dict returned by the processor, extended with the keys
        ``'avg_f_score'`` and ``'all_fscore'``.
    """
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy(
            gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy()

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('starting clustering: found %s document and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    # With rd=True, to_tree returns (root, node_list); only the list is used.
    _, nodes = hr.to_tree(linkage_matrix, rd=True)

    # Every non-leaf node is a candidate cluster, keyed by its node id.
    clusters = {}
    for node in nodes:
        if not node.is_leaf():
            clusters[node.get_id()] = collect_leaf_nodes(node, [])

    f = f_score(clusters, f_score_dict)
    all_scores = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = all_scores

    print('average f_score: %s' % params['avg_f_score'])
    return params
def get_hier_tree(self, method='single'):
    """Return a tree describing the hierarchical clustering order of the
    currently set genes.

    Recalculates first when the instance is flagged as needing it, then
    converts the linkage obtained from get_linkage with
    scipy.cluster.hierarchy.to_tree.

    | Args:
    |   method (str): clustering method to employ. Valid entries are
    |                 'single', 'complete', 'weighted' and 'average'.
    |                 Refer to Scipy documentation for further details.

    | Returns:
    |   root_node (ClusterNode): the root node of the tree. Child nodes are
    |                            reachable through .left and .right, and .id
    |                            holds the number of the corresponding
    |                            cluster. Refer to Scipy documentation for
    |                            further details.
    """
    if self._needs_recalc:
        self._recalc()
    return hierarchy.to_tree(self.get_linkage(method=method))
def guide_tree_from_sequences(sequences, distance_fn=kmer_distance, display_tree=False):
    """ Build a UPGMA tree by applying distance_fn to sequences

    Parameters
    ----------
    sequences : skbio.SequenceCollection
      The sequences to be represented in the resulting guide tree.
    distance_fn : function
      Function handed to ``sequences.distances`` to compute the pairwise
      distances between the sequences.
    display_tree : bool, optional
      Draw a dendrogram of the tree before returning.

    Returns
    -------
    The tree object produced by ``to_tree`` from the average-linkage matrix.
    NOTE(review): if ``to_tree`` is scipy's, this is a
    scipy.cluster.hierarchy.ClusterNode rather than an skbio.TreeNode -
    confirm against the file's imports.

    """
    guide_dm = sequences.distances(distance_fn)
    # Average linkage on the condensed distance matrix (UPGMA).
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        # Called for its drawing side effect only; the returned dict is unused.
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return guide_tree
def average_dendogram(similarity_matrix, book_names):
    """Cluster with average linkage; return flat assignments and the hierarchy.

    ``book_names`` is accepted but not used by this function.

    Returns a two-element list: the flat cluster assignments from ``fcluster``
    (threshold 3, depth 5) and the result of ``get_clusters_with_hierarchy``
    applied to the tree built from the same linkage matrix.
    """
    # Linkage matrix from average linkage over the pre-computed matrix
    linkage_matrix = average(similarity_matrix)

    flat_assignments = fcluster(linkage_matrix, 3, depth=5)
    hierarchy_clusters = get_clusters_with_hierarchy(to_tree(linkage_matrix))

    return [flat_assignments, hierarchy_clusters]
def _largest_clusters(root, num_clusters):
    """Split the tree top-down, widest merge first, into up to num_clusters subtrees.

    Returns the list of selected nodes in heap order.  Fewer than
    ``num_clusters`` nodes are returned when only leaves remain to be split.
    """
    # Heap entries are (is_leaf, -dist, sequence, node):
    #  * is_leaf keeps leaves behind every internal node, so a leaf on top
    #    means nothing splittable is left;
    #  * -dist pops the largest merge distance first without dividing by a
    #    leaf's zero distance;
    #  * sequence breaks ties so nodes themselves are never compared.
    heap = [(root.is_leaf(), -root.dist, 0, root)]
    pushed = 1
    while len(heap) < num_clusters:
        if heap[0][0]:
            break
        current = heapq.heappop(heap)[3]
        for child in (current.left, current.right):
            heapq.heappush(heap, (child.is_leaf(), -child.dist, pushed, child))
            pushed += 1
    return [entry[3] for entry in heap]


def main():
    """Cluster the generated data, show the dendrogram, and export categories.

    Saves the labels to ``../labels``, plots the ward-linkage dendrogram,
    asks on stdin how many clusters to cut the tree into, and then creates
    (or recreates) one ``cat<N>/`` directory per selected cluster, filling
    it through ``create_category``.
    """
    distance_matrix, labels = data.gen_distance_matrix()
    np.save('../labels', labels)

    linkage_matrix = linkage(distance_matrix, 'ward')
    plt.figure(figsize=(25, 10)).subplots_adjust(bottom=0.25)
    dendrogram(
        linkage_matrix,
        leaf_rotation=90.,
        leaf_font_size=16.,
        labels=labels,
    )
    plt.show()

    root = to_tree(linkage_matrix)
    # input() returns a string on Python 3; convert explicitly so the
    # comparison against the heap size is numeric.
    num_clusters = int(input("number of clusters: "))

    for num_cats, cluster in enumerate(_largest_clusters(root, num_clusters)):
        dir_name = "cat" + str(num_cats) + "/"
        # Start every category from an empty directory.
        if os.path.exists(dir_name):
            rmtree(dir_name)
        os.mkdir(dir_name)
        create_category(linkage_matrix, cluster, labels,
                        src="generatedpictures", dest=dir_name)
def test_iris_subtree_pre_order(self):
    # Tests that pre_order() works when called on sub-trees: the root's
    # pre-order must be the left sub-tree's followed by the right sub-tree's.
    X = eo['iris']
    # linkage() is given the observations directly, so no separate
    # pairwise-distance computation is needed here.
    Z = linkage(X, 'single')
    node = to_tree(Z)
    assert_equal(node.pre_order(), (node.get_left().pre_order()
                                    + node.get_right().pre_order()))
def plot_leaf_ordering(X, method, metric):
    """Show X as a heat map with rows and columns re-ordered by clustering.

    Rows (``X``) and columns (``X.T``) are clustered separately with
    ``hierarchy.linkage``; each tree is then passed through
    ``optimal_scores`` / ``order_tree`` (defined elsewhere in this file) and
    the resulting pre-order of the leaves is used to index ``X``.  The figure
    shows the column dendrogram on top, the row dendrogram on the left and
    the re-ordered matrix in the main panel.

    Args:
        X: 2-D array; it is indexed with ``np.ix_`` and transposed.
        method: linkage method forwarded to ``hierarchy.linkage``.
        metric: distance metric forwarded to ``pdist`` and ``linkage``.
    """
    # Square distance matrices for rows and columns.
    dists = distance.squareform(distance.pdist(X, metric=metric))
    dists2 = distance.squareform(distance.pdist(X.T, metric=metric))
    Z = hierarchy.linkage(X, method=method, metric=metric)
    Z2 = hierarchy.linkage(X.T, method=method, metric=metric)

    # rd=True additionally returns the list of all tree nodes.
    t, rd = hierarchy.to_tree(Z, True)
    t2, rd2 = hierarchy.to_tree(Z2, True)

    # presumably order_tree re-orders Z and the nodes in rd in place, since
    # its return value is not used - confirm against order_tree.
    M = optimal_scores(Z, rd, dists)
    order_tree(Z, rd, M)
    M2 = optimal_scores(Z2, rd2, dists2)
    order_tree(Z2, rd2, M2)

    rr = t.pre_order()
    rr2 = t2.pre_order()

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec

    fig = plt.figure(figsize=(8, 8))
    gs = GridSpec(2, 2, top=0.95, bottom=0.05, left=0.05, right=0.95,
                  hspace=0.01, wspace=0.01,
                  width_ratios=(1, 3), height_ratios=(1, 3))
    ax01 = fig.add_subplot(gs[0, 1])
    ax10 = fig.add_subplot(gs[1, 0])
    ax11 = fig.add_subplot(gs[1, 1])

    hierarchy.dendrogram(Z2, ax=ax01)
    ax01.set_axis_off()
    hierarchy.dendrogram(Z, orientation='right', ax=ax10)
    ax10.set_axis_off()

    ax11.matshow(X[np.ix_(rr, rr2)], cmap="Blues", aspect="auto")
    # tick_params expects real booleans; the strings 'off'/'on' are truthy
    # and would leave the ticks switched on.
    ax11.tick_params(top=False, bottom=False, right=False)
    ax11.tick_params(labeltop=False, labelleft=False, labelright=True)

    # Label each cell row/column with its original index.
    ax11.set_xticks(np.arange(len(rr2)))
    ax11.set_xticklabels(rr2, fontsize=5.0)
    ax11.set_yticks(np.arange(len(rr)))
    ax11.set_yticklabels(rr, fontsize=5.0)

    plt.show()
def __init__(self, clustering):
    """Keep a reference to a clustering object and build its tree.

    Copies ``datatype``, ``clustering_axis`` (stored as ``axis``),
    ``clustering``, ``data``, ``data_names`` and ``header`` from the given
    object, converts the stored clustering with ``hcluster.to_tree`` and
    starts with ``dendrogram`` set to False.
    """
    self.cluster_object = clustering

    # (attribute on self, attribute on the clustering object)
    before_tree = (
        ('datatype', 'datatype'),
        ('axis', 'clustering_axis'),
        ('clustering', 'clustering'),
    )
    for target, source in before_tree:
        setattr(self, target, getattr(clustering, source))

    self.tree = hcluster.to_tree(self.clustering)

    for shared in ('data', 'data_names', 'header'):
        setattr(self, shared, getattr(clustering, shared))

    self.dendrogram = False
def get_clustering_as_tree(vectors, linkage=constants.linkage_method_default, distance=constants.distance_metric_default, progress=progress):
    """Cluster `vectors` hierarchically and return the root of the resulting tree.

    The distance/linkage combination is first passed to
    `is_distance_and_linkage_compatible`; both steps are announced through
    `progress.update`.
    """
    is_distance_and_linkage_compatible(distance, linkage)

    clustering_msg = 'Clustering data with "%s" linkage using "%s" distance' % (linkage, distance)
    progress.update(clustering_msg)
    linkage_matrix = hierarchy.linkage(vectors, metric=distance, method=linkage)

    progress.update('Recovering the tree from the clustering result')
    return hierarchy.to_tree(linkage_matrix, rd=False)
def test_node_compare():
    # Checks the >, == and != comparisons between the root of a ward tree
    # and its two children, on seeded random data.
    np.random.seed(23)
    nobs = 50
    X = np.random.randn(nobs, 4)
    root = to_tree(scipy.cluster.hierarchy.ward(X))
    left_child = root.get_left()
    right_child = root.get_right()
    assert_(root > left_child)
    assert_(right_child > left_child)
    assert_(right_child == root.get_right())
    assert_(right_child != left_child)
def mat2tree(mat, nodeNames=None, dosvg=True):
    """Convert a linkage matrix into an ete2 tree and, optionally, its SVG rendering.

    Based on
    http://stackoverflow.com/questions/9364609/converting-ndarray-generated-by-hcluster-into-a-newick-string-for-use-with-ete2

    Args:
        mat: linkage matrix accepted by ``scipy.cluster.hierarchy.to_tree``.
        nodeNames: optional sequence of names; a node whose id is a valid
            index into it is named ``nodeNames[id]``, otherwise ``str(id)``.
        dosvg: when true, render the tree to a temporary ``.svg`` file in
            ``TMP_DIR`` and return its content.

    Returns:
        ``(root, svg)`` - the ete2 root node and the SVG text (``""`` when
        ``dosvg`` is false).

    Raises:
        ImportError: if ``ete2`` is not installed.
    """
    import scipy.cluster.hierarchy as hier
    # Imported directly: swallowing a failed import here only postponed the
    # failure to the first Tree() call, with a far less helpful error.
    from ete2 import Tree

    T = hier.to_tree(mat)
    root = Tree()
    root.dist = 0
    root.name = 'root'
    item2node = {T: root}

    # Depth-first walk that mirrors every scipy node as an ete2 node.
    to_visit = [T]
    while to_visit:
        node = to_visit.pop()
        # Each child hangs at half the merge distance of its parent.
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist
                ch.name = str(ch_node.id)

                if nodeNames:
                    if ch_node.id < len(nodeNames):
                        ch.name = nodeNames[ch_node.id]

                item2node[ch_node] = ch
                item2node[node].add_child(ch)
                to_visit.append(ch_node)

    svg = ""
    if dosvg:
        fd, fnm = tempfile.mkstemp(suffix=".svg",
                                   prefix=os.path.basename(sys.argv[0]) + '_tmp_',
                                   text=True, dir=TMP_DIR)
        # mkstemp hands back an open descriptor; close it so it is not
        # leaked - the file is re-opened by name below.
        os.close(fd)

        if os.path.exists(fnm):
            print(fnm)

        try:
            root.render(fnm)
            with open(fnm, 'r') as fhd:
                svg = fhd.read()
        finally:
            # Remove the temporary file even when rendering or reading fails.
            os.remove(fnm)

    return (root, svg)
def __init__(self,clust,video,thumbsize=(60,60)):
    """
    :param clust: clustering result; its ``keys`` attribute is stored as the
                  key frame ids and its ``cluster`` attribute is converted
                  to a tree with ``sch.to_tree``
    :param video: video object
    :type: video
    :param thumbsize: stored as-is on the instance (default ``(60,60)``)
    :type: tuple
    """
    # Plain state that does not depend on the clustering.
    self.video = video
    self.thumbsize = thumbsize
    self.w = 0
    self.key_frames = []
    # State derived from the clustering object.
    self.key_frames_id = clust.keys
    self.cluster = sch.to_tree(clust.cluster)
def make_tree_json(row_clusters, df_by_gene):
    """Write the row clustering as ``d3-dendrogram.json`` under ``path_to_file``.

    Args:
        row_clusters: linkage matrix accepted by ``to_tree``.
        df_by_gene: object whose ``index`` provides the leaf labels, in leaf
            id order.

    Returns:
        The module-level name ``cc`` (see the note at the return statement).
    """
    T = to_tree(row_clusters)

    # Create dictionary for labeling nodes by their IDs
    labels = list(df_by_gene.index)
    id2name = dict(zip(range(len(labels)), labels))

    # Initialize nested dictionary for d3, then recursively iterate through tree
    d3Dendro = dict(children=[], name="Root1")
    add_node(T, d3Dendro)
    label_tree(d3Dendro["children"][0], id2name)

    # Output to JSON; the context manager closes the handle deterministically.
    with open(os.path.join(path_to_file, "d3-dendrogram.json"), "w") as json_file:
        json.dump(d3Dendro, json_file, sort_keys=True, indent=4)

    # NOTE(review): ``cc`` is not assigned anywhere in this function; this
    # only works if it exists at module level - confirm the intended value.
    return cc
def HierarchicalClustering(SP):
    """Cluster stay points hierarchically, show the dendrogram, return the tree.

    Args:
        SP: sequence of stay points; each must offer ``Haversine(other)``
            returning the distance to another stay point.

    Returns:
        The root node produced by ``HC.to_tree`` for the linkage of ``SP``.
    """
    print("Start HierarchicalClustering")
    n = len(SP)
    print("number of Stay points = %d" % n)

    # Condensed distance vector: one entry per unordered pair (i, j), i < j,
    # in row-major order - n*(n-1)/2 values in total.
    D = [SP[i].Haversine(SP[j]) for i in range(n) for j in range(i + 1, n)]

    Z = HC.linkage(D)
    HC.dendrogram(Z, 100, 'level')
    plt.show()
    print("End HierarchicalClustering")
    return HC.to_tree(Z)
def linkage_to_newick(matrix: np.ndarray, labels: List[str]):
    """Render a linkage matrix as a newick formatted tree.

    Within every pair of siblings the right child is written before the left
    one, and each branch length is half the difference between the parent's
    and the node's own distance.

    :param matrix: The linkage matrix.
    :param labels: Names of the leaves, indexed by leaf id.
    :return: The newick representation of the linkage matrix.
    """
    # Convert the linkage matrix to a ClusterNode object.
    root = to_tree(matrix, False)

    def _subtree(node: ClusterNode, parent_dist: float) -> str:
        """Return the newick text of `node`, including its branch length.

        :param node: The tree node being converted.
        :param parent_dist: The distance stored on the parent node.
        :return: The newick fragment for the node and everything below it.
        """
        if node.is_leaf():
            return f"{labels[node.id]}:{(parent_dist - node.dist) / 2}"
        left_text = _subtree(node.get_left(), node.dist)
        right_text = _subtree(node.get_right(), node.dist)
        return f"({right_text},{left_text}):{(parent_dist - node.dist) / 2}"

    # A lone leaf is rendered on its own, with a zero-length branch.
    if root.is_leaf():
        return _subtree(root, root.dist)

    # The root carries no branch length; it only closes the tree.
    left_text = _subtree(root.get_left(), root.dist)
    right_text = _subtree(root.get_right(), root.dist)
    return f"({right_text},{left_text});"
def ete_tree(self, labels=None):
    """Return this clustering as an ete tree (ete2 on Python 2, ete3 on Python 3).

    Args:
        labels: optional sequence indexed by leaf id; when given, each leaf
            is named ``str(labels[id])`` instead of ``str(id)``.  Any
            sequence type works, including numpy arrays.

    Returns:
        The root ete ``Tree`` node, with one shared node style applied to
        every node.

    Raises:
        ValueError: if the running Python major version is neither 2 nor 3.
    """
    if sys.version_info[0] == 2:
        from ete2 import Tree, NodeStyle
    elif sys.version_info[0] == 3:
        from ete3 import Tree, NodeStyle
    else:
        raise ValueError('Your version of Python is not supported.')

    from scipy.cluster.hierarchy import to_tree

    T = to_tree(self.to_linkage_matrix())
    root = Tree()
    root.dist = 0
    root.name = "root"
    item2node = {T: root}

    # Depth-first walk that mirrors every scipy node as an ete node.
    to_visit = [T]
    while to_visit:
        node = to_visit.pop()
        # Each child hangs at half the merge distance of its parent.
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist
                ch.name = str(ch_node.id)
                item2node[node].add_child(ch)
                item2node[ch_node] = ch
                to_visit.append(ch_node)

    # Identity test: "!= None" is evaluated element-wise on numpy arrays and
    # then fails when used as a condition.
    if labels is not None:
        for leaf in root.get_leaves():
            leaf.name = str(labels[int(leaf.name)])

    # Node markers are suppressed: no shape, zero size
    nstyle = NodeStyle()
    nstyle["shape"] = None
    nstyle["size"] = 0

    # Gray dashed branch lines
    nstyle["hz_line_type"] = 1
    nstyle["hz_line_color"] = "#cccccc"

    # Applies the same static style to all nodes in the tree. Note that,
    # if "nstyle" is modified, changes will affect to all nodes
    for n in root.traverse():
        n.set_style(nstyle)

    return root
def optimal_ordering(Z, dists):
    """Return the leaf order of the tree for Z after score-based re-ordering.

    Z is the linkage matrix and dists the distance matrix.  The scores come
    from ``optimal_scores`` and the re-ordering is done by ``order_tree``;
    the result is the pre-order traversal of the tree root afterwards.
    """
    # Root of the tree plus the node list that to_tree returns with rd=True
    root, node_list = hierarchy.to_tree(Z, True)

    # Score the tree, then re-order it according to those scores
    scores = optimal_scores(Z, node_list, dists)
    order_tree(Z, node_list, scores)

    # The new leaf ordering
    return root.pre_order()