Example #1
def make_tree(X, C, method='single'):
    if method == 'single':
        tree = to_tree(single(C))
    elif method == 'ward':
        tree = to_tree(ward(X))
    elif method == 'average':
        tree = to_tree(average(C))
    return Tree(root=construct_node(tree))
Example #3
File: loader.py Project: biobakery/halla
 def load_linkages(self):
     if not isfile(join(self.input_dir, 'X_linkage.npy')):
         self.name = 'AllA'
         return
     self.name = 'HAllA'
     self.X_linkage = np.load(join(self.input_dir, 'X_linkage.npy'))
     self.Y_linkage = np.load(join(self.input_dir, 'Y_linkage.npy'))
     self.X_tree = sch.to_tree(self.X_linkage)
     self.Y_tree = sch.to_tree(self.Y_linkage)
def plot_leaf_ordering(X, method, metric):
    dists = distance.squareform(distance.pdist(X, metric=metric))
    dists2 = distance.squareform(distance.pdist(X.T, metric=metric))

    Z = hierarchy.linkage(X, method=method, metric=metric)
    Z2 = hierarchy.linkage(X.T, method=method, metric=metric)

    t, rd = hierarchy.to_tree(Z, True)
    t2, rd2 = hierarchy.to_tree(Z2, True)

    M = optimal_scores(Z, rd, dists)
    order_tree(Z, rd, M)
    M2 = optimal_scores(Z2, rd2, dists2)
    order_tree(Z2, rd2, M2)

    rr = t.pre_order()
    rr2 = t2.pre_order()

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec

    fig = plt.figure(figsize=(8, 8))
    gs = GridSpec(2,
                  2,
                  top=0.95,
                  bottom=0.05,
                  left=0.05,
                  right=0.95,
                  hspace=0.01,
                  wspace=0.01,
                  width_ratios=(1, 3),
                  height_ratios=(1, 3))

    ax01 = fig.add_subplot(gs[0, 1])
    ax10 = fig.add_subplot(gs[1, 0])
    ax11 = fig.add_subplot(gs[1, 1])

    hierarchy.dendrogram(Z2, ax=ax01)
    ax01.set_axis_off()
    hierarchy.dendrogram(Z, orientation='right', ax=ax10)
    ax10.set_axis_off()

    ax11.matshow(X[np.ix_(rr, rr2)], cmap="Blues", aspect="auto")
    ax11.tick_params(**{s: 'off' for s in ('top', 'bottom', 'right')})
    ax11.tick_params(labeltop='off', labelleft='off', labelright='on')

    ax11.set_xticks(np.arange(len(rr2)))
    ax11.set_xticklabels(rr2, fontsize=5.0)
    ax11.set_yticks(np.arange(len(rr)))
    ax11.set_yticklabels(rr, fontsize=5.0)

    plt.show()
Example #5
def classify_by_scores(M, threshold, loci, return_file_names=None):

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        if i >= len(leaf_ids)-1:
            break
        cnt += 1

    clusters = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)

    if return_file_names:

        clusters_fn = []

        for cluster in clusters:

            clusters_fn.append([os.path.basename(loci[i].file_name) for i in cluster])

        singles_fn = [ os.path.basename(loci[i].file_name) for i in singles]

        return singles_fn, clusters_fn

    else:

        return singles, clusters
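The while-loop in classify_by_scores is effectively cutting the dendrogram at `threshold` and collecting the leaves under each surviving subtree. A roughly equivalent flat cut can be written with scipy's built-in fcluster; the sketch below is illustrative only and is not part of the project the example comes from.

import numpy as np
from collections import defaultdict
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

points = np.random.rand(8, 2)
Z = linkage(pdist(points), method='average')

threshold = 0.5
labels = fcluster(Z, t=threshold, criterion='distance')  # flat cut at the threshold

groups = defaultdict(list)
for leaf_id, label in enumerate(labels):
    groups[label].append(leaf_id)

clusters = sorted((g for g in groups.values() if len(g) > 1), key=len, reverse=True)
singles = [g[0] for g in groups.values() if len(g) == 1]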
Example #6
def linkage_to_newick(dataframe, output_file):
    """
    Thanks to https://github.com/biocore/scikit-bio/issues/1579
    Input :  dataframe with a 'Position' column and one column per sample
    Output:  Newick formatted tree string (also written to output_file)
    """
    dataframe_only_samples = dataframe.set_index(dataframe['Position'].astype(int)).drop(['Position','N','Samples'], axis=1) # use 'Position' as the index and drop the non-sample columns
    labelList = dataframe_only_samples.columns.tolist()
    Z = shc.linkage(dataframe_only_samples.T, method='average')

    tree = shc.to_tree(Z, False)
    def buildNewick(node, newick, parentdist, leaf_names):
        if node.is_leaf():
            #print("%s:%f%s" % (leaf_names[node.id], parentdist - node.dist, newick))
            return "%s:%f%s" % (leaf_names[node.id], parentdist - node.dist, newick)
        else:
            if len(newick) > 0:
                newick = f"):{(parentdist - node.dist)/2}{newick}"
            else:
                newick = ");"
            newick = buildNewick(node.get_left(), newick, node.dist, leaf_names)
            newick = buildNewick(node.get_right(), ",%s" % (newick), node.dist, leaf_names)
            newick = "(%s" % (newick)
            #print(newick)
            return newick

    with open(output_file, 'w') as f:
        f.write(buildNewick(tree, "", tree.dist, labelList))
    return buildNewick(tree, "", tree.dist, labelList)
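A hypothetical call of linkage_to_newick above; the column layout ('Position', 'N', 'Samples', plus one column per sample) is inferred from the code, and `shc` is assumed to be scipy.cluster.hierarchy imported at module level.

import pandas as pd

df = pd.DataFrame({
    'Position': [1, 2, 3],
    'N': [0, 0, 0],
    'Samples': [3, 3, 3],
    'sampleA': [0, 1, 1],
    'sampleB': [0, 0, 1],
    'sampleC': [1, 1, 0],
})
newick = linkage_to_newick(df, 'tree.nwk')  # writes tree.nwk and returns the Newick string
print(newick)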
Example #7
def mkClustaloNewickTree(fastaFileName, verbose=False):
    """
    make a newick tree using sequences in a fasta file.
    first, use mkDistanceMatrix to compute a distance matrix between the
    sequences, then use the scipy.cluster module to compute a tree.
    This tree is then converted into a newick string with the function
    getNewick
    input:
    - fastaFileName -- file with sequences
    - verbose -- print messages
    output:
    - newick_str -- the newick string representing the tree
    """
    ## get the distance matrix
    distmat, distmat_header = mkClustaloDistanceMatrix(fastaFileName, verbose=verbose)
    flat_distmat = aux.mkFlatDistMat(distmat)
    ## use scipy.cluster.hierarchy to make a dendrogram
    Z = sclush.linkage(flat_distmat, method='average') ## UPGMA, cf. MhcCluster
    ## scale Z such that depth is 1
    Z = scaleHClus(Z)
    ## optionally plot the dendrogram
    #fig, (ax1) = plt.subplots(1, 1, figsize=(20,5))
    #sclush.dendrogram(Z, orientation='top', labels=distmat_header, ax=ax1)
    #plt.setp(ax1.get_xticklabels(), rotation=90)
    tree = sclush.to_tree(Z, False)
    newick_str = getNewick(tree, "", tree.dist, distmat_header)
    return newick_str
Example #8
def cluster_dandelion_2(dataset, gamma=0.91, filter=False):
    # duplicate, needed only to return the linkage_matrix
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_dandelion(
            gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_dandelion()

    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))

    tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    #linkage_matrix = hr.average(tfidf_matrix.toarray())
    linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    return linkage_matrix
Example #9
def CreateHeiarchicalTree(cluster_set, params_set, interval, print_outs=False):
    L = len(cluster_set)
    if L > 1:
        Dmatrix = zeros((L, L))
        leaf_index_cluster_dict = {}
        if print_outs:
            print "calculating function distances"
        for i in range(L):
            p1 = params_set[i]
            c1 = cluster_set[i]

            leaf_index_cluster_dict[i] = (c1, p1)
            for j in range(L):

                p2 = params_set[j]
                Dmatrix[i, j] = Cdist(p1, p2, interval)

        if print_outs:
            print "converting to square form"
        Dmatrix_c = squareform(Dmatrix)
        if print_outs:
            print "hclustering"
        linkageMatrix = hcluster.linkage(Dmatrix_c)
        if print_outs:
            print "creating tree"
        Root, node_list = to_tree(linkageMatrix, rd=True)
    else:
        raise Exception("Cannot generate a tree from a single cluster")
    return Root, node_list, leaf_index_cluster_dict
Example #10
def plot_dendrogram(Z, dendogram_file_name):

    root = to_tree(Z)
    threshold = root.dist / 3.0
    all_leaves = get_leaves(root)

    plt.figure(figsize=(30, 30))
    title = 'Hierarchical Clustering Dendrogram( %d leaves)' % len(all_leaves)
    xlabel = 'loci'
    ylabel = 'distance'

    fancy_dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=4.,  # font size for the x axis labels
        annotate_above=10,
        max_d=threshold,
        title=title,
        xlabel=xlabel,
        ylabel=ylabel)

    # plt.savefig(os.path.join(report_path, 'dendrogram_distance_array.eps'), format='eps', dpi=900)
    if dendogram_file_name.endswith('pdf'):
        plt.savefig(dendogram_file_name, format='pdf')
    elif dendogram_file_name.endswith('png'):
        plt.savefig(dendogram_file_name, format='png')
    else:
        raise NotImplementedError('File format has to be either png or pdf')

    plt.close()
    return threshold
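fancy_dendrogram is a helper that is not shown here; as an illustrative alternative (not the project's code), scipy's plain dendrogram can visualise the same cut by colouring links below the threshold.

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, to_tree

Z = linkage(np.random.rand(20, 3), method='ward')
threshold = to_tree(Z).dist / 3.0                     # same heuristic as plot_dendrogram

dendrogram(Z, color_threshold=threshold, leaf_font_size=8)
plt.axhline(threshold, linestyle='--', linewidth=1)   # mark the cut height
plt.ylabel('distance')
plt.savefig('dendrogram.png', format='png')
plt.close()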
Example #11
def reconstruct_scipy_linkage(df_mutation_table, path_out_newick):

    from scipy.cluster import hierarchy

    # get newick without distance
    def getNewick(node, newick, parentdist, leaf_names):
        if node.is_leaf():
            return "%s%s" % (leaf_names[node.id], newick)
        else:
            if len(newick) > 0:
                newick = ")%s" % (newick)
            else:
                newick = ");"
            newick = getNewick(node.get_left(), newick, node.dist, leaf_names)
            newick = getNewick(node.get_right(), ",%s" % (newick), node.dist,
                               leaf_names)
            newick = "(%s" % (newick)
            return newick

    tdf = df_mutation_table.drop(columns='root').transpose()
    link = scipy.cluster.hierarchy.linkage(tdf)
    tree = hierarchy.to_tree(link, False)
    newick_str = getNewick(tree, "", tree.dist, tdf.index)

    with open(path_out_newick, 'wt') as fout:
        fout.write(newick_str)
        fout.write('\n')

    return tree
Example #12
    def _process_block():
        """Initialize nested dictionary for d3, then recursively iterate through tree and create the dict."""
        tree = to_tree(linkage, rd=False)

        _add_node(tree, bcluster_dendro)
        _label_tree(bcluster_dendro["children"][-1])  # get the last element
        return bcluster_dendro
Example #13
def ward_dynamicTreeCut(rmsd_mat, tau=5):
    n = rmsd_mat.shape[0]
    dend = get_ward_dendrogram(rmsd_mat)
    tree = to_tree(dend)
    breakpoints = dynamicTreeCut(tree, n, tau)
    comm_assing = report_assingments(breakpoints, tree)
    return comm_assing
Example #14
    def __get_column_dendrogram__(self):
        # root and nodes hold the column-clustered data
        root, nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True)
        # node_id2node maps a node id to a dict describing that node
        node_id2node = {}
        # the dendrogram is a graph keyed by node id, one entry per node
        dendrogram = {"nodes":{}}

        # iterate through all nodes
        for node in nodes:
            print("id is:", node.id)
            node_id = node.id
            # if node is a leaf node
            if node.count == 1:
                node_id2node[node_id] = {"count":1, "distance":0}
            else:
                # record the ids of the left and right children
                node_left_child = node.get_left().id
                node_right_child = node.get_right().id
                node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child}

        # assign each child's parent id
        for n in node_id2node:
            node = node_id2node[n]
            if node["count"] != 1:
                node_id2node[node["left_child"]]["parent"] = n
                node_id2node[node["right_child"]]["parent"] = n

        # add any node not yet present in the dendrogram
        for n in node_id2node:
            if n not in dendrogram["nodes"]:
                dendrogram["nodes"][n] = node_id2node[n]

        return dendrogram
Example #15
def hierarchical_clustering_to_dendrogram(clustering):
    """Converts an array representing a clustering to a dendrogram.

    Args:
        clustering (ndarray): A hierarchical clustering matrix, in the form
            returned by scipy.cluster.hierarchy.linkage.

    Returns:
        (networkx.DiGraph): A dendrogram. Each node in the dendrogram has the
        'distance' attribute, which is the threshold at which its children
        are merged in the clustering.
    """
    root = _hierarchy.to_tree(clustering)

    tree = _nx.DiGraph()
    tree.add_node(root.id, distance=root.dist)

    queue = []
    if root.left:
        queue = [(root, root.left), (root, root.right)]

    while queue:
        parent, child = queue.pop(0)

        tree.add_edge(parent.id, child.id)
        tree.node[child.id]['distance'] = float(child.dist)

        if child.left:
            queue.append((child, child.left))

        if child.right:
            queue.append((child, child.right))

    return tree
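A hypothetical usage of hierarchical_clustering_to_dendrogram, assuming `_hierarchy` and `_nx` are scipy.cluster.hierarchy and networkx; note the function uses the legacy DiGraph.node accessor, so it expects a networkx version older than 2.4.

import numpy as np
from scipy.cluster import hierarchy

Z = hierarchy.linkage(np.random.rand(6, 4), method='average')
dendro = hierarchical_clustering_to_dendrogram(Z)

root_id = max(dendro.nodes)                    # internal ids are larger than leaf ids
print(dendro.nodes[root_id]['distance'])       # merge height at the root
print(list(dendro.successors(root_id)))        # the two subtrees merged last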
Example #16
  def to_dict(self, correlation_matrix, linkage_matrix):

    from scipy.cluster import hierarchy
    tree = hierarchy.to_tree(linkage_matrix, rd=False)
    leaves_list = hierarchy.leaves_list(linkage_matrix)

    d = {}

    # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
    # https://gist.github.com/mdml/7537455

    def add_node(node):
      if node.is_leaf(): return
      cluster_id = node.get_id() - len(linkage_matrix) - 1
      row = linkage_matrix[cluster_id]
      d[cluster_id+1] = {
        'datasets': [i+1 for i in sorted(node.pre_order())],
        'height': row[2],
      }

      # Recursively add the current node's children
      if node.left: add_node(node.left)
      if node.right: add_node(node.right)

    add_node(tree)

    return d
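The `node.get_id() - len(linkage_matrix) - 1` line relies on scipy's id convention: to_tree() gives the n original observations ids 0..n-1 and the internal nodes ids n..2n-2, where internal node id i corresponds to row i - n of the linkage matrix. A small self-contained check:

import numpy as np
from scipy.cluster import hierarchy

X = np.random.rand(5, 3)                     # n = 5 observations
Z = hierarchy.linkage(X, method='average')   # Z has n - 1 = 4 rows
root = hierarchy.to_tree(Z)

n = len(Z) + 1
assert root.get_id() == 2 * n - 2            # the root is the last merge
row = root.get_id() - n                      # equivalently get_id() - len(Z) - 1
assert Z[row, 2] == root.dist                # column 2 of that row is the merge height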
Example #17
def plot_dendrogram(Z, dendogram_file_name):

    root = to_tree(Z)
    threshold = root.dist / 3.0
    all_leaves = get_leaves(root)

    plt.figure(figsize=(30, 30))
    title = 'Hierarchical Clustering Dendrogram( %d leaves)' % len(all_leaves)
    xlabel = 'loci'
    ylabel = 'distance'

    fancy_dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=4.,  # font size for the x axis labels
        annotate_above=10,
        max_d=threshold,
        title=title,
        xlabel=xlabel,
        ylabel=ylabel
    )

    # plt.savefig(os.path.join(report_path, 'dendrogram_distance_array.eps'), format='eps', dpi=900)
    if dendogram_file_name.endswith('pdf'):
        plt.savefig(dendogram_file_name, format='pdf')
    elif dendogram_file_name.endswith('png'):
        plt.savefig(dendogram_file_name, format='png')
    else:
        raise NotImplementedError('File format has to be either png or pdf')

    plt.close()
    return threshold
Example #18
    def to_dict(self, linkage_matrix):

        from scipy.cluster import hierarchy
        tree = hierarchy.to_tree(linkage_matrix, rd=False)
        leaves_list = hierarchy.leaves_list(linkage_matrix)

        d = {}

        # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
        # https://gist.github.com/mdml/7537455

        def add_node(node):
            if node.is_leaf(): return
            cluster_id = node.get_id() - len(linkage_matrix) - 1
            row = linkage_matrix[cluster_id]
            d[cluster_id + 1] = {
                'datasets': [i + 1 for i in sorted(node.pre_order())],
                'height': row[2],
            }

            # Recursively add the current node's children
            if node.left: add_node(node.left)
            if node.right: add_node(node.right)

        add_node(tree)

        return d
Example #19
def optMDL(df):

    Z = getDist(df)
    tree = sc.to_tree(Z, rd=True)[1]
    minMDL = 1000000
    optK = 0
    desLength = 0
    DList = []
    for n_cluster in range(1, 11, 1):  #range(df.shape[0]+1)
        N = fcluster(Z, n_cluster, criterion='maxclust')
        L, M = sc.leaders(Z, N)
        leaders = list(L)
        print(leaders)
        leafDict = {}

        for node in tree:
            if node.get_id() in leaders:
                key = node.get_id()

                if node.get_count() > 1:

                    dist = getleafdict(node)
                else:
                    dist = {key: 0}

                leafDict[key] = dist

        desLength = binning(leafDict) + n_cluster * np.log2(df.shape[0])
        DList.append(desLength)

        #if desLength
        if desLength < minMDL:
            minMDL = desLength
            optK = n_cluster
    return optK, minMDL, DList
Example #20
    def test_mirac_wrong_args(self):
        x = np.zeros((10, 10))
        # wrong min_cl_n
        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', min_cl_n=-0.1)

        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', min_cl_n=-0.1)
        # wrong cl_mdl_scale_factor
        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', cl_mdl_scale_factor=-0.1)
        # wrong encode type
        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', encode_type='1')

        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', encode_type=1)

        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', dim_reduct_method='NONN')

        # hac tree n_leaves different from n_samples
        z = sch.linkage([[0], [5], [6], [8], [9], [12]],
                        method='single', optimal_ordering=True)
        hct = eda.HClustTree(sch.to_tree(z))
        with pytest.raises(ValueError) as excinfo:
            cluster.MIRAC(x, metric='euclidean', hac_tree=hct)
Example #21
def get_motiftrees(motifs,
                   buckettable,
                   method="ward",
                   metric="euclidean",
                   outputdir="MotifTree/"):

    os.mkdir(outputdir)

    # select only features present in tree
    bt_sel = buckettable[buckettable['#OTU ID'].isin(
        list(set(buckettable['#OTU ID']) & set(motifs['scans'])))]
    bt_sel.to_csv(outputdir + 'Buckettable_Motifs.tsv', sep='\t', index=False)

    motifs = motifs[pd.notnull(
        motifs['motif'])]  # remove features, which do not contain any motif
    motifs = motifs.loc[:, (motifs != 0).any(axis=0)]
    motifs.index = motifs['scans']
    motifs = motifs.filter(like='motif_')  # select all motif columns

    Z = scipy.cluster.hierarchy.linkage(motifs, method=method, metric=metric)
    leaf_names = motifs.index  # use the motif 'scans' ids as leaf labels
    tree = hierarchy.to_tree(Z, False)

    f = open(outputdir + 'Tree_Motifs.txt', 'w')
    f.write(getNewick(tree, "", tree.dist, leaf_names))
    f.close()
Example #22
def objectlm_covariance(matrix, savepath, metric="cosine"):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        Z = linkage(matrix, method='average', metric=metric)
        np.save(savepath + "__linkage_average.npy", Z)
    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1 / (sigma**2) * np.eye(
            len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:matrix.shape[0], :matrix.shape[0]]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
    def __init__(self,
                 num_topics,
                 metric='jensenshannon',
                 method='ward',
                 unique_scale=True,
                 topn=None):
        """
        Saves linkage matrix `Z` and `nodelist`
        args:
            num_topics (int): Selects LDA model.
            metric (str): Metric passed to scipy.spatial.distance.pdist 
            method (str): Method passed to scipy.cluster.hierarchy
            unique_scale (bool): Scale word proba by uniqueness
            topn (int, optional): only consider X words (don't use)
        """

        self.num_topics = num_topics
        self.metric = metric
        self.method = method
        self.scale = 200

        folder_path = os.path.join(params().paths['lda'],
                                   'lda_model_' + str(self.num_topics))
        file_path = os.path.join(folder_path, 'trained_lda')
        self.lda_model = gensim.models.LdaMulticore.load(file_path)
        topics = self.lda_model.get_topics()
        if unique_scale:
            topics = topics / (topics.sum(axis=0))
        if topn:
            topics.sort(axis=1)
            topics = np.flip(topics, axis=1)
            topics = topics[:, 0:topn]
        y = pdist(topics, metric=self.metric)
        self.Z = hierarchy.linkage(y, method=self.method)
        rootnode, self.nodelist = hierarchy.to_tree(self.Z, rd=True)
Example #24
 def check_leaves_list_iris(self, method):
     # Tests leaves_list(Z) on the Iris data set
     X = eo['iris']
     Y = pdist(X)
     Z = linkage(X, method)
     node = to_tree(Z)
     assert_equal(node.pre_order(), leaves_list(Z))
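The property this test checks can be demonstrated stand-alone: a pre-order traversal of the tree returned by to_tree() visits the leaves in the same order as leaves_list().

import numpy as np
from scipy.cluster.hierarchy import linkage, to_tree, leaves_list

X = np.random.rand(12, 4)
Z = linkage(X, method='single')
assert to_tree(Z).pre_order() == list(leaves_list(Z))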
Example #25
    def linkage_matrix_to_dict(linkage_matrix):
        tree = hierarchy.to_tree(linkage_matrix, rd=False)

        d = {}

        # http://w3facility.org/question/scipy-dendrogram-to-json-for-d3-js-tree-visualisation/
        # https://gist.github.com/mdml/7537455

        def add_node(node):
            if node.is_leaf():
                return
            cluster_id = node.get_id() - len(linkage_matrix) - 1
            row = linkage_matrix[cluster_id]
            d[cluster_id + 1] = {
                "datasets": [i + 1 for i in sorted(node.pre_order())],
                "height": row[2],
            }

            # Recursively add the current node's children
            if node.left:
                add_node(node.left)
            if node.right:
                add_node(node.right)

        add_node(tree)

        return OrderedDict(sorted(d.items()))
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
      The sequences to be represented in the resulting guide tree.
    metric : function
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = DistanceMatrix.from_iterable(sequences, metric=metric, key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm,
                             labels=guide_dm.ids,
                             orientation='right',
                             link_color_func=lambda x: 'black')
    return guide_tree
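A hypothetical call of guide_tree_from_sequences above; it assumes scikit-bio is installed and that DistanceMatrix, average and to_tree are imported at module level as in the example. Since kmer_distance is not defined here, a toy per-position mismatch metric stands in for it.

from skbio import DNA

def mismatch_distance(s1, s2):
    # naive stand-in metric: fraction of differing positions (equal-length sequences)
    return sum(a != b for a, b in zip(str(s1), str(s2))) / len(s1)

seqs = [DNA('ACCGGT', metadata={'id': 's1'}),
        DNA('ACCGGA', metadata={'id': 's2'}),
        DNA('TCCGGA', metadata={'id': 's3'})]

tree = guide_tree_from_sequences(seqs, metric=mismatch_distance)
print(tree.pre_order())   # leaf ids in guide-tree order (a scipy ClusterNode)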
Example #27
def tfidf_covariance(texts, savepath):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(input=str,
                                     strip_accents='ascii',
                                     analyzer='word',
                                     max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        Z = linkage(y.todense(), method='average', metric='euclidean')
        np.save(savepath + "__linkage_average.npy", Z)

    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1 / (sigma**2) * np.eye(
            len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
    def __init__(self, flat_cluster, cluster = None, curve_list = None):
        from scipy.cluster.hierarchy import to_tree
        from numpy import asarray, sort

        self.flat = flat_cluster # FlatClusters object
        self.co_analysis = self.flat.get_co_analysis() #CoAnalysis object
        self.cluster = cluster #Cluster object

        if not cluster == None:
            self.curve_list = cluster.list_curve_indexes()
        else:
            self.curve_list = curve_list

        self.Z = self.co_analysis.get_hierarchical_cluster()

        root = to_tree(self.Z) # root of entire cluster!
        curves = asarray(self.curve_list) # list of curves in this cluster

        # Get the cluster node that corresponds to the curves in the cluster above
        self.cluster_node = get_cluster_node(root, root.left, root.right, curves)
        self.id = self.cluster_node.get_id()

        # Get the right and left cluster nodes
        self.left = self.cluster_node.left
        self.right = self.cluster_node.right

        # Get the left and right cluster lists
        self.left_list = sort(any_pre_order(root, self.left))
        self.right_list = sort(any_pre_order(root, self.right))
Example #29
def scipy_algo(dataset, abstract=False):
    doc_proc = dp.DocumentsProcessor(dataset)
    tfidf_matrix, f_score_dict = doc_proc.get_data(abstract)

    svd = TruncatedSVD(tfidf_matrix.shape[0])
    lsa = make_pipeline(svd, Normalizer(copy=False))

    #tfidf_matrix = lsa.fit_transform(tfidf_matrix)

    print('starting clustering after lsa: found %s documents and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())
    #linkage_matrix = hr.average(tfidf_matrix)

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    print_f_score_dict(f)

    avg_f_score = average_f_score(f, tfidf_matrix.shape[0])
    print('average f_score: %s' % avg_f_score)
    return avg_f_score
Example #30
def get_clusters_hac(data: DataFrame,
                     dist_metric: str,
                     height: int = None,
                     show_dendrogram: bool = False,
                     show_chart: bool = False):
    """
    Use Hierarchical Agglomerative Clustering (HAC) to cluster phrase vectors

    Returns (list, np.ndarray, int)
    """
    # Create a linkage matrix
    if dist_metric == "cosine":
        dist = 1 - cosine_similarity(list(data.vec))
    else:
        dist = pairwise_distances(list(data.vec), metric=dist_metric)
    linkage_matrix = ward(dist)

    # Maximum cut point height is the height of the tree
    max_h = get_tree_height(hierarchy.to_tree(linkage_matrix)) + 1

    # Use optimal height if no height is specified
    if height is None:
        height = get_optimal_height(data, linkage_matrix, max_h, show_chart)

    cluster_assignments = get_cluster_assignments_hac(linkage_matrix, height)

    # Optionally display the clustering dendrogram
    if show_dendrogram:
        dendrogram(linkage_matrix)
        plt.show()

    return cluster_assignments, linkage_matrix, max_h, height
def guide_tree_from_sequences(sequences,
                              distance_fn=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying distance_fn to sequences

    Parameters
    ----------
    sequences : skbio.SequenceCollection
      The sequences to be represented in the resulting guide tree.
    distance_fn : function
      Function that returns an skbio.DistanceMatrix given an
      skbio.SequenceCollection.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = sequences.distances(distance_fn)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm,
                             labels=guide_dm.ids,
                             orientation='right',
                             link_color_func=lambda x: 'black')
    return guide_tree
Example #32
def tfidf_covariance(texts, savepath):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(input = str,
                                 strip_accents = 'ascii',
                                 analyzer ='word',
                                 max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        Z = linkage(y.todense(), method='average', metric='euclidean')
        np.save(savepath + "__linkage_average.npy", Z)

    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1/(sigma**2) * np.eye(len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
Example #33
    def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True):
        """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can
        cut the dendrogram in a distance to decrease the row size of the heatmap to specified count. 
        When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean).
        When the metadata are nominal (text values) the most frequent is the result after compression.
        By setting write_data to False the data features won't be present in the resulting format."""
        self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)}

        self.compress = compress
        self.compressed_value = compressed_value
        self.compress_cluster_threshold = 0
        if self.compress and self.compress >= 0:
            self.compress_cluster_threshold = self.__get_distance_threshold__(compress)
            print("Distance threshold for compression:", self.compress_cluster_threshold)
            if self.compress_cluster_threshold >= 0:
                self.__compress_data__()
        else:
            self.compress = False

        if self.header and write_data:
            self.dendrogram["data"]["feature_names"] = [h for h in self.header]
        elif self.header and not write_data:
            self.dendrogram["data"]["feature_names"] = []
        
        if self.axis == "both" and len(self.cluster_object.column_clustering):
            column_dendrogram = hcluster.to_tree(self.cluster_object.column_clustering)            
            self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__()
Example #34
    def __get_column_dendrogram__(self):
        root, nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True)
        node_id2node = {}
        dendrogram = {"nodes":{}}

        for node in nodes:
            node_id = node.id
            if node.count == 1:
                node_id2node[node_id] = {"count":1, "distance":0}

            else:
                node_left_child = node.get_left().id
                node_right_child = node.get_right().id
                node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child}

        for n in node_id2node:
            node = node_id2node[n]
            if node["count"] != 1:
                node_id2node[node["left_child"]]["parent"] = n
                node_id2node[node["right_child"]]["parent"] = n

        for n in node_id2node:
             if not n in dendrogram["nodes"]:
                dendrogram["nodes"][n] = node_id2node[n]

        return dendrogram
Example #35
def make_interactive_tree(matrix=None,labels=None):
    '''make interactive tree will return complete html for an interactive tree
    :param title: a title for the plot, if not defined, will be left out.
    '''
    from scipy.cluster.hierarchy import (
        dendrogram, 
        linkage,
        to_tree
    )

    d3 = None
    from scipy.cluster.hierarchy import cophenet
    from scipy.spatial.distance import pdist

    if isinstance(matrix,pandas.DataFrame):
        Z = linkage(matrix, 'ward') # clusters
        T = to_tree(Z, rd=False)

        if labels == None:
            labels = matrix.index.tolist()
        lookup = dict(zip(range(len(labels)), labels))

        # Create a dendrogram object without plotting
        dend = dendrogram(Z,no_plot=True,
                      orientation="right",
                      leaf_rotation=90.,  # rotates the x axis labels
                      leaf_font_size=8.,  # font size for the x axis labels
                      labels=labels)

        d3 = dict(children=[], name="root")
        add_node(T, d3)
        label_tree(d3["children"][0],lookup)
    else:
        bot.warning('Please provide data as pandas Data Frame.')
    return d3
Example #36
    def __get_column_dendrogram__(self):
        root, nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True)
        node_id2node = {}
        dendrogram = {"nodes":{}}

        for node in nodes:
            node_id = node.id
            if node.count == 1:
                node_id2node[node_id] = {"count":1, "distance":0}

            else:
                node_left_child = node.get_left().id
                node_right_child = node.get_right().id
                node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child}

        for n in node_id2node:
            node = node_id2node[n]
            if node["count"] != 1:
                node_id2node[node["left_child"]]["parent"] = n
                node_id2node[node["right_child"]]["parent"] = n

        for n in node_id2node:
             if not n in dendrogram["nodes"]:
                dendrogram["nodes"][n] = node_id2node[n]

        return dendrogram
Example #37
    def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True):
        """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can
        cut the dendrogram in a distance to decrease the row size of the heatmap to specified count. 
        When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean).
        When the metadata are nominal (text values) the most frequent is the result after compression.
        By setting write_data to False the data features won't be present in the resulting format."""
        self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)}

        self.compress = compress
        self.compressed_value = compressed_value
        self.compress_cluster_treshold = 0
        if self.compress and self.compress >= 0:
            self.compress_cluster_treshold = self.__get_distance_treshold__(compress)
            print("Distance treshold for compression:", self.compress_cluster_treshold)
            if self.compress_cluster_treshold >= 0:
                self.__compress_data__()
        else:
            self.compress = False

        if self.header and write_data:
            self.dendrogram["data"]["feature_names"] = [h for h in self.header]
        elif self.header and not write_data:
            self.dendrogram["data"]["feature_names"] = []
        
        if self.axis == "both" and len(self.cluster_object.column_clustering):
            column_dendrogram = hcluster.to_tree(self.cluster_object.column_clustering)            
            self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__()
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree = False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
      The sequences to be represented in the resulting guide tree.
    metric : function
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = DistanceMatrix.from_iterable(
                    sequences, metric=metric, key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
               link_color_func=lambda x: 'black')
    return guide_tree
Example #39
def clusters_to_json(clusters, labels):
    T = hcl.to_tree(clusters, rd=False)
    # Create dictionary for labeling nodes by their IDs
    id2name = dict(zip(range(len(labels)), labels))

    # Initialize nested dictionary for d3, then recursively iterate through tree
    d3Dendro = dict(children=[], name="Root1")
    add_node(T, d3Dendro)

    leafNames_list = []
    sys.setrecursionlimit(15000)
    label_tree(d3Dendro["children"][0], id2name, leafNames_list)

    # Output to JSON
    json.dump(d3Dendro, open(OUT_JSON_FILE, "w"), sort_keys=True, indent=4)

    with open(MODEL_DIR+"\\leafNames_list.txt", 'w') as fp:
        indx = 0
        for line in leafNames_list:

            def join_list(line_list, joined_list):

                for word in line_list:
                    if type(word) == int:
                        # find word
                        join_list(leafNames_list[word], joined_list)
                    else:
                        joined_list.append(str(word))

            joined_list = []
            join_list(line, joined_list)
            fp.write(str(indx)+"\t"+'--'.join(joined_list)+"\n")
            # fp.write(str(indx) + "\t" + "--".join(str(x) for x in line) + "\n")
            indx += 1
Example #40
def cluster_sequences(sequences, minsize=5):
    """
    Cluster the given sequences into groups of similar sequences.

    Return a triple that contains a pandas.DataFrame with the edit distances,
    the linkage result, and a list that maps sequence ids to their cluster id.
    If an entry is zero in that list, it means that the sequence is not part of
    a cluster.
    """
    matrix = distances(sequences)
    linkage = hierarchy.linkage(distance.squareform(matrix), method='average')
    # Linkage columns are:
    # 0, 1: merged clusters, 2: distance, 3: number of nodes in cluster
    inner = inner_nodes(hierarchy.to_tree(linkage))
    prev = linkage[:, 2].max()  # highest distance
    clusters = [0] * len(sequences)
    cl = 1
    for n in inner:
        if n.dist > 0 and prev / n.dist < 0.8 \
                and n.left.count >= minsize and n.right.count >= minsize:
            for id in collect_ids(n.left):
                # Do not overwrite previously assigned ids
                if clusters[id] == 0:
                    clusters[id] = cl
            cl += 1
        prev = n.dist
    # At the end of the above loop, we have not processed the rightmost
    # subtree. In our experiments, it never contains true novel sequences,
    # so we omit it.

    return pd.DataFrame(matrix), linkage, clusters
    def _build_graph(self):
        self.root_ = to_tree(self.z)
        self.root_id_ = str(self.root_.id)
        self.graph_ = nx.DiGraph()
        self.graph_.add_node(self.root_id_)
        for node in self._walk(self.root_):
            label = str(self.labels_dict.get(node.id, node.id))
            self.graph_.nodes[label]['model'] = None
            self.graph_.nodes[label]['flat_classes'] = list(
                map(self.labels_dict.get, node.pre_order()))
            self.graph_.nodes[label]['left'] = None
            self.graph_.nodes[label]['right'] = None
            if node.left:
                label_left = str(
                    self.labels_dict.get(node.left.id, node.left.id))
                self.graph_.add_node(label_left)
                self.graph_.add_edge(label, label_left)
                self.graph_.nodes[label]['left'] = label_left
            if node.right:
                label_right = str(
                    self.labels_dict.get(node.right.id, node.right.id))
                self.graph_.add_node(label_right)
                self.graph_.add_edge(label, label_right)
                self.graph_.nodes[label]['right'] = label_right

        self.classes_ = list(node for node in self.graph_.nodes()
                             if node != self.root_id_)

        self.paths_ = {}
        for class_ in self.classes_:
            self.paths_[class_] = nx.shortest_path(self.graph_, self.root_id_,
                                                   class_)
Example #42
 def test_Q_subtree_pre_order(self):
     # Tests that pre_order() works when called on sub-trees.
     X = hierarchy_test_data.Q_X
     Z = linkage(X, 'single')
     node = to_tree(Z)
     assert_equal(node.pre_order(), (node.get_left().pre_order()
                                     + node.get_right().pre_order()))
Example #43
def cluster_alchemy(dataset, gamma=None, filter=False):
    doc_proc = dp.DocumentsProcessor(dataset)
    if gamma:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy(gamma=gamma, filter=filter)
    else:
        tfidf_matrix, f_score_dict, params = doc_proc.get_data_with_alchemy()

    print('starting clustering: found %s documents and %s features'
          % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))

    linkage_matrix = hr.average(tfidf_matrix.toarray())

    t = hr.to_tree(linkage_matrix, rd=True)

    clusters = {}

    for node in t[1]:
        if not node.is_leaf():
            l = []
            clusters[node.get_id()] = collect_leaf_nodes(node, l)

    f = f_score(clusters, f_score_dict)

    l = print_f_score_dict(f)

    params['avg_f_score'] = average_f_score(f, tfidf_matrix.shape[0])
    params['all_fscore'] = l

    print('average f_score: %s' % params['avg_f_score'])
    return params
Example #44
 def test_Q_subtree_pre_order(self):
     # Tests that pre_order() works when called on sub-trees.
     X = hierarchy_test_data.Q_X
     Z = linkage(X, 'single')
     node = to_tree(Z)
     assert_equal(node.pre_order(), (node.get_left().pre_order()
                                     + node.get_right().pre_order()))
Example #45
    def get_hier_tree(self, method='single'):
        """Get a tree data structure describing the clustering order of based
        on the hierarchical clustering methods and the currently set genes.

        Calls scipy.cluster.hierarchy.to_tree

        | Args:
        |   method (str): clustering method to employ. Valid entries are
        |                 'single', 'complete', 'weighted' and 'average'.
        |                 Refer to Scipy documentation for further details.

        | Returns:
        |   root_node (ClusterNode): the root node of the tree. Access child
        |                            members with .left and .right, while .id
        |                            holds the number of the corresponding
        |                            cluster. Refer to Scipy documentation for
        |                            further details.

        """

        if self._needs_recalc:
            self._recalc()

        Z = self.get_linkage(method=method)
        return hierarchy.to_tree(Z)
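A generic illustration (not tied to the class above) of walking the ClusterNode that hierarchy.to_tree returns: .left/.right hold the children, .id the cluster number, .dist the merge height, and .is_leaf() marks original observations.

import numpy as np
from scipy.cluster import hierarchy

Z = hierarchy.linkage(np.random.rand(6, 3), method='average')
root = hierarchy.to_tree(Z)

def describe(node, depth=0):
    kind = 'leaf' if node.is_leaf() else 'merge'
    print('  ' * depth + 'id=%d dist=%.3f (%s)' % (node.id, node.dist, kind))
    if not node.is_leaf():
        describe(node.left, depth + 1)
        describe(node.right, depth + 1)

describe(root)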
def guide_tree_from_sequences(sequences,
                              distance_fn=kmer_distance,
                              display_tree = False):
    """ Build a UPGMA tree by applying distance_fn to sequences

    Parameters
    ----------
    sequences : skbio.SequenceCollection
      The sequences to be represented in the resulting guide tree.
    distance_fn : function
      Function that returns an skbio.DistanceMatrix given an
      skbio.SequenceCollection.
    display_tree : bool, optional
      Print the tree before returning.

    Returns
    -------
    skbio.TreeNode

    """
    guide_dm = sequences.distances(distance_fn)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
               link_color_func=lambda x: 'black')
    return guide_tree
Example #47
def average_dendogram(similarity_matrix, book_names):
    linkage_matrix = average(
        similarity_matrix
    )  # Define the linkage_matrix using average linkage on pre-computed distances
    assignments = fcluster(linkage_matrix, 3, depth=5)
    clusters = get_clusters_with_hierarchy(to_tree(linkage_matrix))
    return [assignments, clusters]
def main():
    distance_matrix, labels = data.gen_distance_matrix()
    np.save('../labels', labels)
    linkage_matrix = linkage(distance_matrix, 'ward')
    plt.figure(figsize=(25, 10)).subplots_adjust(bottom=0.25)
    dendrogram(
        linkage_matrix,
        leaf_rotation=90.,
        leaf_font_size=16.,
        labels=labels,
    )
    plt.show()
    root, node_list = to_tree(linkage_matrix, rd=True)
    num_clusters = int(input("number of clusters: "))
    heap = []
    heapq.heappush(heap, (1 / root.dist, root))
    while len(heap) < num_clusters:
        current = heapq.heappop(heap)[1]
        heapq.heappush(heap, (1 / current.left.dist, current.left))
        heapq.heappush(heap, (1 / current.right.dist, current.right))
    num_cats = 0
    for cluster in heap:
        dir_name = "cat" + str(num_cats) + "/"
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        else:
            rmtree(dir_name)
            os.mkdir(dir_name)
        num_cats += 1
        create_category(linkage_matrix,
                        cluster[1],
                        labels,
                        src="generatedpictures",
                        dest=dir_name)
Example #49
 def test_iris_subtree_pre_order(self):
     # Tests that pre_order() works when called on sub-trees.
     X = eo['iris']
     Y = pdist(X)
     Z = linkage(X, 'single')
     node = to_tree(Z)
     assert_equal(node.pre_order(), (node.get_left().pre_order()
                                     + node.get_right().pre_order()))
def plot_leaf_ordering(X, method, metric):
    dists = distance.squareform(distance.pdist(X, metric=metric))
    dists2 = distance.squareform(distance.pdist(X.T, metric=metric))

    Z = hierarchy.linkage(X, method=method, metric=metric)
    Z2 = hierarchy.linkage(X.T, method=method, metric=metric)

    t,rd = hierarchy.to_tree(Z, True)
    t2,rd2 = hierarchy.to_tree(Z2, True)

    M = optimal_scores(Z, rd, dists)
    order_tree(Z, rd, M)
    M2 = optimal_scores(Z2, rd2, dists2)
    order_tree(Z2, rd2, M2)

    rr = t.pre_order()
    rr2 = t2.pre_order()

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec

    fig = plt.figure(figsize=(8,8))
    gs = GridSpec(2, 2, top=0.95, bottom=0.05, left=0.05, right=0.95,
                  hspace=0.01, wspace=0.01,
                  width_ratios=(1,3), height_ratios=(1,3))

    ax01 = fig.add_subplot(gs[0,1])
    ax10 = fig.add_subplot(gs[1,0])
    ax11 = fig.add_subplot(gs[1,1])

    hierarchy.dendrogram(Z2, ax=ax01)
    ax01.set_axis_off()
    hierarchy.dendrogram(Z, orientation='right', ax=ax10)
    ax10.set_axis_off()

    ax11.matshow(X[np.ix_(rr,rr2)], cmap="Blues", aspect="auto")
    ax11.tick_params(**{s:'off' for s in ('top', 'bottom', 'right')})
    ax11.tick_params(labeltop='off', labelleft='off', labelright='on')

    ax11.set_xticks(np.arange(len(rr2)))
    ax11.set_xticklabels(rr2, fontsize=5.0)
    ax11.set_yticks(np.arange(len(rr)))
    ax11.set_yticklabels(rr, fontsize=5.0)

    plt.show()
Example #51
 def __init__(self, clustering):
     self.cluster_object = clustering
     self.datatype = clustering.datatype
     self.axis = clustering.clustering_axis
     self.clustering = clustering.clustering
     self.tree = hcluster.to_tree(self.clustering)
     self.data = clustering.data
     self.data_names = clustering.data_names
     self.header = clustering.header
     self.dendrogram = False
Example #52
def get_clustering_as_tree(vectors, linkage=constants.linkage_method_default, distance=constants.distance_metric_default, progress=progress):
    is_distance_and_linkage_compatible(distance, linkage)

    progress.update('Clustering data with "%s" linkage using "%s" distance' % (linkage, distance))
    linkage = hierarchy.linkage(vectors, metric=distance, method=linkage)

    progress.update('Recovering the tree from the clustering result')
    tree = hierarchy.to_tree(linkage, rd=False)

    return tree
Example #53
def test_node_compare():
    np.random.seed(23)
    nobs = 50
    X = np.random.randn(nobs, 4)
    Z = scipy.cluster.hierarchy.ward(X)
    tree = to_tree(Z)
    assert_(tree > tree.get_left())
    assert_(tree.get_right() > tree.get_left())
    assert_(tree.get_right() == tree.get_right())
    assert_(tree.get_right() != tree.get_left())
Example #54
def mat2tree(mat, nodeNames=None, dosvg=True):
    #http://stackoverflow.com/questions/9364609/converting-ndarray-generated-by-hcluster-into-a-newick-string-for-use-with-ete2

    import scipy.cluster.hierarchy as hier

    hasTree = False
    try:
        from ete2 import Tree
        import StringIO
        hasTree = True
    except:
        pass


    T         = hier.to_tree( mat )
    root      = Tree()
    root.dist = 0
    root.name = 'root'
    item2node = {T: root}
    to_visit  = [T]

    while to_visit:
        node = to_visit.pop()
        cl_dist = node.dist / 2.0

        for ch_node in [node.left, node.right]:

            if ch_node:
                ch                 = Tree()
                ch.dist            = cl_dist
                ch.name            = str(ch_node.id)

                if nodeNames:
                    if ch_node.id < len(nodeNames):
                        ch.name    = nodeNames[ ch_node.id ]

                item2node[ch_node] = ch
                item2node[node   ].add_child(ch)
                to_visit.append(ch_node)

    svg = ""
    if dosvg:
        fnm = tempfile.mkstemp(suffix=".svg", prefix=os.path.basename(sys.argv[0]) + '_tmp_', text=True, dir=TMP_DIR)[1]
        #output = StringIO.StringIO()
        if os.path.exists( fnm ):
            print(fnm)
            root.render(fnm)

            with open(fnm, 'r') as fhd:
                svg = fhd.read()
            os.remove(fnm)
        #print svg

    return (root, svg)
Example #55
    def __init__(self,clust,video,thumbsize=(60,60)):
        """
        :param cluster: cluster return by videoclustering
        :type: numarray
        :param video: video object 
        :type: video
        :param key_frames_id: array of key frame number  
        :type: array
        """
        self.w = 0
        self.key_frames_id = clust.keys
        self.cluster = sch.to_tree(clust.cluster)
        self.video = video
        self.key_frames = []
        self.thumbsize = thumbsize
Example #56
def make_tree_json(row_clusters, df_by_gene):
    T= to_tree(row_clusters)

    # Create dictionary for labeling nodes by their IDs
    labels = list(df_by_gene.index)
    id2name = dict(zip(range(len(labels)), labels))

    # Initialize nested dictionary for d3, then recursively iterate through tree
    d3Dendro = dict(children=[], name="Root1")
    add_node( T, d3Dendro )
    label_tree( d3Dendro["children"][0], id2name )
    # Output to JSON
    json.dump(d3Dendro, open(os.path.join(path_to_file,"d3-dendrogram.json"), "w"), sort_keys=True, indent=4)

    return cc
def HierarchicalClustering(SP):
    print("Start HierarchicalClustering")
    n = len(SP)
    idx = 0
    print("number of Stay points = %d" % n)
    # condensed pairwise distance vector of length n*(n-1)/2
    D = [0 for i in range(n * (n - 1) // 2)]
    for i in range(n):
        for j in range(i + 1, n):
            D[idx] = SP[i].Haversine(SP[j])
            idx += 1

    Z = HC.linkage(D)
    HC.dendrogram(Z, p=100, truncate_mode='level')
    plt.show()
    print("End HierarchicalClustering")
    return HC.to_tree(Z)
Example #58
    def linkage_to_newick(matrix: np.ndarray, labels: List[str]):
        """Convert a linkage matrix to a newick formatted tree.

        :param matrix: The linkage matrix.
        :param labels: Names of the tree node.
        :return: The newick representation of the linkage matrix.
        """
        # Convert the linkage matrix to a ClusterNode object.
        tree = to_tree(matrix, False)

        # Define the helper recursive function to build the newick tree.
        def _build_newick_tree(node: ClusterNode,
                               newick: str,
                               parent_dist: float,
                               leaf_names: List[str]) -> str:
            """Recursively build the newick tree.

            :param node: The tree node currently being converted to.
            :param newick: The current newick representation of the tree.
            :param parent_dist: The distance to parent node.
            :param leaf_names: Names of the tree node.
            :return:
            """
            # If node is leaf, enclose.
            if node.is_leaf():
                return f"{leaf_names[node.id]}" \
                    f":{(parent_dist - node.dist) / 2}{newick}"
            else:
                # Write the distance.
                newick = f"):{(parent_dist - node.dist) / 2}{newick}" \
                    if len(newick) > 0 else ");"
                # Recursive call to expand the tree.
                newick = _build_newick_tree(
                    newick=newick,
                    node=node.get_left(),
                    parent_dist=node.dist,
                    leaf_names=leaf_names)
                newick = _build_newick_tree(
                    newick=f",{newick}",
                    node=node.get_right(),
                    parent_dist=node.dist,
                    leaf_names=leaf_names)
                # Enclose the tree at the beginning.
                return f"({newick}"

        # Trigger the recursive function.
        return _build_newick_tree(
            node=tree, newick="", parent_dist=tree.dist, leaf_names=labels)
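A hypothetical usage of linkage_to_newick above, assuming it is reachable as a plain function or staticmethod and that to_tree and ClusterNode are imported from scipy.cluster.hierarchy at module level.

import numpy as np
from scipy.cluster.hierarchy import linkage

X = np.random.rand(4, 5)
Z = linkage(X, method='average')
newick = linkage_to_newick(Z, labels=['a', 'b', 'c', 'd'])
print(newick)   # a Newick string terminated by ");"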
    def ete_tree(self, labels=None):
        if sys.version_info[0] == 2:
            from ete2 import Tree, NodeStyle, TreeStyle
        elif sys.version_info[0] == 3:
            from ete3 import Tree, NodeStyle, TreeStyle
        else:
            raise ValueError('Your version of Python is not supported.')

        from scipy.cluster.hierarchy import to_tree

        T = to_tree(self.to_linkage_matrix())
        root = Tree()
        root.dist = 0
        root.name = "root"
        item2node = {T: root}
        to_visit = [T]
        while to_visit:
            node = to_visit.pop()
            cl_dist = node.dist / 2.0
            for ch_node in [node.left, node.right]:
                if ch_node:
                    ch = Tree()
                    ch.dist = cl_dist
                    ch.name = str(ch_node.id)
                    item2node[node].add_child(ch)
                    item2node[ch_node] = ch
                    to_visit.append(ch_node)
        if labels != None:
            for leaf in root.get_leaves():
                leaf.name = str(labels[int(leaf.name)])

        ts = TreeStyle()
        ts.show_leaf_name = True

        # Draws nodes as small red spheres of diameter equal to 10 pixels
        nstyle = NodeStyle()
        nstyle["shape"] = None
        nstyle["size"] = 0

        # Gray dashed branch lines
        nstyle["hz_line_type"] = 1
        nstyle["hz_line_color"] = "#cccccc"

        # Applies the same static style to all nodes in the tree. Note that,
        # if "nstyle" is modified, changes will affect to all nodes
        for n in root.traverse():
           n.set_style(nstyle)
        return root
def optimal_ordering(Z, dists):
    # Z - linkage matrix
    # dists - the distance matrix

    # get the tree and a list of handles to its leaves
    tree,rd = hierarchy.to_tree(Z, True)

    # Generate scores
    M = optimal_scores(Z, rd, dists)
    # re-order the tree accordingly
    order_tree(Z, rd, M)

    # new leaf ordering
    row_reorder = tree.pre_order()

    return row_reorder
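For recent SciPy versions, a built-in alternative to the optimal_scores/order_tree pair used above is hierarchy.optimal_leaf_ordering (the exact objective may differ from this implementation); a minimal sketch:

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial import distance

X = np.random.rand(10, 4)
y = distance.pdist(X)
Z = hierarchy.linkage(y, method='average')

Z_ordered = hierarchy.optimal_leaf_ordering(Z, y)   # returns a re-ordered linkage matrix
row_reorder = hierarchy.leaves_list(Z_ordered)      # new leaf ordering

# or ask linkage for it directly:
Z2 = hierarchy.linkage(y, method='average', optimal_ordering=True)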