示例#1
0
def newickToMatrix(newickFile, type):
    """Build a square leaf-to-leaf distance matrix from a newick file.

    The file is read in full, every ';' in it is replaced by a space, and a
    single terminating ';' is appended before the text is parsed with
    ``Tree(..., format=1)``.

    :param newickFile: path of the newick file to read.
    :param type: ``'topology'`` to measure distances with
        ``topology_only=True`` (result cast to ``int``); any other value
        uses the plain ``get_distance`` result.
    :return: list of rows, one per leaf name returned by
        ``getLeafNodeNames``; entries where both names are equal are ``0``.
    """
    with open(newickFile, 'r') as handle:
        raw_newick = handle.read()
    tree = Tree(raw_newick.replace(';', ' ') + ';', format=1)

    names = getLeafNodeNames(tree)
    count_nodes = type == 'topology'

    def separation(first, second):
        # Identical names short-circuit to zero without querying the tree.
        if first == second:
            return 0
        if count_nodes:
            return int(
                tree.get_distance(first, second, topology_only=True))
        return tree.get_distance(first, second)

    return [[separation(row_name, col_name) for col_name in names]
            for row_name in names]
def parent_to_tip_distances(parent: Tree, children: Tree, estimate=False):
    """
    Calculate distances between a reference node (parent) and query nodes (children)
    using the reference node's ``get_distance`` method.

    Each child may be a ``Tree`` instance (its ``name`` is used), a ``str`` name, or an
    ``int`` (converted with ``str``). The `estimate` flag causes the parent's edge
    length (``parent.dist``) to be added to every distance.

    :param parent: A reference node Tree instance
    :param children: An iterable of query nodes (Tree instances, names, or ints)
    :param estimate: Boolean indicating whether these distances are to be used for estimating the edge length ranges
    :return: list() of all branch distances between the parent node and the tips
    :raises AssertionError: if a child is not a Tree, str or int
    """
    branch_distances = list()
    # Calculate distance between parent and all descendants
    for child_node in children:
        if isinstance(child_node, Tree):
            target_name = child_node.name
        elif isinstance(child_node, (str, int)):
            target_name = str(child_node)
        else:
            # Build the message from the type's name: concatenating the type
            # object itself to a str would raise TypeError before the intended
            # AssertionError could be raised.
            message = "Cannot handle type '" + type(child_node).__name__ + \
                "' for child."
            logging.error(message)
            raise AssertionError(message)
        distal_length = parent.get_distance(target_name)
        if estimate:
            distal_length += parent.dist
        branch_distances.append(distal_length)
    return branch_distances
示例#3
0
def calc_distances(treePath):
    '''
    Calculate the distance between all pairs of tips on a given tree.

    input:
        treePath: `str`, file path to tree
    output:
        `pandas dataframe` with one row per unordered pair of leaves and the
        columns `sequence1`, `sequence2` (the two leaf names, sorted),
        `seq_id` (the two names joined as "<sequence1>_<sequence2>") and
        `distance` (the `get_distance` value between the two leaves).
    '''
    with open(treePath) as f:  # workaround for file I/O deprecation in `ete3`
        treeString = f.read()
    t = Tree(treeString)
    leaves = [leaf.name for leaf in t.iter_leaves()]

    records = {"sequence1": [], "sequence2": [], "seq_id": [], "distance": []}
    for pair in itertools.combinations(leaves, 2):
        # Sort so a pair always gets the same identifier regardless of the
        # order in which the leaves were visited.
        first, second = sorted(pair)
        records["sequence1"].append(first)
        records["sequence2"].append(second)
        records["seq_id"].append("{0}_{1}".format(first, second))
        records["distance"].append(t.get_distance(first, second))
    return pd.DataFrame(records)
示例#4
0
def get_anc_order(tree_file, ancestors, tips_to_root=False):
    """
    Sort ancestors by their distance to the root of the species tree and list, for each one,
    the other input ancestors that `is_below` reports for its node.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        ancestors (list of str): list of ancestor names
        tips_to_root (bool): if True, reverse the root-to-tips ordering.

    Returns:
        OrderedDict: ancestor names in the requested order (keys) and list of ancestors in the
        input list that are below it (values).
    """

    tree = Tree(tree_file, format=1)
    tree.prune(list(tree.get_leaves()))

    root_distance = {name: tree.get_distance(name) for name in ancestors}
    ordered = sorted(root_distance, key=root_distance.get)
    if tips_to_root:
        # Reverse in place (rather than sorting descending) so that ties keep
        # the exact mirrored order of the ascending sort.
        ordered.reverse()

    below_by_ancestor = OrderedDict()
    for current in ordered:
        below_by_ancestor[current] = []
        current_node = search_one_node(tree, current)
        for candidate in ancestors:
            if current == candidate:
                continue
            if is_below(current_node, candidate):
                below_by_ancestor[current].append(candidate)

    return below_by_ancestor
示例#5
0
def calc_distance_mat(target):
    """Return a symmetric matrix of tree distances between the strains of `target`.

    The tree is loaded from "<direc>/<target>/cluster.phb" (`direc` is a
    module-level name) and the strains come from
    `get_strain_lst(target, full=True)`. Only off-diagonal cells are filled;
    the diagonal keeps its initial value of -1.
    """
    tree = Tree("{}/{}/cluster.phb".format(direc, target))

    strains = get_strain_lst(target, full=True)
    n_strains = len(strains)
    matrix = -np.ones((n_strains, n_strains))
    for row, row_strain in enumerate(strains):
        for col in range(row + 1, n_strains):
            pair_distance = tree.get_distance(row_strain, strains[col])
            # Mirror the value so the matrix stays symmetric.
            matrix[row, col] = pair_distance
            matrix[col, row] = pair_distance
    return matrix
def draw_ete3_tree(organism, snplist, tree_file_name, config, c):
	'''Draws a phylogenetic tree using ETE3

	Keyword arguments:
	organism -- the organism of which to make a tree
	snplist -- a mapping from node (SNP) name to state; state 0 is drawn
	           grey with dashed-style lines, state 1 is drawn green and larger
	tree_file_name -- path of the rendered output file
	config -- mapping; passed to tree_to_newick, and config["dev"] enables a
	          debug print of the output path
	c -- passed through to tree_to_newick

	'''
	newick = tree_to_newick(organism, config, c)
	tree = Tree(newick, format=1)
	# Distance from the root to its farthest leaf, truncated to an int; used
	# below to scale the width of the rendered image.
	tree_depth = int(tree.get_distance(tree.get_farthest_leaf()[0]))
	for n in tree.traverse():
		# Nodes are set to red colour
		nstyle = NodeStyle()
		nstyle["fgcolor"] = "#BE0508"
		nstyle["size"] = 10
		nstyle["vt_line_color"] = "#000000"
		nstyle["hz_line_color"] = "#000000"
		nstyle["vt_line_type"] = 0
		nstyle["hz_line_type"] = 0
		nstyle["vt_line_width"] = 2
		nstyle["hz_line_width"] = 2
		# Look the node up directly instead of scanning every key of snplist
		# for each node; a missing name yields None and keeps the default style.
		snp_state = snplist.get(n.name)
		if snp_state == 0:
			# If the SNP is missing due to a gap, make it grey
			nstyle["fgcolor"] = "#DDDDDD"
			nstyle["size"] = 10
			nstyle["vt_line_color"] = "#DDDDDD"
			nstyle["hz_line_color"] = "#DDDDDD"
			nstyle["vt_line_type"] = 1
			nstyle["hz_line_type"] = 1
		elif snp_state == 1:
			nstyle["fgcolor"] = "#99FF66"
			nstyle["size"] = 15
			nstyle["vt_line_color"] = "#000000"
			nstyle["hz_line_color"] = "#000000"
			nstyle["vt_line_type"] = 0
			nstyle["hz_line_type"] = 0

		n.set_style(nstyle)
	ts = TreeStyle()
	ts.show_leaf_name = False  # Do not print(leaf names, they are added in layout)
	ts.show_scale = False  # Do not show the scale
	# NOTE(review): `self` is not a parameter of this function; unless a global
	# named `self` exists this line raises NameError. Presumably this was once
	# a method -- confirm where CanSNPer_tree_layout lives.
	ts.layout_fn = self.CanSNPer_tree_layout  # Use the custom layout
	ts.optimal_scale_level = 'full'  # Fully expand the branches of the tree
	if config["dev"]:
		print("#[DEV] Tree file: %s" % tree_file_name)
	tree.render(tree_file_name, tree_style=ts, width=tree_depth * 500)
示例#7
0
def get_closest_leave(leaves_to_index: dict, prepostorder_leaves: list,
                      tree: Tree, leave: str) -> Tuple[str, float, float]:
    """Pick the nearer of the two list neighbours of `leave`.

    `leave` is located in `prepostorder_leaves` through `leaves_to_index`.
    The first and last entries have a single neighbour; otherwise the
    neighbour with the strictly smaller `tree.get_distance` wins and ties
    go to the following entry.

    :return: (neighbour name, distance, topology-only distance); both
        distances are 0 when the neighbour equals `leave`.
    """
    position = leaves_to_index[leave]
    last_position = len(prepostorder_leaves) - 1

    if position == 0:
        neighbour = prepostorder_leaves[position + 1]
    elif position == last_position:
        neighbour = prepostorder_leaves[position - 1]
    else:
        previous_leaf = prepostorder_leaves[position - 1]
        next_leaf = prepostorder_leaves[position + 1]
        to_previous = tree.get_distance(previous_leaf, leave)
        to_next = tree.get_distance(next_leaf, leave)
        neighbour = previous_leaf if to_previous < to_next else next_leaf

    if neighbour == leave:
        logger.warning(f"Nearest neighbor to node {leave} is itself!")
        return neighbour, 0, 0

    branch_distance = tree.get_distance(leave, neighbour)
    node_distance = tree.get_distance(leave, neighbour, True)
    return neighbour, branch_distance, node_distance
示例#8
0
def newick_to_pairwise_nodes(newick_string):
    """Convert a newick string into a JSON node/link description.

    Example input: ((((H,K)D,(F,I)G)B,E)A,((L,(N,Q)O)J,(P,S)M)C);

    Nodes are numbered in preorder. A node with an empty name is renamed
    'i_root'; a node named 'NoName' is renamed 'i_NoName_<id>'. One link is
    emitted per non-root node, from its parent to itself, with the
    parent-to-node distance formatted with the "f" format spec as
    "edgeWidth".

    :param newick_string: newick text parsed with ``Tree(..., format=1)``.
    :return: JSON string of the form
        {"nodes": [{"id", "name"}, ...], "links": [{"source", "target", "edgeWidth"}, ...]}
    """
    import json

    t = Tree(newick_string, format=1)
    nodes = []
    # Ids are keyed by the node object rather than its name: several nodes can
    # share a name (every unnamed node becomes 'i_root'), and a name-keyed map
    # would make their links point at whichever node was visited last.
    node_ids = {}

    for cont, node in enumerate(t.traverse("preorder")):
        if node.name == '':
            node.name = 'i_root'

        if node.name == 'NoName':
            node.name = "i_" + node.name + '_' + str(cont)

        nodes.append({"id": cont, "name": node.name})
        node_ids[node] = cont

    edges = []
    for node in t.traverse("preorder"):
        parent = node.up
        if parent is None:
            # The root has no incoming link.
            continue
        edges.append({
            "source": node_ids[parent],
            "target": node_ids[node],
            "edgeWidth": format(t.get_distance(parent, node), "f")
        })

    # json.dumps escapes quotes and backslashes in names correctly, which a
    # quote-swap on the dict's repr cannot do; ensure_ascii=False keeps
    # non-ASCII names unescaped.
    return json.dumps({"nodes": nodes, "links": edges}, ensure_ascii=False)
示例#9
0
def saturation(fafile, trfile):
    """Return the slope of alignment distance against tree distance.

    Pairwise distances are computed from the FASTA alignment in `fafile`
    with ``DistanceCalculator('blosum62')``; the tree distance for each of
    the same pairs comes from the tree parsed from the first line of
    `trfile`. A degree-1 ``np.polyfit`` is fitted with tree distances as x
    and alignment distances as y.

    :param fafile: path to a FASTA alignment.
    :param trfile: path to a tree file; only its first line is read.
    :return: slope of the fitted line.
    """
    # Pairwise distances from the alignment
    with open(fafile) as aln_handle:
        aln = AlignIO.read(aln_handle, 'fasta')
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(aln)
    pwdists, lfpairs = [], []
    for i, j in combinations(range(len(dm.names)), 2):
        lfpairs.append((dm.names[i], dm.names[j]))
        pwdists.append(dm[i][j])

    # Patristic distances from the tree, for the same pairs in the same order
    with open(trfile) as tree_handle:
        t = Tree(tree_handle.readline())
    padists = [t.get_distance(lf1, lf2) for lf1, lf2 in lfpairs]
    slope, _intercept = np.polyfit(padists, pwdists, 1)

    return slope
示例#10
0
def main(clusterFilepath, strainFilepath, phbFilepath, outFilepath):
    """Fill missing strain entries of each cluster row from the nearest strain.

    For every row of the cluster table and every strain whose cell is null,
    the closest strain (by tree distance) whose cell is not null is located
    and its cell, with any "(<digits>)" suffix dropped from each
    space-separated id, is written for the missing strain. Only the
    "family" value and the filled-in strains are stored per row; the result
    is written to `outFilepath` as CSV with columns "family" + strains.

    :param clusterFilepath: CSV with a "family" column and one column per strain.
    :param strainFilepath: text file with one strain name per line.
    :param phbFilepath: tree file passed to ``Tree``.
    :param outFilepath: destination CSV path.
    """
    cluster_df = pd.read_csv(clusterFilepath, dtype="object")
    with open(strainFilepath, 'r') as strain_file:
        strain_lst = [s.strip() for s in strain_file]
    t = Tree(phbFilepath)

    print("Calc distance matrix")
    # The diagonal stays at -1 so a strain can never be chosen as its own
    # nearest neighbour without tripping the sanity check below.
    distance_mat = -np.ones((len(strain_lst), len(strain_lst)))
    for i, node1 in enumerate(strain_lst):
        for j, node2 in enumerate(strain_lst):
            if i != j:
                distance_mat[i, j] = t.get_distance(node1, node2)

    pattern = r"([^()]+)(\([0-9]+\))?"
    r = re.compile(pattern)

    dct_lst = []
    for row_label, row in cluster_df.iterrows():
        if row_label % 100 == 0:
            print(row_label)

        dct = {}
        dct["family"] = row["family"]

        # Plain boolean array: positional lookups on it do not depend on how
        # pandas treats integer keys for a label-indexed Series.
        missing = row[strain_lst].isnull().to_numpy()
        for sidx, strain in enumerate(strain_lst):
            if missing[sidx]:
                # Mask out every strain that is itself missing, then take the
                # closest remaining one.
                x = np.ma.array(distance_mat[sidx], mask=missing)
                qidx = x.argmin()
                assert distance_mat[sidx, qidx] >= 0, \
                    "no non-missing strain available as donor"

                #drop (num) notation from orfId
                orfId_lst = row[strain_lst[qidx]].split(' ')
                dct[strain] = ' '.join(
                    r.findall(orfId)[0][0] for orfId in orfId_lst)
        dct_lst.append(dct)
    out_df = pd.DataFrame(dct_lst)
    out_df = out_df[["family"] + strain_lst]
    out_df.to_csv(outFilepath, index=False)
    print("OUTPUT to {}".format(outFilepath))
示例#11
0
def get_branch_lens():
    """
    Parse 'renamed.tre' and summarise, per taxon, its largest distances.

    For every name in the module-level `taxa`, the distances to all taxa are
    computed; the mean of the largest int((len(taxa) - 1) / 2) of them is
    stored for that taxon.
    @return: (pandas Series of those means keyed by taxon, int(len(taxa) / 3))
    """
    tree = Tree('renamed.tre')
    # Resolve each taxon to its node once instead of once per matrix cell.
    taxon_nodes = [tree & taxon for taxon in taxa]
    dist_mat = [[tree.get_distance(source, target) for target in taxon_nodes]
                for source in taxon_nodes]

    top_count = int((len(taxa) - 1) / 2)
    taxa_dict = {}
    for taxon, row in zip(taxa, dist_mat):
        largest = sorted(row, reverse=True)[:top_count]
        taxa_dict[taxon] = sum(largest) / top_count

    return pd.Series(taxa_dict), int(len(taxa) / 3)
示例#12
0
import numpy as np
from ete3 import Tree
import ete3

if __name__ == "__main__":
    # pandas is required for the DataFrame/CSV export below; it is imported
    # here because the module-level imports do not provide `pd`.
    import pandas as pd

    # use ete3's get_distance function to compute pairwise additive distances between leaves in tree

    tree = Tree("../../data/tree/tree.nw")
    list_nodes = list(tree.get_leaves())

    print(len(list_nodes))

    list_names = [leaf.name for leaf in list_nodes]

    dmat = np.zeros((len(list_nodes), len(list_nodes)))

    for i, node_i in enumerate(list_nodes):
        # Only the upper triangle (diagonal included) is computed; each value
        # is mirrored into the lower triangle.
        for j in range(i, len(list_nodes)):

            d = tree.get_distance(node_i,
                                  list_nodes[j],
                                  topology_only=False)
            dmat[i, j] = dmat[j, i] = round(d, 5)

    dist_df = pd.DataFrame(data=dmat, index=list_names, columns=list_names)
    dist_df.to_csv("../../data/tree/tree_distancematrix.txt",
                   sep='\t',
                   header=True)
示例#13
0
#         |                             |         |
#         |                    /--------|          \-F
#         |                   |         |
#         |          /--------|          \-G
#         |         |         |
#          \--------|          \-H
#                   |
#                    \-E
#
# Locate some nodes
A = t & "A"
C = t & "C"
# Calculate distance from current node
print "The distance between A and C is", A.get_distance("C")
# Calculate distance between two descendants of current node
print "The distance between A and C is", t.get_distance("A", "C")
# Calculate the topological distance (number of nodes in between)
print "The number of nodes between A and D is ", t.get_distance("A", "D", topology_only=True)
# Calculate the farthest node from E within the whole structure
farthest, dist = (t & "E").get_farthest_node()
print "The farthest node from E is", farthest.name, "with dist=", dist
# Calculate the farthest node from E within the whole structure,
# regarding the number of nodes in between as distance value
# Note that the result is different.
farthest, dist = (t & "E").get_farthest_node(topology_only=True)
print "The farthest (topologically) node from E is", farthest.name, "with", dist, "nodes in between"
# Calculate farthest node from an internal node
farthest, dist = t.get_farthest_node()
print "The farthest node from root is is", farthest.name, "with dist=", dist
#
# The program results in the following information:
示例#14
0
# Build a square table of tree distances between every pair of entries in
# `spe_l`, labelled by a header derived from each entry's name.
# `spe_l`, `header`, `spe_d_l`, `t` and `borna_species` are defined outside
# this fragment.
k = 0  # running counter appended to "Carbovirus" labels
for s1 in spe_l:
    h = s1.name
    if "Cultervirus" in s1.name:
        # keep only the part after the last underscore
        h = h.split("_")[-1]
    if "Carbovirus" in s1.name:
        # same shortening, plus a running number appended to the label
        k += 1
        h = h.split("_")[-1] + str(k)
    header.append(h)
    l_d = []  # distances from s1 to every entry, in spe_l order
    for s2 in spe_l:
        if s1 == s2:
            d = 0
            l_d.append(d)
        else:
            d_d = t.get_distance(s1.name, s2.name)
            l_d.append(d_d)

    # extract minimum distance: the second-smallest value of the row, i.e.
    # the smallest once the 0 self-distance is skipped (assumes the other
    # distances are non-negative -- TODO confirm)
    # NOTE(review): min_d is not used anywhere in this fragment.
    min_d = sorted(l_d)[1]
    spe_d_l.append(l_d)

df_d = pd.DataFrame(spe_d_l,
                    index=[str(i) for i in header],
                    columns=[str(i) for i in header])

## 5. Define genetic distance to separate different viral species
# Tab-separated table with three columns read as node / name / species.
df_species = pd.read_table(borna_species,
                           sep='\t',
                           names=("node", "name", "species"))
# Distinct species labels with surrounding whitespace stripped.
borna_species_l = set([i.strip() for i in df_species.species.to_list()])
示例#15
0
def create_original_tree(meta_information_list, avg_list, root, name, sci,
                         save_labels):
    """Build a tree from lineage lists, colour its nodes by value and render it.

    :param meta_information_list: list of lists of strings; each inner list
        is walked in order and consecutive entries are linked as
        parent -> child. ';' and ':' are stripped from every entry in place.
    :param avg_list: sequence of (node name, value) pairs. When it is empty
        a two-entry placeholder is substituted for the min/max computation.
    :param root: name handed to `newickify` as the root node and used as the
        origin of the per-node distance.
    :param name: prefix for the output file names ("<name>.pdf",
        "<name>_colorbar.pdf").
    :param sci: if truthy, the colour scale uses the natural log of the
        values and labels are formatted in scientific notation.
    :param save_labels: forwarded unchanged to `colorbar`.
    :return: the boolean returned by `newickify`; the tree is only built and
        rendered when it is truthy.
    """

    save_string = name + ".pdf"
    save_string_colorbar = name + "_colorbar.pdf"
    # NOTE(review): final_save is not used anywhere in this function.
    final_save = name + "_final" + ".pdf"
    # Values are taken before the empty-list placeholder below is applied, so
    # for an empty avg_list nc_val stays empty.
    if sci:
        nc_val = [x[1] for x in avg_list]
    else:
        nc_val = [round(x[1], 3) for x in avg_list]
    if not avg_list:
        avg_list = [[root, 1.0], ["V", 0.0]]

    mx = max([x[1] for x in avg_list])
    mn = min([x[1] for x in avg_list])

    # Widen a degenerate range so min and max differ.
    if mn == mx:
        mn = mn - 0.001
        mx = mx + 0.001
    if sci:
        mx = math.log(mx)
        # log(0) is undefined; substitute a small positive value.
        if mn == 0:
            mn = 0.000001
        mn = math.log(mn)

    colorbar(mn, mx, save_labels, save_string_colorbar)
    leaf_val = [x[0] for x in avg_list]
    # Nested dict: parent name -> {child name: 1}.
    tree = dict()
    for virus_info in meta_information_list:
        for index in range(0, len(virus_info)):
            virus_info[index] = virus_info[index].replace(";",
                                                          "").replace(":", "")
            if virus_info[index] not in tree.keys(
            ) and index + 1 < len(virus_info):
                # First time this entry is seen and it has a successor:
                # register it with its next entry as a child.
                tree[virus_info[index]] = {virus_info[index + 1]: 1}
                if index < len(virus_info) and index > 0:
                    d1 = {virus_info[index]: 1}
                    tree[virus_info[index - 1]].update(d1)
            else:
                # Already-known entry, or last entry of the list: only link it
                # under its predecessor.
                if index != 0:
                    d1 = {virus_info[index]: 1}
                    tree[virus_info[index - 1]].update(d1)

    newick_tree, const_bool = newickify(tree, root_node=root)

    if const_bool:
        t = Tree(newick_tree, quoted_node_names=True, format=1)
        ts, t = set_default_TreeStyle(t, False)
        t.set_style(ts)

        count = 0
        # NOTE: the loop variable `name` shadows the `name` parameter from
        # here on.
        for name, value in zip(leaf_val, nc_val):
            matching_nodes = t.search_nodes(name=name)
            if matching_nodes:
                dst = t.get_distance(root, matching_nodes[0])
                if sci:
                    if value == 0:
                        # Same substitute for log(0) as used for the scale minimum.
                        rgb_color = rgb2(mn, mx, math.log(0.000001))
                        rgb_color = ('#%02x%02x%02x' % rgb_color)
                        complexity = TextFace(value,
                                              fgcolor=rgb_color,
                                              fsize=200,
                                              bold=True)
                        change_tree_branch(matching_nodes, rgb_color, dst)
                    else:
                        rgb_color = rgb2(mn, mx, math.log(value))
                        rgb_color = ('#%02x%02x%02x' % rgb_color)
                        complexity = TextFace("{:.2e}".format(value),
                                              fgcolor=rgb_color,
                                              fsize=200,
                                              bold=True)
                        change_tree_branch(matching_nodes, rgb_color, dst)
                else:
                    rgb_color = rgb2(mn, mx, value)
                    rgb_color = ('#%02x%02x%02x' % rgb_color)
                    complexity = TextFace(value,
                                          fgcolor=rgb_color,
                                          fsize=200,
                                          bold=True)
                    change_tree_branch(matching_nodes, rgb_color, dst)

                # Only the name face is attached; the `complexity` face is
                # built above but its add_face call is commented out.
                virus_name = TextFace(matching_nodes[0].name,
                                      fgcolor=rgb_color,
                                      fsize=200,
                                      bold=True)
                # matching_nodes[0].add_face(face=complexity, column=1, position="branch-bottom")
                matching_nodes[0].add_face(face=virus_name,
                                           column=1,
                                           position="branch-top")
            else:
                # No node in the tree carries this name.
                print("ERROR->", avg_list[count])
            count += 1
        t.render(save_string,
                 tree_style=ts,
                 dpi=1000,
                 h=120000,
                 w=120000,
                 units="px")

    return const_bool
示例#16
0
#!/usr/bin/env python3

import sys
import glob
from ete3 import Tree

proteins = ["CARD1", "DSRM1", "DSRM2", "DSRM3", "RD1"]

# For every protein, write one CSV row per tree node (preorder) with the
# node's name, its own branch length, its branch-length distance from the root
# and its topology-only distance from the root.
for protein in proteins:
    fname = "../%s/brlens_and_labels.tre" % protein
    tre = Tree(fname, format=1)

    # The context manager closes the CSV even if a tree query or write fails.
    with open("../%s/dists_from_root.csv" % protein, "w") as outf:
        outf.write("Node,branchLength,DistFromRoot,NodesFromRoot\n")

        for node in tre.traverse("preorder"):
            dist_from_root = tre.get_distance(node)
            nodes_from_root = tre.get_distance(node, topology_only=True)
            outf.write("%s,%f,%f,%d\n" %
                       (node.name, node.dist, dist_from_root,
                        nodes_from_root))
示例#17
0
class TreeDataset(GeneExpressionDataset):
    """Forms a ``GeneExpressionDataset`` with a corresponding Tree structure relating
    every cell.

    This is the dataset class that will be used to interact with the TreeVAE model. It's
    important to observe here that this function does not take in expression data from a CSV
    or sparse matrix, for example, but rather assumes that an scVI GeneExpressionDataset has
    already been created. The resulting API of the dataset remains very similar to that of a
    typical GeneExpressionDataset but with the addition of a tree (of class `ete3.Tree`) that
    will be used as a prior during model fitting.

    :param expr: ``scvi.dataset.GeneExpressionDataset`` instance.
    :param tree: file path to a tree (read with ``Tree(path, 1)``) or an already
        built tree object.
    :param filtering: when true, ``filter_cells_by_count`` is called at the end
        of construction.
    """

    def __init__(
        self, expr: GeneExpressionDataset, tree=None, filtering=True
    ):

        if isinstance(tree, str):
            self.tree = Tree(tree, 1)
            # polytomy is not a problem anymore: message passing deals with general trees
            # self.tree.resolve_polytomy(recursive=True)
        else:
            self.tree = tree

        if self.tree is None:
            logger.error(
                "Must provide a tree file path or a tree if you're using TreeDataset."
            )

        # assert we have barcode labels for cells
        if "barcodes" not in expr.cell_attribute_names:
            logger.error("Must provide cell barcode, or names, as a cell attribute.")

        super().__init__()

        # set some tree attributes
        self.populate_treedataset(expr)

        # keeping the cells in the tree and Gene expression dataset (not needed for simulations)
        # self.filter_cells_by_tree()
        if filtering:
            self.filter_cells_by_count()

    def populate_treedataset(self, expr):
        """
        Populate the TreeDataset with respect to an GeneExpressionDataset that is
        passed in.

        Every node of the tree gets a ``distance`` attribute holding its
        distance from the tree root.

        :param expr: A ``scvi.dataset.GeneExpressionDataset`` instance.
        """

        # set distance
        for n in self.tree.traverse():
            n.distance = self.tree.get_distance(n)

        self.populate_from_datasets([expr])

    def populate(self):
        """
        Re-populate the dataset from itself, then filter cells by tree and by
        count.

        A tree stored as a file path is loaded first, mirroring ``__init__``.
        The error is logged only when no tree is available at all.
        """

        if isinstance(self.tree, str):
            self.tree = Tree(self.tree, 1)

        if self.tree is None:
            logger.error(
                "Must provide a tree file path or a tree if you're using TreeDataset."
            )

        # Sets the per-node distances and populates from this dataset; no
        # separate expression dataset is available inside this method.
        self.populate_treedataset(expr=self)

        self.filter_cells_by_tree()

        self.filter_cells_by_count()

    def filter_cells_by_tree(self):
        """
        Prunes away cells that don't appear consistently between the tree object and the
        RNA expression dataset.

        The tree is pruned to the barcodes present both as tree leaves and in
        ``self.barcodes``; the dataset is then filtered on the same barcodes.
        """
        leaves = self.tree.get_leaf_names()
        keep_barcodes = np.intersect1d(leaves, self.barcodes)
        self.tree.prune(keep_barcodes)

        return self.filter_cells_by_attribute(keep_barcodes, on="barcodes")
示例#18
0
        #print "root phylum not monophyletic!"
        #print t.get_ascii(attributes=["name", "phylum"], show_internal=False)
        for pnd in t.search_nodes(phylum='Porifera'):
            try:
                t.set_outgroup(t & pnd)
            except:
                #print 'trying another root...'
                continue
    #Check phylum mononphyly
    Monophyletic = phylum_mono(t, TaxPhyl)
    mct = '/'.join(
        map(str, [
            Counter(Monophyletic.values())[cat] for cat in [True, False, None]
        ]))
    #Calculate p-dists and reject taxa deviation from median by an order of magnitude
    pdists = dict((leaf.name, round(t.get_distance(leaf), 5)) for leaf in t)
    rejected, kept = reject_outliers(pdists)
    #print len(rejected),rejected,len(pdists)
    for tax in rejected['tax']:
        nbTaxReject[tax] += 1

    #generate new alignments with rejected...
    filtered = []
    fafile = '{0}/{1}.al.hc.tr.fa'.format(path, gid)
    for rec in SeqIO.parse(fafile, 'fasta'):
        if rec.id in set(kept['tax']):
            filtered.append(rec)
    SeqIO.write(filtered, '{0}/{1}.al.hc.tr.ft.fa'.format(path, gid), 'fasta')

    #calculate saturation
    satSlp = saturation(fafile, trfile)
示例#19
0
# Serialise the current tree `t` and save it to `new_tree`
# (`t` and `new_tree` are defined outside this fragment).
text = t.write()
with open(new_tree, 'w') as f1:
    f1.write(text)

import pandas as pd
from tqdm import tqdm
from collections import defaultdict

# Reload a tree from `intree` (defined outside this fragment).
t = Tree(intree)
# all_g = set([convert_genome_ID_rev(_.split('_')[0]) for _ in t.get_leaf_names()])
all_ids = t.get_leaf_names()

# Full pairwise leaf-to-leaf distance table as a dict of dicts keyed by leaf
# name; every ordered pair is computed, including each leaf against itself.
id_dict = defaultdict(dict)
for g1 in tqdm(t.get_leaves()):
    for g2 in t.get_leaves():
        id_dict[g1.name][g2.name] = t.get_distance(g1, g2)
dis = pd.DataFrame.from_dict(id_dict)

from sklearn.cluster import KMeans

# Split the leaves into two clusters, using each row of the distance table as
# that leaf's feature vector.
# NOTE(review): `precompute_distances` is not accepted by recent scikit-learn
# releases -- confirm against the installed version.
kmeans = KMeans(n_clusters=2,
                random_state=0,
                precompute_distances=True,
                tol=1e-10).fit(dis.values)
# Bare expression: has no effect when run as a script.
kmeans.labels_

# Map "<converted genome id>_<leaf id>" to a one-element list holding the
# leaf's cluster label as a string. `convert_genome_ID_rev` is defined outside
# this fragment.
id2info = defaultdict(list)
for idx, id in enumerate(dis.index):
    new_name = convert_genome_ID_rev(id.split('_')[0]) + '_' + id
    id2info[new_name] = [str(kmeans.labels_[idx])]
from api_tools.itol_func import *
示例#20
0
        return 0.0
    try:
        return matrix[(a, b)]
    except KeyError:
        return matrix[(b, a)]


# Fill `matrix` with a distance for every ordered pair of distinct tips: the
# sum of `dist` over the nodes in `lineages[tip_a] ^ lineages[tip_b]`
# (presumably sets of nodes, making `^` the symmetric difference -- TODO
# confirm where `lineages` is built).
for tip_a, tip_b in itertools.permutations(lineages.keys(), 2):
    d = sum([n.dist for n in lineages[tip_a] ^ lineages[tip_b]])
    matrix[(tip_a, tip_b)] = d
    #if len(matrix) % 10000 == 0:
    #    print >>sys.stderr, len(matrix)

# Print the matrix as tab-separated text: a '#names' header row followed by
# one row per leaf.
leaves = t.get_leaf_names()
print '\t'.join(['#names'] + leaves)
for tip_a in leaves:
    row = [tip_a]
    for tip_b in leaves:
        row.append(get_dist(tip_a, tip_b))
    print '\t'.join(map(str, row))

# test

import random

# Spot-check 1000 random stored pairs against the tree's own get_distance and
# report any pair that differs after rounding to 8 decimals on stderr.
s = random.sample(matrix.keys(), 1000)
for a, b in s:
    d0 = get_dist(a, b)
    d1 = t.get_distance(a, b)
    if round(d0, 8) != round(d1, 8):
        print >> sys.stderr, a, b, d0, d1
#!/usr/bin/env python3
from ete3 import Tree
import sys

# Read the newick text given as the first command-line argument, dropping the
# newline at the end of every line so the tree becomes one string.
with open(sys.argv[1], 'r') as treefile:
    nwk_string = ''.join(line.rstrip("\n") for line in treefile)

tree = Tree(nwk_string)

# Re-root on the common ancestor of CELEG and CINOP.
tree.set_outgroup(tree.get_common_ancestor("CELEG", "CINOP"))

# Report the distance from each species to CREMA, one line per species.
for species in ("CBRIG", "CNIGO", "CTROP", "CWALL", "CELEG", "CINOP"):
    print(species, "CREMA", tree.get_distance(species, "CREMA"))
def _clade_names(clade):
    '''Return (leaf names, non-leaf names) of every node in `clade`, in traversal order.'''
    leaf_names = []
    inner_names = []
    for node in clade.traverse():
        if node.is_leaf():
            leaf_names.append(node.name)
        else:
            inner_names.append(node.name)
    return leaf_names, inner_names


def random_tree(trees):
    '''
    Randomly choose a tree and find two nodes for inheritance

    A line is drawn at random from the file `trees` and parsed as a tree;
    its non-root internal nodes are renamed 'n1', 'n2', ... Pairs of those
    nodes are then sampled until one is found where neither node is a
    descendant or a sister of the other and both have enough leaves: at
    least 3 during the first 60 seconds, at least 2 during the following
    30 seconds. After 90 seconds a new tree is drawn and the search starts
    over.

    Returns a list [tree, nodes, tips, r_nodes, r_tips, dist] where `tree`
    is the result of `topology_dist`, `nodes`/`tips` are the internal and
    leaf names of the chosen tree, `r_nodes`/`r_tips` are the non-leaf and
    leaf names inside the two chosen clades (first clade first), and `dist`
    is the topology-only distance between the two clades' common ancestors.
    '''
    while True:
        # Randomly choose a tree; the file is closed as soon as it is read
        with open(trees) as tree_file:
            tree = choice(tree_file.readlines())
        t = Tree(tree, format=1)
        tips = []
        nodes = []
        k = 1
        for node in t.traverse():
            if node.is_leaf():
                tips.append(node.name)
            elif not node.is_root():
                node.add_features(name='n' + str(k))
                nodes.append(node.name)
                k += 1
        nodes = list(filter(None, nodes))

        # Randomly choose two nodes for inheritance
        timeout1 = time.time() + 60
        timeout2 = time.time() + 90
        while True:
            rn = sample(nodes, 2)
            rn1 = t.search_nodes(name=rn[0])[0]
            rn2 = t.search_nodes(name=rn[1])[0]

            # The minimum clade size is relaxed once the first window expires.
            now = time.time()
            if now <= timeout1:
                min_leaves = 3
            elif now <= timeout2:
                min_leaves = 2
            else:
                # Give up on this tree and draw another one.
                break

            if (len(rn1.get_leaves()) < min_leaves
                    or len(rn2.get_leaves()) < min_leaves):
                continue
            # The two clades must not be nested in, or sisters of, each other.
            if (rn2 in rn1.get_descendants() or rn1 in rn2.get_descendants()
                    or rn2 in rn1.get_sisters()):
                continue

            tips1, inner1 = _clade_names(rn1)
            tips2, inner2 = _clade_names(rn2)
            r_tips = tips1 + tips2
            r_nodes = inner1 + inner2
            root1 = t.get_common_ancestor(tips1)
            root2 = t.get_common_ancestor(tips2)
            dist = t.get_distance(root1, root2, topology_only=True)
            tree = topology_dist(t, nodes, r_nodes, r_tips,
                                 branchProbabilityDist)
            return [tree, nodes, tips, r_nodes, r_tips, dist]
示例#23
0
def _branch_style(line_colour, line_width):
    """
    Build a NodeStyle with no node marker and uniformly drawn branch lines.

    :param line_colour: colour used for both the vertical and horizontal line
    :param line_width: width used for both the vertical and horizontal line
    :return: the new NodeStyle instance
    """
    style = NodeStyle()
    style['size'] = 0
    style["vt_line_color"] = line_colour
    style["hz_line_color"] = line_colour
    style["vt_line_width"] = line_width
    style["hz_line_width"] = line_width
    return style


def _paint_descendants(root, style, painted=None, skip_painted=False):
    """
    Apply `style` to every node below `root`; `root` itself is not touched.

    :param root: node whose descendants are styled
    :param style: style object handed to each node's `set_style`
    :param painted: optional list; every node styled here is appended to it
    :param skip_painted: when true, a node already in `painted` is left as it
        is and its subtree is not entered
    """
    pending = list(root.children)
    while pending:
        node = pending.pop()
        if skip_painted and node in painted:
            continue
        if painted is not None:
            painted.append(node)
        node.set_style(style)
        pending.extend(node.children)


def _scale_hue(offset, span, hue_range):
    """
    Map `offset` inside a value range of width `span` onto `0..hue_range`.

    Returns 0 when `span` is 0 (all values identical) instead of dividing by
    zero.
    """
    if span == 0:
        return 0
    return offset / span * hue_range


def draw_tree(the_tree, colour, back_color, label, out_file, the_scale, extend,
              bootstrap, group_file, grid_options, the_table, pres_abs,
              circular):
    """
    Style a tree, optionally annotate its leaves, then show or render it.

    Branch colouring uses the first applicable mode: groups read from
    `group_file`, automatic clades when `colour` is true, otherwise plain
    black lines. A metadata grid (`grid_options`) and a BLAST based gene
    presence/absence grid (`pres_abs`) can be added as aligned columns.

    Besides the rendered image this may write `<out_file>.new_desc` (or
    `viridis.new_desc` when `out_file` is None), a BLAST database called
    `tempdb` and one `<name>.blast` file per leaf in the working directory.

    :param the_tree: passed to `Tree(..., quoted_node_names=True)`
    :param colour: if true (and `group_file` is None) colour clades of
        neighbouring leaves automatically
    :param back_color: with `group_file`, also colour the ancestors joining
        group clades with the average of the two joined colours
    :param label: value assigned to `show_branch_length`
    :param out_file: path handed to `render`; None opens the tree with `show`
    :param the_scale: value assigned to the tree style's `scale`
    :param extend: if true, draw guiding lines
    :param bootstrap: value assigned to `show_branch_support`
    :param group_file: None, or a whitespace separated file whose first field
        is a substring to look for in leaf names and whose second field is
        the group; the group value itself is used as the line colour
    :param grid_options: None, 'auto' (every table column is drawn as a
        'colour' column of width 20), or a tab separated description file
        with 'H' lines (name, type, width) and 'C' lines (value, colour)
    :param the_table: tab separated table with a header row; the first column
        identifies the leaf (its text before the first '.' must occur in the
        leaf name)
    :param pres_abs: None, or a pair (protein FASTA file, folder holding one
        query file per leaf)
    :param circular: anything other than None draws the tree in circular mode
    :return: None
    """
    t = Tree(the_tree, quoted_node_names=True)
    font_size = 8
    # NOTE(review): 'Heveltica' looks like a misspelling of 'Helvetica'; left
    # unchanged because correcting it would change the rendered font.
    font_type = 'Heveltica'
    font_gap = 3
    font_buffer = 10

    def padded_text_face(text):
        # Every text label in the figure shares the same padding and font.
        return TextFace(font_gap * ' ' + text + ' ' * font_buffer,
                        fsize=font_size,
                        ftype=font_type,
                        tight_text=True)

    o = t.get_midpoint_outgroup()
    t.set_outgroup(o)
    the_leaves = list(t.iter_leaves())
    groups = {}
    num = 0
    last_node = None
    # (clade root node, colour) pairs that the merge loops below work on
    ca_list = []
    if group_file is not None:
        # Start from an all-black tree, then overlay the group colours.
        plain_style = _branch_style('#000000', 1)
        for n in t.traverse():
            n.set_style(plain_style)
        group_dict = {}
        with open(group_file) as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    # blank or incomplete line: nothing to map
                    continue
                group_dict[fields[0]] = fields[1]
        for node in the_leaves:
            leaf_name = node.name
            for pattern in group_dict:
                if pattern in leaf_name:
                    groups.setdefault(group_dict[pattern],
                                      []).append(leaf_name)
        coloured_nodes = []
        for the_col in groups:
            members = groups[the_col]
            style = _branch_style(the_col, 2)
            if len(members) == 1:
                ca = t.search_nodes(name=members[0])[0]
                ca.set_style(style)
                coloured_nodes.append(ca)
            else:
                ca = t.get_common_ancestor(members)
                ca.set_style(style)
                coloured_nodes.append(ca)
                _paint_descendants(ca, style, coloured_nodes)
            ca_list.append((ca, the_col))
        if back_color:
            # Repeatedly take the two closest clade roots, colour their common
            # ancestor (and anything below it that is still uncoloured) with
            # the average colour, and let that ancestor replace the pair,
            # until a single entry is left.
            while len(ca_list) > 1:
                distance = float('inf')
                got_one = False
                for i, col1 in ca_list:
                    for j, col2 in ca_list:
                        if i is j:
                            continue
                        the_dist = t.get_distance(i, j)
                        if the_dist <= distance:
                            distance = the_dist
                            the_i, the_i_col = i, col1
                            the_j, the_j_col = j, col2
                            got_one = True
                if not got_one:
                    # every remaining entry is the same node: nothing to merge
                    break
                ca_list.remove((the_i, the_i_col))
                ca_list.remove((the_j, the_j_col))
                rgb1 = strtorgb(the_i_col)
                rgb2 = strtorgb(the_j_col)
                # NOTE(review): '/' yields floats on Python 3 - confirm that
                # colorstr accepts non-integer channel values.
                rgb3 = ((rgb1[0] + rgb2[0]) / 2, (rgb1[1] + rgb2[1]) / 2,
                        (rgb1[2] + rgb2[2]) / 2)
                new_col = colorstr(rgb3)
                new_node = t.get_common_ancestor(the_i, the_j)
                style = _branch_style(new_col, 2)
                new_node.set_style(style)
                coloured_nodes.append(new_node)
                ca_list.append((new_node, new_col))
                _paint_descendants(new_node,
                                   style,
                                   coloured_nodes,
                                   skip_painted=True)
    elif colour:
        distances = []
        for node1 in the_leaves:
            for node2 in the_leaves:
                if node1 != node2:
                    distances.append(t.get_distance(node1, node2))
        distances.sort()
        # The cutoff is the value a quarter of the way through the sorted
        # pairwise leaf distances. '//' keeps the index an int on Python 3.
        clade_cutoff = distances[len(distances) // 4] if distances else 0
        # Walk the leaves in order; a leaf joins the current group when it is
        # within the cutoff of the previous leaf, otherwise it opens a new
        # group. Each group list is [group number, leaf name, leaf name, ...].
        for node in the_leaves:
            leaf_name = node.name
            if (last_node is not None
                    and t.get_distance(node, last_node) <= clade_cutoff):
                groups[group_num].append(leaf_name)
            else:
                groups[num] = [num, leaf_name]
                group_num = num
                num += 1
            last_node = node
        for key in groups:
            h = groups[key][0] * 360 / len(groups)
            style = _branch_style(hsl_to_str(h, 0.5, 0.5), 2)
            if len(groups[key]) == 2:
                # group number plus a single leaf name
                ca = t.search_nodes(name=groups[key][1])[0]
                ca.set_style(style)
            else:
                ca = t.get_common_ancestor(groups[key][1:])
                ca.set_style(style)
                _paint_descendants(ca, style)
            ca_list.append((ca, h))
        # Merge the closest pair of clade roots whose common ancestor has no
        # other child, colour that ancestor with the average hue, and repeat
        # until no such pair remains.
        while len(ca_list) > 1:
            distance = float('inf')
            got_one = False
            for i, col1 in ca_list:
                for j, col2 in ca_list:
                    if i is j:
                        continue
                    parent = t.get_common_ancestor(i, j)
                    getit = True
                    for child in parent.children:
                        if child != i and child != j:
                            getit = False
                            break
                    if getit:
                        the_dist = t.get_distance(i, j)
                        if the_dist <= distance:
                            distance = the_dist
                            the_i, the_i_col = i, col1
                            the_j, the_j_col = j, col2
                            got_one = True
            if not got_one:
                break
            ca_list.remove((the_i, the_i_col))
            ca_list.remove((the_j, the_j_col))
            new_col = (the_i_col + the_j_col) / 2
            new_node = t.get_common_ancestor(the_i, the_j)
            new_node.set_style(
                _branch_style(hsl_to_str(new_col, 0.5, 0.3), 2))
            ca_list.append((new_node, new_col))
    # if you just want a black tree
    else:
        plain_style = _branch_style('#000000', 1)
        for n in t.traverse():
            n.set_style(plain_style)
    # Palette cycled through for 'colour' columns without an explicit colour.
    color_list = [(240, 163, 255), (0, 117, 220), (153, 63, 0), (76, 0, 92),
                  (25, 25, 25), (0, 92, 49), (43, 206, 72), (255, 204, 153),
                  (128, 128, 128), (148, 255, 181), (143, 124, 0),
                  (157, 204, 0), (194, 0, 136), (0, 51, 128), (255, 164, 5),
                  (255, 168, 187), (66, 102, 0), (255, 0, 16), (94, 241, 242),
                  (0, 153, 143), (224, 255, 102), (116, 10, 255), (153, 0, 0),
                  (255, 255, 128), (255, 255, 0), (255, 80, 5), (0, 0, 0),
                  (50, 50, 50)]
    up_to_colour = {}
    ts = TreeStyle()
    column_list = []
    width_dict = {}

    def add_legend_row(text, rect_width, fill, column):
        # One legend entry: the label in column + 1, its swatch in column.
        ts.legend.add_face(padded_text_face(text), column=column + 1)
        ts.legend.add_face(RectFace(rect_width, 20, fill, fill),
                           column=column)

    if grid_options is not None:
        colour_dict = {}
        type_dict = {}
        min_val_dict = {}
        max_val_dict = {}
        leaf_name_dict = {}
        # table header -> internal names of the grid columns drawn from it
        the_columns = {}

        def register_column(header, col_type, width):
            # Internal names are '<header>_<index>' so a header can be drawn
            # more than once; the label is recovered with rsplit('_', 1),
            # which is why every column (also in 'auto' mode) gets a suffix.
            column_name = header + '_' + str(len(column_list))
            the_columns.setdefault(header, []).append(column_name)
            colour_dict[column_name] = {'empty': '#FFFFFF'}
            type_dict[column_name] = col_type
            width_dict[column_name] = width
            column_list.append(column_name)
            up_to_colour[column_name] = 0
            min_val_dict[column_name] = float('inf')
            max_val_dict[column_name] = 0
            return column_name

        if grid_options == 'auto':
            with open(the_table) as f:
                headers = f.readline().rstrip().split('\t')[1:]
            for header in headers:
                register_column(header, 'colour', 20)
        else:
            with open(grid_options) as g:
                for line in g:
                    if line.startswith('H'):
                        header, col_type, width = line.rstrip().split(
                            '\t')[1:]
                        current_column = register_column(
                            header, col_type, int(width))
                    elif line.startswith('C'):
                        # A 'C' line assigns a colour to one value of the most
                        # recently declared column.
                        c_name, c_col = line.rstrip().split('\t')[1:]
                        if not c_col.startswith('#'):
                            # a list (not a lazy map object) on Python 2 and 3
                            c_col = colorstr(
                                [int(part) for part in c_col.split(',')])
                        colour_dict[current_column][c_name] = c_col
        val_dict = {}
        with open(the_table) as f:
            headers = f.readline().rstrip().split('\t')[1:]
            column_no = {}
            for num, header in enumerate(headers):
                if header in the_columns:
                    column_no[num] = header
            for line in f:
                name = line.split('\t')[0]
                prefix = name.split('.')[0]
                leaf_name = None
                # the last matching leaf in traversal order wins
                for n in t.traverse():
                    if n.is_leaf() and prefix in n.name:
                        leaf_name = n.name
                if leaf_name is None:
                    continue
                leaf_name_dict[leaf_name] = name
                vals = line.rstrip().split('\t')[1:]
                if name in val_dict:
                    sys.exit('Duplicate entry found in table.')
                val_dict[name] = {}
                for num, val in enumerate(vals):
                    if num not in column_no or val == '':
                        continue
                    for column_name in the_columns[column_no[num]]:
                        col_type = type_dict[column_name]
                        if col_type == 'colour':
                            val_dict[name][column_name] = val
                            if val not in colour_dict[column_name]:
                                colour_dict[column_name][val] = colorstr(
                                    color_list[up_to_colour[column_name] %
                                               len(color_list)])
                                up_to_colour[column_name] += 1
                        elif col_type == 'text':
                            val_dict[name][column_name] = val
                        elif col_type == 'colour_scale_date':
                            # YYYY-MM-DD stored as seconds since 1970-01-01
                            year, month, day = val.split('-')
                            elapsed = datetime.datetime(
                                int(year), int(month), int(day), 0, 0,
                                0) - datetime.datetime(1970, 1, 1, 0, 0, 0)
                            seconds = elapsed.total_seconds()
                            val_dict[name][column_name] = seconds
                            if seconds < min_val_dict[column_name]:
                                min_val_dict[column_name] = seconds
                            if seconds > max_val_dict[column_name]:
                                max_val_dict[column_name] = seconds
                        elif col_type == 'colour_scale':
                            the_val = float(val)
                            val_dict[name][column_name] = the_val
                            if the_val < min_val_dict[column_name]:
                                min_val_dict[column_name] = the_val
                            if the_val > max_val_dict[column_name]:
                                max_val_dict[column_name] = the_val
                        else:
                            sys.exit('Unknown column type')
        if out_file is not None:
            desc_path = out_file + '.new_desc'
        else:
            desc_path = 'viridis.new_desc'
        ts.legend_position = 3
        leg_column = 0
        # The description actually used (including auto-assigned colours) is
        # written out in the same H/C line format read above; the `with`
        # block guarantees the file is closed.
        with open(desc_path, 'w') as new_desc:
            for num, i in enumerate(column_list):
                title = i.rsplit('_', 1)[0]
                col_type = type_dict[i]
                nameF = padded_text_face(title)
                nameF.rotation = -90
                ts.aligned_header.add_face(nameF, column=num + 1)
                new_desc.write('H\t' + title + '\t' + col_type + '\t' +
                               str(width_dict[i]) + '\n')
                if col_type == 'colour':
                    add_legend_row(title, width_dict[i], '#FFFFFF',
                                   leg_column)
                    for j in colour_dict[i]:
                        new_desc.write('C\t' + j + '\t' + colour_dict[i][j] +
                                       '\n')
                        add_legend_row(j, width_dict[i], colour_dict[i][j],
                                       leg_column)
                    leg_column += 2
                elif col_type in ('colour_scale', 'colour_scale_date'):
                    add_legend_row(title, width_dict[i], '#FFFFFF',
                                   leg_column)
                    span = max_val_dict[i] - min_val_dict[i]
                    # NOTE(review): the 'colour_scale' legend spans 270
                    # degrees of hue while the leaf cells below always use
                    # 360, and the legend labels are offsets from the column
                    # minimum rather than absolute values - confirm both are
                    # intended.
                    hue_range = 270 if col_type == 'colour_scale' else 360
                    for step in range(11):
                        val = span * step / 10.0
                        fill = hsl_to_str(_scale_hue(val, span, hue_range),
                                          0.5, 0.5)
                        if col_type == 'colour_scale':
                            legend_text = str(val)
                        else:
                            legend_text = str(int(val / 60 / 60 /
                                                  24)) + ' days'
                        add_legend_row(legend_text, width_dict[i], fill,
                                       leg_column)
                    leg_column += 2
                for n in t.traverse():
                    if not n.is_leaf():
                        continue
                    # A leaf with no table row is drawn like a missing value.
                    row = val_dict.get(leaf_name_dict.get(n.name), {})
                    val = row.get(i, 'empty')
                    if col_type == 'colour':
                        fill = colour_dict[i][val]
                        n.add_face(RectFace(width_dict[i], 20, fill, fill),
                                   column=num + 1,
                                   position="aligned")
                    elif col_type in ('colour_scale', 'colour_scale_date'):
                        if val == 'empty':
                            fill = '#FFFFFF'
                        else:
                            h = _scale_hue(
                                val - min_val_dict[i],
                                max_val_dict[i] - min_val_dict[i], 360)
                            fill = hsl_to_str(h, 0.5, 0.5)
                        n.add_face(RectFace(width_dict[i], 20, fill, fill),
                                   column=num + 1,
                                   position="aligned")
                    elif col_type == 'text':
                        n.add_face(padded_text_face(val),
                                   column=num + 1,
                                   position="aligned")
    if pres_abs is not None:
        starting_col = len(column_list) + 1
        # Argument lists (no shell) keep paths containing spaces or shell
        # metacharacters intact.
        subprocess.Popen([
            'makeblastdb', '-out', 'tempdb', '-dbtype', 'prot', '-in',
            pres_abs[0]
        ]).wait()
        folder = pres_abs[1]
        len_dict = {}
        gene_list = []
        # Legend: a heading followed by the two cell colours used below.
        add_legend_row('Gene present/absent', 20, '#FFFFFF', starting_col)
        add_legend_row('present', 20, "#5ba965", starting_col)
        add_legend_row('absent', 20, "#cb5b4c", starting_col)
        # One grid column per FASTA record; len_dict accumulates the length
        # of each record's sequence lines.
        with open(pres_abs[0]) as f:
            for line in f:
                if line.startswith('>'):
                    name = line.split()[0][1:]
                    gene_list.append(name)
                    len_dict[name] = 0
                    nameF = padded_text_face(name)
                    nameF.rotation = -90
                    ts.aligned_header.add_face(nameF,
                                               column=starting_col +
                                               len(gene_list) - 1)
                else:
                    len_dict[name] += len(line.rstrip())
        # A gene counts as present when a hit has at least min_ident percent
        # identity over at least min_length of the gene's length.
        min_length = 0.9
        min_ident = 90
        for n in t.iter_leaves():
            the_name = n.name
            if the_name[0] == '"' and the_name[-1] == '"':
                the_name = the_name[1:-1]
            if the_name.endswith('.ref'):
                the_name = the_name[:-4]
            if not os.path.exists(folder + '/' + the_name):
                # fall back to a file in the folder that starts with the name
                for q in os.listdir(folder):
                    if q.startswith(the_name):
                        the_name = q
            # BLAST results are cached in the working directory and reused.
            if not os.path.exists(the_name + '.blast'):
                subprocess.Popen([
                    'blastx', '-query', folder + '/' + the_name, '-db',
                    'tempdb', '-outfmt', '6', '-num_threads', '24', '-out',
                    the_name + '.blast'
                ]).wait()
            gotit = set()
            with open(the_name + '.blast') as b:
                for line in b:
                    query, subject, ident, length = line.split()[:4]
                    ident = float(ident)
                    length = int(length)
                    if ident >= min_ident and length >= min_length * len_dict[
                            subject]:
                        gotit.add(subject)
            for num, gene in enumerate(gene_list):
                fill = "#5ba965" if gene in gotit else "#cb5b4c"
                n.add_face(RectFace(20, 20, fill, fill),
                           column=num + starting_col,
                           position="aligned")

    # Set these to False if you don't want bootstrap/distance values
    ts.show_branch_length = label
    ts.show_branch_support = bootstrap
    # Leaf names are drawn as aligned faces in column 0 instead.
    ts.show_leaf_name = False
    for node in t.traverse():
        if node.is_leaf():
            node.add_face(AttrFace("name",
                                   fsize=font_size,
                                   ftype=font_type,
                                   tight_text=True,
                                   fgcolor='black'),
                          column=0,
                          position="aligned")

    ts.margin_left = 20
    ts.margin_right = 100
    ts.margin_top = 20
    ts.margin_bottom = 20
    if extend:
        ts.draw_guiding_lines = True
    ts.scale = the_scale
    if circular is not None:
        ts.mode = "c"
        ts.arc_start = 0
        ts.arc_span = 360
    if out_file is None:
        t.show(tree_style=ts)
    else:
        t.render(out_file, w=210, units='mm', tree_style=ts)
示例#24
0
	no_n=re.search(r'_N\d+', line)
	if no_n:
		no_n_str= no_n.group()	
		no_n_str=re.sub('_N','',no_n_str)
		no_N_dict[acc_str]=no_n_str
	
log_file.write("The number of clusters are:" + str(cluster_cnt))
log_file.close()
cdhit_file.close()

# Load the tree generated by FastTree and record, for every leaf, its
# distance from the root of the tree.
FastTree=Tree(args.input2)
root=FastTree.get_tree_root()
# Pattern locating the accession (followed by '_' and four digits) in a leaf's
# string form; compiled once instead of on every iteration.
acc_pattern=re.compile(r'\w{2}\d+.\d{1}_\d{4}|\w{2}_\d+.\d{1}_\d{4}')
for leaf in FastTree:
	# Search the leaf's string representation for the accession
	leaf_str=str(leaf)
	acc_nu=str(acc_pattern.search(leaf_str).group())
	# Drop the trailing '_' + four digits to obtain the dictionary key
	acc_nu=re.sub(r'_\d{4}$','',acc_nu)
	rt_lf=FastTree.get_distance(root,leaf)
	# Make a dictionary using acc_nu as key
	branlength_dict[acc_nu]=rt_lf
	# Use the dictionaries built earlier to write one tab delimited row per leaf
	tsv_file.write(acc_nu + "\t" + year_dict[acc_nu] + "\t" + str(rt_lf) + "\t" + clust_dict[acc_nu] + "\t" + no_N_dict[acc_nu] + "\n")
# The original referenced `tsv_file.close` without calling it, so the file was
# never explicitly closed.
tsv_file.close()


#         |                             |         |
#         |                    /--------|          \-F
#         |                   |         |
#         |          /--------|          \-G
#         |         |         |
#          \--------|          \-H
#                   |
#                    \-E
#
# Locate some nodes
A = t&"A"
C = t&"C"
# NOTE: every print below takes a single pre-formatted string so that the
# output is identical on Python 2 and Python 3.
# Calculate distance from current node
print("The distance between A and C is %s" % A.get_distance("C"))
# Calculate distance between two descendants of current node
print("The distance between A and C is %s" % t.get_distance("A","C"))
# Calculate the topological distance (number of nodes in between)
print("The number of nodes between A and D is  %s" %
      t.get_distance("A","D", topology_only=True))
# Calculate the farthest node from E within the whole structure
farthest, dist = (t&"E").get_farthest_node()
print("The farthest node from E is %s with dist= %s" % (farthest.name, dist))
# Calculate the farthest node from E within the whole structure,
# regarding the number of nodes in between as distance value
# Note that the result is different.
farthest, dist = (t&"E").get_farthest_node(topology_only=True)
print("The farthest (topologically) node from E is %s with %s nodes in between"
      % (farthest.name, dist))
# Calculate farthest node from an internal node
farthest, dist = t.get_farthest_node()
print("The farthest node from root is %s with dist= %s" % (farthest.name, dist))
示例#26
0
for line in tt:
    target_taxa.append(line.rstrip())
tt.close()

# Now read in a collection of trees (one Newick string per line), measure the
# branch lengths of the target taxa in each tree, then summarise and print.
branch_lengths = defaultdict(list)  # key = taxon, value = list of brlens
# The `with` block closes the tree file, which the original left open.
with open(sys.argv[3]) as treefile:
    for line in treefile:
        newick = line.rstrip()
        if not newick:
            # skip blank lines (e.g. a trailing newline) instead of handing
            # an empty string to Tree
            continue
        curr_tree = Tree(newick)
        # Re-root on the common ancestor of the outgroups unless that
        # ancestor already is the root.
        root_node = curr_tree.get_common_ancestor(outgroups)
        if curr_tree != root_node:
            curr_tree.set_outgroup(root_node)
        print(curr_tree)
        #bundle = curr_tree.check_monophyly(values=outgroups,target_attr='name')
        #print bundle
        #if bundle[0] == False:
        #    continue
        # Find the common ancestor of the target taxa and use it as the
        # reference node for calculating branch lengths. This might not
        # always be the measure you want!
        reference_node = curr_tree.get_common_ancestor(target_taxa)
        #if reference_node != curr_tree:
        #    curr_tree.set_outgroup(reference_node)
        # Distance from the reference node to each taxon of interest
        for taxon in target_taxa:
            dist = curr_tree.get_distance(taxon, reference_node)
            branch_lengths[taxon].append(dist)

# Now compute the credible intervals of the branch length for each of the
# target taxa; each row is: taxon, mean estimate, lower bound, upper bound.
for taxon in branch_lengths:
    mean, var, std = stats.bayes_mvs(branch_lengths[taxon], alpha=0.95)
    print(taxon + "\t" + str(mean[0]) + "\t" + str(mean[1][0]) + "\t" + str(mean[1][1]))
示例#27
0
    if a == b:
        return 0.0
    try:
        return matrix[(a, b)]
    except KeyError:
        return matrix[(b, a)]

# Pairwise distance between two tips: the summed branch length of the nodes
# found in exactly one of the two lineage sets (symmetric difference).
for tip_a, tip_b in itertools.permutations(lineages.keys(), 2):
    d = sum(n.dist for n in lineages[tip_a] ^ lineages[tip_b])
    matrix[(tip_a, tip_b)] = d

# Print the full matrix as tab separated text with a '#names' header row.
leaves = t.get_leaf_names()
print('\t'.join(['#names'] + leaves))
for tip_a in leaves:
    row = [tip_a]
    for tip_b in leaves:
        row.append(get_dist(tip_a, tip_b))
    print('\t'.join(map(str, row)))


# test: compare a random sample of matrix entries against the distances
# reported by the tree itself and report any mismatch on stderr.

import random
# random.sample needs a sequence (a dict view is rejected on current
# Python 3) and cannot draw more items than exist, hence the list and min().
pairs = list(matrix)
s = random.sample(pairs, min(1000, len(pairs)))
for a, b in s:
    d0 = get_dist(a, b)
    d1 = t.get_distance(a, b)
    if round(d0, 8) != round(d1, 8):
        sys.stderr.write('%s %s %s %s\n' % (a, b, d0, d1))
示例#28
0
# Input tree and output template (%s is filled in later, outside this excerpt).
tree_file = 'real_data/Yersinia_pestis/tree.nwk'
tree_file_out = 'real_data/Yersinia_pestis/tree_%s_left.nwk'

# Block resolution; selects the sub-directory holding the infercars files.
res = 1000
infercars_file = f'real_data/Yersinia_pestis/{res}/blocks_unique_coords.infercars'
infercars_file_out = f'real_data/Yersinia_pestis/{res}/blocks_unique_coords_%s_left.infercars'

t = Tree(tree_file)

# Collect the leaves once: get_leaves() walks the tree on every call, so
# calling it inside the nested loop repeats the same traversal n times.
leaves = t.get_leaves()
n = len(leaves)

# Full pairwise leaf-to-leaf distance matrix, in get_leaves() order.
m = np.zeros((n, n))
for i, leaf1 in enumerate(leaves):
    for j, leaf2 in enumerate(leaves):
        m[i, j] = t.get_distance(leaf1, leaf2)

# Average-linkage clustering on the precomputed distances; the number of
# clusters is determined by the distance threshold.
# NOTE(review): newer scikit-learn releases renamed `affinity` to `metric`
# -- confirm against the installed version before upgrading.
cls = AgglomerativeClustering(n_clusters=None,
                              affinity='precomputed',
                              linkage='average',
                              distance_threshold=0.0004).fit_predict(m)

print(cls)
print(np.unique(cls))

# used[cluster] flips to True once a cluster has been seen.
used = defaultdict(bool)
survivors = []

for cl, leaf in zip(cls, t.get_leaves()):
    if not used[cl]:
        used[cl] = True
示例#29
0
#         |                             |         |
#         |                    /--------|          \-F
#         |                   |         |
#         |          /--------|          \-G
#         |         |         |
#          \--------|          \-H
#                   |
#                    \-E
#
# Locate some nodes
A = t&"A"
C = t&"C"
# Calculate distance from current node
print "The distance between A and C is",  A.get_distance("C")
# Calculate distance between two descendants of current node
print "The distance between A and C is",  t.get_distance("A","C")
# Calculate the toplogical distance (number of nodes in between)
print "The number of nodes between A and D is ",  \
    t.get_distance("A","D", topology_only=True)
# Calculate the farthest node from E within the whole structure
farthest, dist = (t&"E").get_farthest_node()
print "The farthest node from E is", farthest.name, "with dist=", dist
# Calculate the farthest node from E within the whole structure,
# regarding the number of nodes in between as distance value
# Note that the result is differnt.
farthest, dist = (t&"E").get_farthest_node(topology_only=True)
print "The farthest (topologically) node from E is", \
    farthest.name, "with", dist, "nodes in between"
# Calculate farthest node from an internal node
farthest, dist = t.get_farthest_node()
print "The farthest node from root is is", farthest.name, "with dist=", dist
示例#30
0
		sfs[pop-1] += 1
		nIndsSFS += 1
	fn.write(leaf.name+"\t"+"\t".join(str(x) for x in sfs)+"\n")

f.close()
fn.close()
sys.stdout.write('S')

#======================================================#
# FORCE ULTRAMETRIC in the output tree, defined in reference to the furthest leaf
# A limit of -1 means "no limit": set it just above the actual species count
# so the check below always passes.
if maxNumberOfSpecies == -1:
	maxNumberOfSpecies = nTrueSpecies + 1
if force_ultrametric:
	if nTrueSpecies <= maxNumberOfSpecies:
		# Root-to-leaf distance of the farthest leaf is the target depth.
		tree_dist = t.get_farthest_leaf()[1]
		for l in t:
			# Stretch the terminal branch of every shallower leaf so that
			# its root-to-leaf distance matches the target depth.
			dst = t.get_distance(l)
			if dst != tree_dist:
				l.dist += tree_dist - dst
		sys.stdout.write('u')
	else:
		sys.stdout.write('\nERROR. Too many final species: will not force ultrametricity.\n')

#======================================================#
# EXPORT phylo
t.write(format=5, outfile=ophylo, dist_formatter='%0.20f')
## write the count of resultant species
# Append with a context manager so the handle is closed even if the write fails.
with open(ophylo, 'a') as f:
	f.write("\n"+str(nTrueSpecies)+"\n")

if plot_trees:
示例#31
0
            ca = tree.get_common_ancestor(eukaryote_seqs)
            print sys.argv[1] + "\tEuks monophyletic\t" + str(len(eukaryote_seqs)) + "\t" + str(ca.support) 
        elif answer[0] == False:
            mono_groups = []
            target_group = ''
            for node in tree.get_monophyletic(values=['Eukaryote'], target_attr="domain"):
                if target_leaf in node:
                    target_group = node
                else:
                    mono_groups.append(node)
            size_target_group = len(target_group)
            #get distance
            shortest_distance = 999999999999999.0
            closest_other_group = ''
            for subtree in mono_groups:
                curr_distance = tree.get_distance(target_group, subtree, topology_only=True)
                if curr_distance < shortest_distance:
                    shortest_distance = curr_distance
                    closest_other_group = subtree
            #attempt to calculate distance on a version of the tree in which branches below some support threshold have been deleted
#            closest_leaves = []
 #           for leaf in closest_other_group:
  #              closest_leaves.append(leaf.name)
   #         target_leaves = []
    #        for leaf in target_group:
     #           target_leaves.append(leaf.name)
      #      collapsed_tree = tree
       #     for node in collapsed_tree:
        #        if node.support < 0.5:
         #           node.delete()
          #  target_ca = collapsed_tree.get_common_ancestor(target_leaves)
示例#32
0
def concept_similarity_measure_ex1(C1, C2):
    """Return the similarity of two concepts plus their level scores.

    The taxonomy is loaded from ``skills_taxonomy_tree_level_score.nw`` on
    every call. When both concepts are found in it, the similarity is

        2 * N / (N1 + N2 + 2 * N)

    where ``N`` is the branch-length distance from the least common subsumer
    (common ancestor) to the root and ``N1`` / ``N2`` are the distances from
    each concept to that common ancestor (a Wu-Palmer style measure).

    When either concept is missing from the taxonomy, the function falls
    back to the cosine similarity of the TF-IDF vectors of the two names and
    reports a level score of 1.0 for both.

    :param C1: name of the first concept
    :param C2: name of the second concept
    :return: tuple ``(similarity, l1, l2)`` where ``l1`` / ``l2`` are the
        ``level_score`` features of the matched nodes (1.0 in the fallback)
    """
    taxonomy = Tree("skills_taxonomy_tree_level_score.nw")

    matches1 = taxonomy.search_nodes(name=C1)
    matches2 = taxonomy.search_nodes(name=C2)

    if not matches1 or not matches2:
        # At least one skill is not in the taxonomy: compare the raw names.
        # Row i of the TF-IDF matrix corresponds to the i-th input string.
        tfidf = TfidfVectorizer().fit_transform([C1, C2])
        similarity = cosine_similarity(tfidf)[0, 1]
        # Level scores are unknown here; 1.0 is the placeholder the original
        # code used ("how much should it be" was left open by the author).
        return similarity, 1.0, 1.0

    # When a name matches several nodes, the first match is used.
    node1 = matches1[0]
    node2 = matches2[0]
    common = node1.get_common_ancestor(node2)

    # N: distance from the least common subsumer to the root.
    root = taxonomy.get_tree_root()
    N = taxonomy.get_distance(common, root, topology_only=False)
    # N1 / N2: distance from each concept to the least common subsumer.
    # The resolved nodes are passed (not the names) so the lookup agrees
    # with the first-match choice above even if a name is not unique.
    N1 = taxonomy.get_distance(node1, common, topology_only=False)
    N2 = taxonomy.get_distance(node2, common, topology_only=False)

    denominator = N1 + N2 + (2 * N)
    if denominator == 0:
        # Both concepts sit at zero distance from the root, so the formula
        # is 0/0; they are indistinguishable by distance, hence 1.0.
        similarity = 1.0
    else:
        similarity = (2 * N) / denominator

    return similarity, node1.level_score, node2.level_score
示例#33
0
    nwk_string = ''
    for line in nwk:
        nwk_string += line.rstrip("\n")

tree = Tree(nwk_string)

# Label every unnamed node with a running number (postorder) so that it can
# be addressed by name on the command line.
node_num = 0
for node in tree.traverse("postorder"):
    if not node.name:
        node.add_features(name=str(node_num))
        node_num += 1

#print tree.get_ascii(attributes=["name"], show_internal=True)

# Parse the list of leaves of interest, one name per line. The handle is not
# called `list` so the builtin stays usable; blank lines are ignored.
with open(sys.argv[2], 'r') as leaf_file:
    leaf_list = [line.rstrip("\n") for line in leaf_file if line.strip()]

# Node of interest: the last node in postorder whose name matches.
interest_node_name = sys.argv[3]
interest_node = None
for node in tree.traverse("postorder"):
    if node.name == interest_node_name:
        interest_node = node
if interest_node is None:
    # Fail with a clear message instead of a NameError further down.
    sys.exit("Node '%s' not found in tree" % interest_node_name)

# Print the branch length between each listed leaf and the node of interest.
for leaf in leaf_list:
    print(leaf + "\t" + str(tree.get_distance(leaf, interest_node)))
示例#34
0
class PhyloTreeDistanceMatrix(object):
    """Leaf-to-leaf distance matrix of a phylogenetic tree, with pruning helpers.

    The tree is loaded from a newick file. The distance matrix can be written
    to disk (as text and as a pickle), loaded back, queried, and used to
    prune the leaves that are far away from a reference sequence.
    """

    # Default output file names used by create_distance_matrix_file().
    _matrix_object_filename = 'phylo_matrix.txt'
    _tree_matrix_filename = 'tree_dist_matrix.txt'
    # Class-level fallbacks; every instance gets its own values in __init__.
    _matrix = {}
    _all_leaves = []

    def __init__(self, newick_file):
        """Load the tree from `newick_file`."""
        self._t = Tree(newick_file)
        self._matrix = {}
        self._all_leaves = []

    def create_distance_matrix_file(self, tree_matrix=_tree_matrix_filename, matrix_object=_matrix_object_filename):
        """Root the tree at its midpoint and write the leaf distance matrix.

        Two files are written: `tree_matrix` is a text file with one line per
        leaf ("name: d0 d1 ... "), and `matrix_object` is a pickle of the
        nested OrderedDict ``matrix[row_name][col_name] -> distance``.

        :param tree_matrix: path of the text output file
        :param matrix_object: path of the pickle output file
        """
        midpoint = self._t.get_midpoint_outgroup()
        self._t.set_outgroup(midpoint)

        # OrderedDict keeps rows and columns in leaf order on every Python
        # version (plain dicts only guarantee this from 3.7 on).
        dist_matrix = OrderedDict()
        leaves = self._t.get_leaves()

        for leaf0 in leaves:
            row = OrderedDict()
            for leaf1 in leaves:
                row[leaf1.name] = self._t.get_distance(leaf0, leaf1)
            dist_matrix[leaf0.name] = row

        # Text form: every distance is followed by a single space.
        with open(tree_matrix, 'w') as f:
            for name, row in dist_matrix.items():
                values = "".join(str(d) + ' ' for d in row.values())
                f.write(str(name) + ": " + values + '\n')

        # Pickle writes bytes, so the file must be opened in binary mode.
        with open(matrix_object, 'wb') as phylo:
            pickle.dump(dist_matrix, phylo)

    def load_distance_matrix(self, matrix_file):
        """Load a pickled matrix object.

        :param matrix_file: an open binary file object, or a path to the
            pickle file written by create_distance_matrix_file()
        """
        # SECURITY: unpickling executes arbitrary code; only load files that
        # this program (or another trusted source) produced.
        if hasattr(matrix_file, 'read'):
            matrix = pickle.load(matrix_file)
        else:
            with open(matrix_file, 'rb') as handle:
                matrix = pickle.load(handle)
        self._matrix = matrix

    def get_matrix_item(self, rowname, colname):
        """Return the distance stored for row `rowname` and column `colname`.

        :raises KeyError: if either name is missing from the loaded matrix
        """
        return self._matrix[rowname][colname]

    def delete_nodes(self, default_seq_name="1A2P_defal", num_of_leaves=100, tree_file="new_tree.newick"):
        """Delete the leaves farthest from `default_seq_name`.

        Leaves are removed in order of decreasing distance (taken from the
        loaded matrix) until at most `num_of_leaves` leaves remain. The
        pruned tree is written to `tree_file` and the distance matrix files
        are regenerated with their default names.

        :param default_seq_name: name of the reference leaf
        :param num_of_leaves: number of leaves to keep
        :param tree_file: output path for the pruned tree
        """
        self._all_leaves = self._t.get_leaves()
        num_leaves = len(self._all_leaves)

        leaves_dict = {}
        for item in self._all_leaves:
            leaves_dict[item.name] = self.get_matrix_item(default_seq_name, item.name)

        sorted_leaves = sorted(leaves_dict.items(), key=lambda x: x[1], reverse=True)

        # Only the farthest `excess` leaves are looked up and removed.
        excess = max(num_leaves - num_of_leaves, 0)
        for name, _ in sorted_leaves[:excess]:
            self._t.search_nodes(name=name)[0].delete()

        self._t.write(format=1, outfile=tree_file)
        self.create_distance_matrix_file()

    def delete_clusters(self, strategy='mean', tree_file="new_tree.newick", default_seq_name="1A2P_defal"):
        """Delete every leaf that is farther than average from the reference.

        With ``strategy='mean'`` the mean distance between `default_seq_name`
        and all leaves is printed, every leaf whose distance exceeds it is
        deleted, and the tree is written to `tree_file`. Any other strategy
        is a no-op.

        :param strategy: pruning strategy; only 'mean' is implemented
        :param tree_file: output path for the pruned tree
        :param default_seq_name: name of the reference leaf
        """
        if strategy != 'mean':
            return

        nodes = self._t.get_leaves()
        distances = {}
        total = 0.0
        for node in nodes:
            dist = self.get_matrix_item(default_seq_name, node.name)
            distances[node.name] = dist
            total += dist
        mean = total / len(nodes)
        print(mean)

        ranked = sorted(distances.items(), key=lambda x: x[1], reverse=True)
        for name, dist in ranked:
            if dist > mean:
                self._t.search_nodes(name=name)[0].delete()

        self._t.write(format=1, outfile=tree_file)
示例#35
0
    target_taxa.append(line.rstrip())
tt.close()

# Read a collection of trees (one newick string per line), collect the branch
# length of every target taxon in every tree, then summarise and print.
branch_lengths = defaultdict(list)  # key = taxon name, value = list of branch lengths
# The context manager guarantees the tree file is closed even if parsing fails.
with open(sys.argv[3]) as treefile:
    for line in treefile:
        newick = line.rstrip()
        if not newick:
            # Skip blank lines (e.g. a trailing newline) instead of handing
            # an empty string to the tree parser.
            continue
        curr_tree = Tree(newick)
        # Re-root on the common ancestor of the outgroups, unless that
        # ancestor already is the root of the tree.
        root_node = curr_tree.get_common_ancestor(outgroups)
        if curr_tree != root_node:
            curr_tree.set_outgroup(root_node)
        print(curr_tree)
        # Find the common ancestor of the target taxa and use it as the
        # reference node for calculating branch lengths.
        # This might not always be the measure you want!
        reference_node = curr_tree.get_common_ancestor(target_taxa)
        # Distance from the reference node to each taxon of interest.
        for taxon in target_taxa:
            dist = curr_tree.get_distance(taxon, reference_node)
            branch_lengths[taxon].append(dist)

# Compute the credible interval of the branch length for each target taxon.
# Output columns: taxon, mean estimate, lower bound, upper bound.
for taxon in branch_lengths:
    mean, var, std = stats.bayes_mvs(branch_lengths[taxon], alpha=0.95)
    print("\t".join([taxon, str(mean[0]), str(mean[1][0]), str(mean[1][1])]))
示例#36
0
import scipy.spatial.distance
from itertools import combinations
import xml.etree.ElementTree as ET

# Read the XML document and list its top-level elements.
tree = ET.parse('total_fs/stage4/total_fs_linked.tree.xml')
root = tree.getroot()
for child in root:
    print(child.tag, child.attrib)

# Keep the text of the last <Tree> element found in the document.
for i in root.iter('Tree'):
    poptree = i.text

dendtree = Tree(poptree)  # from ete3

leaves = dendtree.get_leaf_names()
n = len(leaves)
dmat = np.zeros((n, n))

# Generate the symmetric distance matrix. Enumerating the leaves gives the
# row/column index of each pair directly, instead of searching the list with
# leaves.index() four times per pair.
for (idx1, l1), (idx2, l2) in combinations(enumerate(leaves), 2):
    d = dendtree.get_distance(l1, l2)
    dmat[idx1, idx2] = dmat[idx2, idx1] = d

# Average-linkage clustering on the condensed form of the matrix.
# NOTE(review): the input is already a distance matrix, so `metric` is
# presumably unused by scipy here -- confirm.
schlink = sch.linkage(scipy.spatial.distance.squareform(dmat),method='average',metric='euclidean', optimal_ordering=True)
# Despite the file name, it is the linkage result that is saved here.
np.savetxt('DistMat_fromFS.txt', schlink, fmt='%f')

#To load
#b = np.loadtxt('DistMat_fromFS.txt', dtype=float)