示例#1
0
def saveTrees(name1, name2, name3):
    """Append taxonomy labels to tree leaf names and write one tree per group.

    The TSV rows are expected to be grouped consecutively by the text that
    precedes the first "-" on the line: whenever that prefix changes, the
    tree of the previous group is loaded, relabelled and written out.

    Args:
        name1: path to the TSV file with the taxonomic description of the
            contigs. The first column is ``<group>-<contig id>`` and the
            second column is the label appended to the matching leaf.
        name2: location (path prefix) of the tree files obtained from
            PhyloMagnet; ``<group>.newick`` is appended to it.
        name3: location (path prefix) where the new trees are saved;
            ``<group>.nw`` is appended to it.

    Returns:
        The string "Done". An input file without any record writes nothing.
    """

    def _annotate_and_write(group, labels):
        # Rename every leaf that has a label, then save the tree of the group.
        t = Tree(name2 + group + ".newick")
        for leaf in t.get_leaves():
            if leaf.name in labels:
                leaf.name = leaf.name + "_" + labels[leaf.name]
        t.write(format=1, outfile=name3 + group + ".nw")

    with open(name1) as handle:
        # Skip empty lines instead of blindly dropping the last element, so a
        # file without a trailing newline does not lose its last record.
        rows = [row for row in handle.read().split("\n") if row]
    if not rows:
        return "Done"

    current = rows[0].split("-")[0]
    labels = {}
    for row in rows:
        group = row.split("-")[0]
        if group != current:
            _annotate_and_write(current, labels)
            current = group
            labels = {}
        columns = row.split("\t")
        contig = "-".join(columns[0].split("-")[1:])
        # Rebuild the leaf name: drop the first character of the contig id and
        # the character at index 9 of what remains, then prefix with "Q_C".
        contig = "Q_C" + contig[1:][:9] + contig[1:][10:]
        labels[contig] = columns[1]
    _annotate_and_write(current, labels)
    return "Done"
def write_ancgenes(clustered_genes,
                   treedir,
                   out_ancgenes,
                   clusters_to_load=None):
    """
    Write one tab-separated line per gene family: family name, space-separated
    sorted leaf names of its tree, and the family class.

    Args:
        clustered_genes (dict): gene family name (key) -> family class (value)
        treedir (str): directory containing the gene trees
        out_ancgenes (str): name of the output file
        clusters_to_load (list, optional): if given, only families whose class
            is in this list are written, and the class column holds the index
            of the class in this list.
    """

    restrict = clusters_to_load is not None

    with open(out_ancgenes, 'w') as outfile:

        for gene, cluster in clustered_genes.items():

            # skip family classes that were not requested
            if restrict and cluster not in clusters_to_load:
                continue

            # accepted file names for the tree of this family, by priority;
            # when none exists the last one is reported by the assertion
            candidates = (
                treedir + '/' + gene + '.nhx',
                treedir + '/' + gene + '.nh',
                treedir + '/C_' + gene + '.nh',
                treedir + "/" + gene + "_final.nhx",
            )
            for treefile in candidates:
                if os.path.exists(treefile):
                    break

            assert os.path.exists(
                treefile), f"The file {treefile} does not exist"

            leaf_names = [leaf.name for leaf in Tree(treefile).get_leaves()]

            # strip the last '_'-separated field of each leaf name; if that
            # leaves nothing at all, fall back to the full leaf names
            leaves = {'_'.join(name.split('_')[:-1]) for name in leaf_names}
            if leaves == {''}:
                leaves = set(leaf_names)

            descendants = sorted(leaves)

            if restrict:
                cluster = str(clusters_to_load.index(cluster))

            outfile.write(
                '\t'.join((gene, ' '.join(descendants), cluster)) + '\n')
示例#3
0
def is_proper_newick(newick_data, dont_raise=False, names_with_only_digits_ok=False):
    """Check that `newick_data` loads as a tree with unique leaf names.

    Any error while loading the tree, or the presence of duplicate leaf
    names, either returns False (`dont_raise=True`) or raises
    FilesNPathsError. Leaf names made only of digits always raise
    FilesNPathsError unless `names_with_only_digits_ok` is set; this check is
    not affected by `dont_raise`.

    Returns True when every check passes.
    """
    try:
        tree = Tree(newick_data, format=1)

        observed, repeated = set(), set()
        for leaf_name in (leaf.name for leaf in tree.get_leaves()):
            if leaf_name in observed:
                repeated.add(leaf_name)
            observed.add(leaf_name)

        if repeated:
            raise Exception("Your newick tree contains duplicate leaves, here is a list of them: %s" % ", ".join(repeated))

    except Exception as e:
        if not dont_raise:
            raise FilesNPathsError("Your tree doesn't seem to be properly formatted. Here is what ETE had "
                                   "to say about this: '%s'. Pity :/" % e)
        return False

    digit_only_names = [leaf.name for leaf in tree.get_leaves() if leaf.name.isdigit()]
    if digit_only_names and not names_with_only_digits_ok:
        raise FilesNPathsError("Your tree contains names that are composed of only digits (like this one: '%s'). Sadly, anvi'o "
                               "is not happy with such names in newick trees or clustering dendrograms :( Anvi'o developers "
                               "apologize for the inconvenience." % (digit_only_names[0]))

    return True
示例#4
0
文件: phy.py 项目: zhaokai2014/wgd
def phylogenetic_tree_to_cluster_format(tree, pairwise_estimates):
    """
    Convert a phylogenetic tree to a 'cluster' data structure as in
    ``fastcluster``. The first two columns indicate the nodes that are joined by
    the relevant node, the third indicates the distance (calculated from branch
    lengths in the case of a phylogenetic tree) and the fourth the number of
    leaves underneath the node. Note that the trees are rooted using
    midpoint-rooting.

    Example of the data structure (output from ``fastcluster``)::

        [[   3.            7.            4.26269776    2.        ]
         [   0.            5.           26.75703595    2.        ]
         [   2.            8.           56.16007598    2.        ]
         [   9.           12.           78.91813609    3.        ]
         [   1.           11.           87.91756528    3.        ]
         [   4.            6.           93.04790855    2.        ]
         [  14.           15.          114.71302639    5.        ]
         [  13.           16.          137.94616373    8.        ]
         [  10.           17.          157.29055403   10.        ]]

    Leaves are renamed in place to the integer position of their name in
    ``pairwise_estimates.index``; internal nodes are numbered in post-order
    starting from the number of index entries.

    :param tree: newick tree file
    :param pairwise_estimates: pairwise Ks estimates data frame (pandas)
        (only the index is used)
    :return: clustering data structure, pairwise distances dictionary
        (leaf id -> {leaf id -> distance between the two leaves})
    """
    # original leaf name -> integer position in the data frame index
    id_map = {
        pairwise_estimates.index[i]: i for i in range(len(pairwise_estimates))}
    t = Tree(tree)

    # midpoint rooting
    midpoint = t.get_midpoint_outgroup()
    if not midpoint:  # midpoint = None when there are only two leaves
        midpoint = list(t.get_leaves())[0]
    t.set_outgroup(midpoint)
    logging.debug('Tree after rooting:\n{}'.format(t.get_ascii()))

    # algorithm for getting cluster data structure
    # n is taken before the loop adds identity entries to id_map, so internal
    # node ids start right after the original index positions
    n = len(id_map)
    out = []
    pairwise_distances = {}
    for node in t.traverse('postorder'):
        if node.is_leaf():
            node.name = id_map[node.name]
            id_map[node.name] = node.name  # add identity map for renamed nodes
            # to id_map for line below
            # leaves visited earlier already carry their integer id, the
            # others still carry their original name; id_map resolves both
            pairwise_distances[node.name] = {
                id_map[x.name]: node.get_distance(x) for x in t.get_leaves()
            }
        else:
            node.name = n
            n += 1
            children = node.get_children()
            # only the first two children are recorded
            # NOTE(review): assumes every internal node is binary -- confirm
            out.append(
                [children[0].name, children[1].name,
                 children[0].get_distance(children[1]),
                 len(node.get_leaves())])
    return np.array(out), pairwise_distances
示例#5
0
def initialise(rate):
    """Create a starting tree, apply one `birth` event and add a waiting time.

    A single-node tree flagged as not extinct is created, `birth` (defined
    elsewhere) is applied to a randomly chosen leaf, then a waiting time drawn
    from an exponential distribution with parameter `rate` is added to the
    branch length of every leaf that is not extinct.

    Returns the tree returned by `birth`.
    """
    root = Tree()
    root.add_features(extinct=False)
    root.dist = 0.0
    chosen = random.choice(root.get_leaves())
    grown = birth(root, chosen)
    tips = grown.get_leaves()
    waiting_time = random.expovariate(rate)
    for tip in tips:
        if tip.extinct:
            continue
        tip.dist += waiting_time
    return grown
示例#6
0
def tree_distances(file):
    """Write the leaf-to-leaf distance matrix of a tree to a TSV file.

    The matrix is written to ``<file>.patristic-dist.tsv``: a header line with
    one tab-prefixed leaf name per column, then one row per leaf holding the
    leaf name followed by its distance to every leaf (``%f`` formatted).
    Negative distances are clipped to 0.

    Args:
        file: path of the tree file, passed to ``Tree``.
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    # The context manager closes the output even if a distance lookup fails.
    with open(file + ".patristic-dist.tsv", "w") as branch_len_out:
        header = "".join("\t" + leaf.name for leaf in all_leaves)
        branch_len_out.write(header + "\n")

        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                distance = np.clip(leaf1.get_distance(leaf2), 0.0, 99999999999999999999999999)
                cells.append("%f" % distance)
            branch_len_out.write("\t".join(cells) + "\n")
示例#7
0
def merge_trees_and_write(trees, outgr, outfile, keep_br=False):

    """
    Groups the input subtrees under one node, attaches the outgroup gene as
    its sister and writes the resulting tree to file.

    Args:
        trees (list of ete3.Tree): Tree(s) to merge
        outgr (str): Outgroup gene name
        outfile (str): Output filename
        keep_br (bool, optional): if True, write with the default newick
            format of `write`; otherwise write with format 9.
    """

    # all resolved subtrees become children of a common node
    ingroup = Tree()
    for subtree in trees:
        ingroup.add_child(subtree)

    # the common node and the outgroup leaf are the two children of the root
    merged_final = Tree()
    merged_final.add_child(ingroup)
    merged_final.add_child(name=outgr)
    merged_final.prune(list(merged_final.get_leaves()))

    write_options = {"outfile": outfile}
    if not keep_br:
        write_options["format"] = 9
    merged_final.write(**write_options)
示例#8
0
def get_orthogroups_genes(ctree, outgr_gene_name):
    """
    Collects the parent nodes of the non-outgroup leaves of the constrained
    tree (at most two are collected) together with their number of leaves.

    Args:
        ctree (str): input tree file in newick format.
        outgr_gene_name (str): gene name of the outgroup gene.

    Returns:
        dict: the 1 or 2 polytomy node(s) and their corresponding size.
        str: full outgroup gene name (with species tag); empty string if the
        outgroup leaf was not reached before the scan stopped.
    """

    constrained = Tree(ctree)
    orthogroups = {}
    outgr = ''

    for leaf in constrained.get_leaves():

        # leaf names carry a trailing '_'-separated tag that is ignored here
        gene_id = '_'.join(leaf.name.split('_')[:-1])

        if gene_id == outgr_gene_name:
            outgr = leaf.name

        else:
            parent = leaf.up
            if parent not in orthogroups:
                orthogroups[parent] = len(parent.get_leaves())

        # stop scanning as soon as two parent nodes are known
        if len(orthogroups) == 2:
            break

    return orthogroups, outgr
示例#9
0
def get_anc_order(tree_file, ancestors, tips_to_root=False):
    """
    Sorts the input ancestors by their distance to the root of the species
    tree and lists, for each, the other input ancestors located below it.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        ancestors (list of str): list of ancestor names
        tips_to_root (bool, optional): if True, the order is reversed (largest
            distance to the root first).

    Returns:
        OrderedDict: ancestor names in the requested order (keys) and list of ancestors in the
        input list that are below it (values).
    """

    species_tree = Tree(tree_file, format=1)
    species_tree.prune(list(species_tree.get_leaves()))

    root_distance = {}
    for name in ancestors:
        root_distance[name] = species_tree.get_distance(name)

    ordered = sorted(root_distance, key=root_distance.get)
    if tips_to_root:
        ordered.reverse()

    result = OrderedDict()
    for anc in ordered:
        anc_node = search_one_node(species_tree, anc)
        result[anc] = [
            other for other in ancestors
            if other != anc and is_below(anc_node, other)
        ]

    return result
示例#10
0
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    Reroot a Newick tree on the leaves matching a prefix from `outgroupfile`.

    Lines of `outgroupfile` are tried in order; the first one that is a prefix
    of at least one leaf name selects the outgroup leaves. Returns `outfile`
    after writing the rerooted tree, or `treefile` unchanged when no line
    matches any leaf.

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees
    """
    tree = Tree(treefile, format=format)
    leaf_names = [node.name for node in reversed(tree.get_leaves())]

    outgroup = []
    for prefix in must_open(outgroupfile):
        prefix = prefix.strip()
        outgroup = [name for name in leaf_names if name.startswith(prefix)]
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        ancestor = tree.get_common_ancestor(*outgroup)
        tree.set_outgroup(ancestor)
    except ValueError:
        # fall back to rooting on the first matching leaf only
        assert isinstance(outgroup, list)
        outgroup = outgroup[0]
        tree.set_outgroup(outgroup)
    tree.write(outfile=outfile, format=format)

    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
示例#11
0
def tree_distances(file):
    """Write the leaf-to-leaf distance matrix of a tree to a TSV file.

    The matrix is written to ``<file>.patristic-dist.tsv``: a header line with
    one tab-prefixed leaf name per column, then one row per leaf holding the
    leaf name followed by its distance to every leaf (``%f`` formatted).
    Negative distances are clipped to 0.

    Args:
        file: path of the tree file, passed to ``Tree``.
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    # The context manager closes the output even if a distance lookup fails.
    with open(file + ".patristic-dist.tsv", "w") as branch_len_out:
        header = "".join("\t" + leaf.name for leaf in all_leaves)
        branch_len_out.write(header + "\n")

        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                distance = np.clip(leaf1.get_distance(leaf2), 0.0,
                                   99999999999999999999999999)
                cells.append("%f" % distance)
            branch_len_out.write("\t".join(cells) + "\n")
示例#12
0
    def RapidNJ(names,
                profiles,
                embeded,
                handle_missing='pair_delete',
                **params):
        """Build a tree from `profiles` with the external RapidNJ executable.

        A distance matrix is computed with `distance_matrix.get_distance`,
        written to ``params['tempfix'] + 'dist.list'``, and handed to the
        executable found under ``params['RapidNJ_<platform.system()>']``.
        The resulting newick file is loaded, all temporary files starting
        with the distance file name are deleted, the tree is midpoint-rooted
        then unrooted (best effort), and the numeric leaf labels are replaced
        by the matching entries of `names`.

        `embeded` is accepted but not used by this method.

        Returns the loaded tree with renamed leaves.
        """
        dist = distance_matrix.get_distance('symmetric', profiles,
                                            handle_missing)

        dist_file = params['tempfix'] + 'dist.list'
        d = None  # keeps the `del` below valid even when the matrix is empty
        with open(dist_file, 'w') as fout:
            fout.write('    {0}\n'.format(dist.shape[0]))
            for n, d in enumerate(dist):
                fout.write('{0!s:10} {1}\n'.format(
                    n, ' '.join(['{:.6f}'.format(dd) for dd in d])))
        # drop the matrix (and the last row still bound to `d`) before running
        # the external program
        del dist, d
        Popen([
            params['RapidNJ_{0}'.format(platform.system())], '-n', '-x',
            dist_file + '_rapidnj.nwk', '-i', 'pd', dist_file
        ],
              stdout=PIPE,
              stderr=PIPE).communicate()
        tree = Tree(dist_file + '_rapidnj.nwk')
        for fname in glob(dist_file + '*'):
            os.unlink(fname)

        try:
            tree.set_outgroup(tree.get_midpoint_outgroup())
            tree.unroot()
        except Exception:
            # Rerooting is best effort: keep the tree as loaded when it fails,
            # but let KeyboardInterrupt / SystemExit propagate.
            pass

        for leaf in tree.get_leaves():
            leaf.name = names[int(leaf.name.strip("'"))]
        return tree
示例#13
0
def closest_dna_dist(treefile):
    """
    For every leaf of the tree, record the leaf returned by ete's
    get_closest_leaf and its distance (per ete's description this is the
    closest descendant leaf, which may or may not be what is wanted).
    Note that this may not be symmetric.
    :param treefile: The tree file to read
    :return: a dict of a node and its closest leaf
    """

    global verbose
    if verbose:
        sys.stderr.write("Getting closest distances\n")
    tree = Tree(treefile)
    leaves = tree.get_leaves()
    # one (initially empty) entry per leaf name
    dist = {leaf.name: {} for leaf in leaves}
    for leaf in leaves:
        nearest, span = leaf.get_closest_leaf()
        dist[leaf.name][nearest.name] = span
        if verbose:
            sys.stderr.write("{} -> {} : {}\n".format(leaf.name, nearest.name, span))
    if verbose:
        sys.stderr.write("\tDone\n")
    return dist
示例#14
0
def is_proper_newick(newick_data, dont_raise=False):
    """Check that `newick_data` loads as a tree with unique leaf names.

    Any error while loading the tree, or the presence of duplicate leaf
    names, either returns False (`dont_raise=True`) or raises
    FilesNPathsError carrying the underlying message.

    Returns True when the tree loads and has no duplicate leaves.
    """
    try:
        tree = Tree(newick_data, format=1)

        seen = set()
        duplicates = set()
        for leaf in tree.get_leaves():
            name = leaf.name
            if name in seen:
                duplicates.add(name)
            seen.add(name)

        if duplicates:
            raise Exception(
                "Your newick tree contains duplicate leaves, here is a list of them: %s"
                % ", ".join(duplicates))

    except Exception as e:
        if dont_raise:
            return False
        else:
            # Adjacent literals instead of a backslash continuation inside the
            # string, which used to embed the indentation in the message.
            raise FilesNPathsError(
                "Your tree doesn't seem to be properly formatted. Here is what ETE had "
                "to say about this: '%s'. Pity :/" % e)

    return True
示例#15
0
def time_tree(newick):
    """Time `polynomial_sum3_performance` on the tree loaded from `newick`.

    The function is called with the tree and its number of leaves plus one.
    Returns the wall-clock duration of the call and the second element of its
    result.
    """
    tree = Tree(newick)
    started = time.time()
    result = polynomial_sum3_performance(tree, len(tree.get_leaves()) + 1)
    sum3_dt = result[1]
    finished = time.time()
    return finished - started, sum3_dt
示例#16
0
def convert_tree(treefile, output, d_conv=None, text=''):
    """
    Renames the leaves of an input tree and writes the result with newick
    format 9. The renaming dictionary is generated when none (or an empty one)
    is given.

    Args:
        treefile (file): input tree in newick format.
        output (str): name for the output file.
        d_conv (dict, optional): Conversion from old to new IDs.
        text (str, optional): Debug information

    Returns:
        dict: Conversion old to new IDs.

    """

    tree = Tree(treefile)

    if not d_conv:

        # For treebest-type IDs (i.e last '_' is followed by species name):
        # new ID = first 3 letters of the species name + position of the leaf.
        original_names = [leaf.name for leaf in tree.get_leaves()]
        d_conv = {
            gene: gene.split('_')[-1][0:3] + str(position)
            for position, gene in enumerate(original_names)
        }

    tree_leaves = tree.get_leaves()

    assert len(tree_leaves) == len(
        d_conv), "Trees have different number of leaves {}".format(text)

    for leaf in tree_leaves:

        assert leaf.name in d_conv, "{} present in {} but not in all trees".format(
            leaf.name, treefile)
        leaf.name = d_conv[leaf.name]

    tree.prune(list(tree.get_leaves()))

    tree.write(outfile=output, format=9)

    return d_conv
示例#17
0
 def simplify_names(
     input_path: str,
     output_path: str,
     names_translator: t.Optional[t.Dict[str, str]] = None,
 ) -> t.Optional[t.Dict[str, str]]:
     """
     Rename the sequences of a fasta file, or the leaves of a newick tree
     (chosen when ".nwk" occurs in the input path), and write the result.

     :param input_path: path with the original sequence names
     :param output_path:  path to which the sequences with the new names will be written
     :param names_translator: translator of new to old names. if not provided, simple names
         (S1, S2, ...) will be generated and returned
     :return: the generated new-to-original names mapping, or None when a translator was given
     """
     if ".nwk" in str(input_path):
         with open(input_path, "r") as infile:
             tree = Tree(infile.read(), format=1)
         tree_leaves = tree.get_leaves()
         if not names_translator:
             new_to_orig_name = dict()
             for serial, leaf in enumerate(tree_leaves, start=1):
                 simple_name = f"S{serial}"
                 new_to_orig_name[simple_name] = leaf.name
                 leaf.name = simple_name
             tree.write(outfile=output_path, format=5)
             return new_to_orig_name
         # translator maps new -> old, so invert it to look up by old name
         orig_to_new_name = {
             orig: new for new, orig in names_translator.items()
         }
         for leaf in tree_leaves:
             leaf.name = orig_to_new_name[leaf.name]
         tree.write(outfile=output_path, format=5)
         return None

     seq_records = list(SeqIO.parse(input_path, "fasta"))
     if not names_translator:
         new_to_orig_name = dict()
         for serial, record in enumerate(seq_records, start=1):
             simple_name = f"S{serial}"
             new_to_orig_name[simple_name] = record.description
             record.description = simple_name
             record.id = simple_name
             record.name = simple_name
         SeqIO.write(seq_records, output_path, "fasta")
         return new_to_orig_name
     # translator maps new -> old, so invert it to look up by old name
     orig_to_new_name = {
         orig: new for new, orig in names_translator.items()
     }
     for record in seq_records:
         new_name = orig_to_new_name[record.description]
         record.description = new_name
         record.name = new_name
         record.id = new_name
     SeqIO.write(seq_records, output_path, "fasta")
     return None
def make_tree_from_groups(subtree_leaves,
                          species_groups,
                          groups_are_genes=False):
    """
    Builds a gene tree from an outgroup and two groups, given either as
    species (genes are then picked from `subtree_leaves` by their `S`
    attribute) or directly as gene names.

    Args:
        subtree_leaves (list of ete3.nodes): all genes to place in the tree
        species_groups (list of str): species to group together (first group is outgroup)
        groups_are_genes (bool, optional): set to True if species_groups are groups of genes

    Returns:
        ete3.Tree : resulting gene tree
        str : one outgroup gene name, to identify the tree
    """

    tree = Tree()

    outgr, group1, group2 = species_groups

    if not groups_are_genes:
        # translate each species group into the set of its gene names
        outgr, group1, group2 = (
            {leaf.name for leaf in subtree_leaves if leaf.S in species}
            for species in (outgr, group1, group2)
        )

    outgr_gene = list(outgr)[0]

    if len(outgr) < 2:
        # single outgroup gene: attached directly under the root
        tree.add_child(name=outgr.pop())
    else:
        outgr_node = tree.add_child(name='outgr_node')
        for gene in outgr:
            outgr_node.add_child(name=gene)

    if group1 and group2:
        # two non-empty groups: one intermediate node per group
        ingroup = tree.add_child(name="anc_3r")
        for label, members in (("gr1", group1), ("gr2", group2)):
            group_node = ingroup.add_child(name=label)
            for gene in members:
                group_node.add_child(name=gene)

    elif group1 or group2:
        # only one non-empty group: its genes hang directly under anc_3r
        ingroup = tree.add_child(name="anc_3r")
        for gene in (group1 or group2):
            ingroup.add_child(name=gene)

    tree.prune(tree.get_leaves())
    return tree, outgr_gene
示例#19
0
def _add_observed_isotypes(
    tree: ete3.Tree,
    newidmap: Dict[str, str],
    isotype_order: Sequence[str],
    weight_matrix: Optional[Sequence[Sequence[float]]] = None,
):
    """Annotate every node of `tree` with an ``isotype`` feature, in place.

    Three passes are made over the tree:

    1. Every non-leaf descendant with positive abundance gets a new leaf child
       carrying its name, sequence and abundance; its own abundance is reset
       to 0.
    2. Every leaf with abundance 0 gets the ambiguous isotype ("?"). Every
       other leaf is looked up in `newidmap`: with a single isotype the leaf
       is labelled with it; with several isotypes one child per isotype is
       added below the leaf and the leaf's abundance is reset to 0. A missing
       entry is treated as ambiguous, with a warning.
    3. Every node that is not a leaf after passes 1-2 gets the ambiguous
       isotype.

    Args:
        tree: tree whose nodes expose ``abundance``, ``sequence`` and ``name``.
        newidmap: node name -> mapping of isotype name to a collection of
            original sequence IDs (this is how the values are used below).
            NOTE(review): the ``Dict[str, str]`` annotation does not match
            this usage -- confirm against callers.
        isotype_order: passed to ``IsotypeTemplate``.
        weight_matrix: passed to ``IsotypeTemplate``.

    Returns:
        None; `tree` is modified in place.
    """
    # Drop observed nodes as leaves and explode by observed isotype:
    # Descend internal observed nodes as leaves:
    newisotype = IsotypeTemplate(isotype_order,
                                 weight_matrix=weight_matrix).new
    # list(...) snapshots the nodes because children are added while looping
    for node in list(tree.iter_descendants()):
        if node.abundance > 0 and not node.is_leaf():
            newchild = ete3.TreeNode(name=node.name)
            newchild.add_feature("sequence", node.sequence)
            newchild.add_feature("abundance", node.abundance)
            node.abundance = 0
            node.add_child(child=newchild)
    # Now duplicate nodes which represent multiple isotypes
    for node in list(tree.get_leaves()):
        if node.abundance == 0:
            node.add_feature("isotype", newisotype("?"))
        else:
            # node.name is expected to be in newidmap, since this is an
            # observed node; a missing entry falls back to ambiguous isotype
            try:
                thisnode_isotypemap = newidmap[node.name]
            except KeyError as e:
                warnings.warn(
                    f"The sequence name {e} labels an observed node, but no mapping to an original sequence ID was found."
                    " Isotype will be assumed ambiguous.")
                # one placeholder ID per observed copy of the sequence
                thisnode_isotypemap = {
                    "?": {f"Unknown_id_{n+1}"
                          for n in range(node.abundance)}
                }
            if "?" in thisnode_isotypemap:
                warnings.warn(
                    f"The sequence name {node.name} labels an observed node, and corresponds to sequence IDs for "
                    "which no observed isotype was provided. "
                    f" Isotype will be assumed ambiguous for: {', '.join(thisnode_isotypemap['?'])}"
                )
            if len(thisnode_isotypemap) > 1:
                for isotype, cell_ids in thisnode_isotypemap.items():
                    # add new node below this leaf node. Must be below, and not child
                    # of parent, to preserve max parsimony in case that node.up has
                    # different sequence from node.
                    newchild = ete3.TreeNode(name=node.name)
                    newchild.add_feature("abundance", len(cell_ids))
                    newchild.add_feature("sequence", node.sequence)
                    newchild.add_feature("isotype", newisotype(isotype))
                    node.add_child(child=newchild)
                node.abundance = 0
            else:
                node.isotype = newisotype(list(thisnode_isotypemap.keys())[0])
    # Now add ancestral ambiguous isotypes
    for node in tree.traverse():
        if not node.is_leaf():
            node.add_feature("isotype", newisotype("?"))
示例#20
0
def rerootTree(treefile, output, ogterm="OG--", fmat=3):
    """Reroot a tree on its first leaf whose name starts with `ogterm`.

    The rerooted tree is ladderized (direction=1) and written to `output`
    with newick format `fmat`.

    Returns True after writing, or False (nothing written) when no leaf name
    starts with `ogterm`.
    """
    intree = Tree(treefile)
    candidates = [leaf for leaf in intree.get_leaves()
                  if leaf.name.startswith(ogterm)]
    if not candidates:
        return False

    outgroup_leaf = candidates[0]
    intree.set_outgroup(outgroup_leaf.name)
    intree.ladderize(direction=1)
    intree.write(outfile=output, format=fmat)
    return True
示例#21
0
def main():
    """Prune the tree of every row of a gzipped table and append TMRCA stats.

    Reads the gzipped, whitespace-separated table `args.input_file_name`; the
    first line is a header. For every other line the tree in field 32 is
    pruned to the leaves whose name contains none of the excluded population
    or individual tokens (`args.exclpops` / `args.exclindivs`,
    comma-separated), and the three values returned by `tmrca_stats` are
    appended. Rows are written tab-separated to the gzipped
    `args.output_file_name`.
    """
    excluded_populations = (args.exclpops.split(',')
                            if args.exclpops is not None else [])
    excluded_individuals = (args.exclindivs.split(',')
                            if args.exclindivs is not None else [])
    # a leaf is dropped as soon as any of these substrings occurs in its name
    excluded_tokens = excluded_populations + excluded_individuals

    with gzip.open(args.input_file_name, 'rb') as input_file, \
            gzip.open(args.output_file_name, 'wb') as output_file:
        for line_number, line in enumerate(input_file):

            fields = [x.decode() for x in line.split()]

            if line_number == 0:
                # header: only the three new column names are appended
                fields += [
                    'pruned_tmrca', 'pruned_tmrca_half', 'pruned_coal_half'
                ]
                output_file.write(('\t'.join(fields) + '\n').encode())
                continue

            tree = Tree(fields[32])

            included_leaves = [
                leaf.name for leaf in tree.get_leaves()
                if not any(token in leaf.name for token in excluded_tokens)
            ]

            prune(tree, included_leaves)

            # hack to ensure there is no nondicotomic node under the root:
            only_child_is_internal = (len(tree.children) == 1
                                      and not tree.children[0].is_leaf())
            if only_child_is_internal:
                tree.children[0].delete(preserve_branch_length=True)

            assert set(tree.get_leaf_names()) == set(included_leaves)

            tmrca, tmrca_half, coal_half = tmrca_stats(tree)

            fields += [str(tmrca), str(tmrca_half), str(coal_half)]
            output_file.write(('\t'.join(fields) + '\n').encode())
示例#22
0
def remove_anc(tree_file, out_file):
    """
    Loads a tree (newick format 1), prunes it to its own leaves and writes it
    with newick format 9, so that internal node names such as ancestor names
    are left out of the new file.

    Args:
        tree_file (str): Path to the input newick formatted tree.
        out_file (str): Path for the output file.
    """

    species_tree = Tree(tree_file, format=1)
    all_leaves = list(species_tree.get_leaves())
    species_tree.prune(all_leaves)
    species_tree.write(outfile=out_file, format=9)
def get_scorpios_aore_tree(gene_list, treefile, outgroups, outgr_gene):
    """
    Loads the AORe gene tree built by SCORPiOs, restricted to the genes of
    `gene_list`. Genes of `gene_list` missing from the tree are added next to
    the outgroup gene when they belong to an outgroup species.

    Args:
        gene_list (dict): dict of gene_names (key) : species_names (value) to keep in the tree
        treefile (str): name of the input tree file
        outgroups (list of str): list of outgroup species to keep/add in tree
        outgr_gene (str): name of the outgroup gene

    Returns:
        ete3.Tree : the loaded tree, or None if a missing gene does not belong
        to an outgroup species
    """

    tree = Tree(treefile)
    original_leaves = tree.get_leaves()

    # leaf names end with a '_'-separated species tag: remove it
    for leaf in original_leaves:
        leaf.name = '_'.join(leaf.name.split('_')[:-1])

    tree.prune([leaf for leaf in original_leaves if leaf.name in gene_list])

    kept = {leaf.name for leaf in tree.get_leaves()}
    wanted = set(gene_list.keys())

    if kept != wanted:

        missing = wanted.difference(kept)

        outgr_node = tree.get_leaves_by_name(outgr_gene)[0]

        # subtree grouping the missing outgroup genes with the outgroup gene
        outgr_t = Tree()
        for gene in missing:
            if gene_list[gene] not in outgroups:
                return None  #TODO: print the kind of cases covered here?
            outgr_t.add_child(name=gene)
        outgr_t.add_child(name=outgr_gene)
        outgr_node.add_child(outgr_t)

    tree.prune(tree.get_leaves())

    return tree
示例#24
0
def read_clustertree_fromnewick(treefpath: str):
    """Load a cluster tree and split each leaf name into its accessions.

    Arguments:
    treefpath: newick tree with leaf clusternames = including |-delimited accs

    Returns:
    ete3.Tree with same names and, on every leaf, an added feature `accs` =
    list obtained by splitting the leaf name on '|' (leading and trailing '|'
    are ignored)
    """
    ctree = Tree(treefpath)
    for cluster_leaf in ctree.get_leaves():
        accessions = list(cluster_leaf.name.strip('|').split('|'))
        cluster_leaf.add_feature('accs', accessions)
    return ctree
示例#25
0
    def ninja(names,
              profiles,
              embeded,
              handle_missing='pair_delete',
              **params):
        """Build a tree from `profiles` with the external NINJA jar.

        A distance matrix is computed with `distance_matrix.get_distance`,
        divided by the number of profile columns and written to
        ``params['tempfix'] + 'dist.list'``. The jar found under
        ``params['ninja_<platform.system()>']`` is run through ``java``, first
        with ``-d64`` and a heap of 90% of the total memory, then again with a
        1200M heap if stderr mentions '64-bit JVM'. The newick output is
        loaded, temporary files are deleted, branch lengths are multiplied
        back by the number of profile columns, the tree is midpoint-rooted
        then unrooted (best effort), and the numeric leaf labels are replaced
        by the matching entries of `names`.

        `embeded` is accepted but not used by this method.

        Returns the loaded tree with renamed leaves.
        """
        dist = distance_matrix.get_distance('symmetric', profiles,
                                            handle_missing)
        dist = dist / profiles.shape[1]
        dist_file = params['tempfix'] + 'dist.list'
        d = None  # keeps the `del` below valid even when the matrix is empty
        with open(dist_file, 'w') as fout:
            fout.write('    {0}\n'.format(dist.shape[0]))
            for n, d in enumerate(dist):
                fout.write('{0!s:10} {1}\n'.format(
                    n, ' '.join(['{:.6f}'.format(dd) for dd in d])))
        # drop the matrix (and the last row still bound to `d`) before running
        # the external program
        del dist, d
        free_memory = int(0.9 * psutil.virtual_memory().total / (1024.**2))
        ninja_out = Popen([
            'java', '-d64', '-Xmx' + str(free_memory) + 'M', '-jar',
            params['ninja_{0}'.format(
                platform.system())], '--in_type', 'd', dist_file
        ],
                          stdout=PIPE,
                          stderr=PIPE,
                          universal_newlines=True).communicate()
        if ninja_out[1].find('64-bit JVM') >= 0:
            # retry without -d64 and with a fixed heap size
            ninja_out = Popen([
                'java', '-Xmx1200M', '-jar', params['ninja_{0}'.format(
                    platform.system())], '--in_type', 'd', dist_file
            ],
                              stdout=PIPE,
                              stderr=PIPE,
                              universal_newlines=True).communicate()
        with open(dist_file + '.nwk', 'wt') as fout:
            fout.write(ninja_out[0])
        tree = Tree(dist_file + '.nwk')
        for fname in glob(dist_file + '*'):
            os.unlink(fname)

        # undo the normalisation applied to the distances above
        for node in tree.traverse():
            node.dist *= profiles.shape[1]

        try:
            tree.set_outgroup(tree.get_midpoint_outgroup())
            tree.unroot()
        except Exception:
            # Rerooting is best effort: keep the tree as loaded when it fails,
            # but let KeyboardInterrupt / SystemExit propagate.
            pass

        for leaf in tree.get_leaves():
            leaf.name = names[int(leaf.name.strip("'"))]
        return tree
示例#26
0
def tree_distances_info(file,scale,seq_len):
    """Write the leaf-to-leaf distance matrix and summary statistics of a tree.

    Two files are created next to ``file``:

    * ``<file>.<seq_len>.patristic-dist.tsv`` -- tab-separated matrix of
      ``leaf1.get_distance(leaf2)`` for every ordered pair of leaves
      (including each leaf with itself), with a header row of leaf names.
    * ``<file>.<seq_len>.tree-info.txt`` -- the summary lines that are also
      printed to standard output.

    Args:
        file: value passed to ``Tree(...)``; also the prefix of both output
            file names.
        scale: number reported as the scale factor and used as a divisor of
            the average distance.
        seq_len: integer reported as the sequence length and embedded in the
            output file names.

    Raises:
        ZeroDivisionError: if ``scale`` is 0.
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    total_distance = 0
    nb_of_distances = 0
    max_len = 0
    # Sentinel: reported unchanged when no strictly positive distance exists.
    min_len = 99999999999999

    # Computing patristic distance matrix
    with open(file + ".%d.patristic-dist.tsv" % seq_len, "w") as branch_len_out:
        header = "".join("\t" + leaf.name for leaf in all_leaves)
        branch_len_out.write(header + "\n")
        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                # Query the tree once per pair; the value feeds the matrix,
                # the running total and the min/max.
                distance = leaf1.get_distance(leaf2)
                total_distance += distance
                nb_of_distances += 1
                cells.append("%f" % distance)
                if distance > max_len:
                    max_len = distance
                # Zero distances (a leaf with itself) never count as minimum.
                if distance < min_len and distance > 0:
                    min_len = distance
            branch_len_out.write("\t".join(cells) + "\n")

    # NOTE(review): the average covers all ordered pairs, self-pairs
    # included, and is divided by ``scale`` while min/max are not -- confirm
    # this is the intended definition.
    average = total_distance / (nb_of_distances * scale)
    summary = [
        "Scale_factor(1=original-tree)\t%f" % scale,
        "Seq_Length\t%d" % seq_len,
        "Number_of_leaves_(taxa)\t%d" % len(all_leaves),
        "Minimal_patristic_distance\t%f" % min_len,
        "Maximal_patristic_distance\t%f" % max_len,
        "Average_patristic_distance\t%f" % average,
    ]

    with open(file + ".%d.tree-info.txt" % seq_len, "w") as tree_info:
        tree_info.write("\n".join(summary) + "\n")

    for line in summary:
        print(line)
示例#27
0
    def make_tree_fig(tree_file, out_name, tax_level=None):
        """Render the tree stored on the first line of ``tree_file``.

        The last character of that line is dropped and the substrings
        ``";IM"`` and ``";CP"`` are replaced by ``"-IM"`` and ``"-"`` before
        the text is parsed.  When ``tax_level`` is truthy, each leaf found in
        the outer ``taxas`` mapping is renamed to its id followed by its
        taxonomy, joined with ``";"``, and leaves are given a background
        colour looked up in the outer ``cols`` mapping.

        Args:
            tree_file: path of the file holding the Newick text.
            out_name: output target passed to ``render``.
            tax_level: index into a leaf's taxonomy list selecting the value
                used for colouring; a falsy value disables the annotation.

        Returns:
            None.  Nothing is rendered when ``tree_file`` has no lines.
        """
        with open(tree_file) as handle:
            lines = handle.readlines()
        if len(lines) == 0:
            return None

        newick = lines[0][:-1].replace(";IM", "-IM").replace(";CP", "-")
        ete_tree = Tree(newick)

        if tax_level:
            # leaf -> taxonomy entry at ``tax_level`` (None when unavailable)
            level_by_leaf = {}
            for leaf in ete_tree.get_leaves():
                leaf_id = leaf.name
                # assumes the values of ``taxas`` are lists -- TODO confirm
                lineage = taxas.get(leaf_id)
                if lineage:
                    leaf.name = ";".join([leaf_id] + lineage)
                    if len(lineage) > tax_level:
                        level_by_leaf[leaf] = lineage[tax_level]
                    else:
                        level_by_leaf[leaf] = None
                else:
                    level_by_leaf[leaf] = None

            for leaf, level in level_by_leaf.items():
                leaf.set_style(NodeStyle())
                if level and cols.get(level):
                    leaf.img_style["bgcolor"] = cols[level]
                elif "acI" in leaf.name:
                    # Fallback colour for leaves whose name contains "acI".
                    leaf.img_style["bgcolor"] = cols['acI']

        # Circular layout.
        styl = TreeStyle()
        styl.mode = 'c'
        print(out_name)
        leaf_count = len(ete_tree.get_leaves())
        ete_tree.render(out_name, w=leaf_count * 5, tree_style=styl)
 def __init__(self, tree_paths, suffix=".aa.tre.renamed"):
     """Load one tree per path and annotate every leaf.

     Each tree is stored in ``self.trees`` under the base name of its path
     with ``suffix`` removed.  Leaf names are expected to contain ``"&"``
     followed by a part containing ``"|"``; the text between ``"&"`` and the
     first ``"|"`` becomes the ``isolate`` feature, the text after that
     ``"|"`` (up to the next one) the ``gene`` feature, and the text before
     the first ``"_"`` of the whole name the ``genus`` feature.

     Args:
         tree_paths: iterable of tree file paths.
         suffix: substring removed from each file name to form the key.

     Raises:
         IndexError: if a leaf name lacks the ``"&"`` or ``"|"`` separator.
     """
     self.trees = {}
     for path in tree_paths:
         marker = os.path.basename(path).replace(suffix, "")
         marker_tree = Tree(path)
         for tip in list(marker_tree.get_leaves()):
             after_amp = tip.name.split("&")[1]
             fields = after_amp.split("|")
             # Resolve both fields before touching the leaf.
             isolate, gene = fields[0], fields[1]
             genus = tip.name.split("_")[0]
             tip.add_feature("isolate", isolate)
             tip.add_feature("gene", gene)
             tip.add_feature("genus", genus)
         self.trees[marker] = marker_tree
示例#29
0
def select_rep_genomes(genomedb,
                       treefile,
                       threshold=0.01,
                       output="rep_strains.txt"):
    """Pick representative strains from a tree and write them to a file.

    The tree's descendants are visited in preorder.  For a node with 2 to
    100 leaves whose mean pairwise leaf distance is below ``threshold``, the
    leaf with the smallest metadata value is kept and every descendant of
    that node is skipped.  Any leaf reached without being skipped is kept
    as well.

    Args:
        genomedb: directory containing ``genome_metadata.txt``, a
            tab-separated file whose header line starts with
            ``assembly_id``.  Column 2 (0-based) is matched against the leaf
            names and column 4 is read as an integer ranking value
            (presumably a contig count -- verify against the file format).
        treefile: path of the tree to load.
        threshold: upper bound (exclusive) on the mean pairwise distance of
            a clade that is collapsed to one representative.
        output: path of the file receiving one strain name per line.

    Raises:
        KeyError: if a leaf of a collapsed clade has no metadata entry.
    """
    t = Tree(os.path.abspath(treefile))
    good_strains = []
    # Sets give constant-time membership tests for the loops below.
    to_skip = set()
    strains = {x.name for x in t.get_leaves()}

    contigs = {}
    with open(os.path.join(genomedb, "genome_metadata.txt"), 'r') as metadata:
        for line in metadata:
            # Skip the header and blank lines (e.g. a trailing newline).
            if line.startswith("assembly_id") or not line.strip():
                continue
            vals = line.rstrip().split("\t")
            if vals[2] in strains:
                contigs[vals[2]] = int(vals[4])

    for node in t.iter_descendants("preorder"):
        if node in to_skip:
            continue
        leafnodes = list(node.get_leaves())
        if len(leafnodes) > 100:
            continue
        if len(leafnodes) > 1:
            pairs = list(combinations(leafnodes, 2))
            dist = sum((node.get_distance(a, b) for a, b in pairs), 0.0)
            if dist / len(pairs) < threshold:
                # Keep the leaf with the smallest metadata value and ignore
                # everything below this node from now on.
                leafnodes.sort(key=lambda x: contigs[x.name])
                good_strains.append(leafnodes[0].name)
                to_skip.update(node.iter_descendants("preorder"))
        if node.is_leaf():
            good_strains.append(node.name)

    print(len(good_strains), "at threshold", threshold)
    with open(os.path.abspath(output), 'w') as o:
        o.write("\n".join(good_strains) + "\n")
示例#30
0
def remove_outgroup(tree, outgr):
    """
    Loads a subtree, roots it on the outgroup gene and removes that gene.

    The outgroup gene is the first leaf whose name, with its last
    '_'-separated field dropped, equals `outgr`.

    Args:
        tree: Input tree, passed as-is to the `Tree` constructor
        outgr (str): Outgroup name to match against the leaf names

    Returns:
        tuple: the pruned tree and the name of the removed outgroup leaf

    Raises:
        IndexError: if no leaf matches `outgr`

    """
    tree = Tree(tree)
    leaf_names = [leaf.name for leaf in tree.get_leaves()]

    matches = []
    for leaf_name in leaf_names:
        prefix = '_'.join(leaf_name.split('_')[:-1])
        if outgr == prefix:
            matches.append(leaf_name)
    outgr_gene = matches[0]

    # `tree & name` looks the leaf up by name.
    outgroup_node = tree & outgr_gene
    tree.set_outgroup(outgroup_node)

    kept = [leaf_name for leaf_name in leaf_names if leaf_name != outgr_gene]
    tree.prune(kept)
    return tree, outgr_gene
def make_matrix(treefile):
    """
    Print a tab-separated leaf-to-leaf distance matrix for a tree file

    The matrix goes to stdout with leaf names sorted; progress messages go
    to stderr. Every cell, including the last of a row, is followed by a tab.

    :param treefile: the tree file to read
    :return: None
    """

    tree = Tree(treefile)
    leaves = tree.get_leaves()

    # For each leaf, collect the nodes met while climbing to the root
    # (the root itself is left out).
    sys.stderr.write("Precalculating distances\n")
    paths = {}
    for leaf in leaves:
        ancestry = set()
        current = leaf
        while not current.is_root():
            ancestry.add(current)
            current = current.up
        paths[leaf] = ancestry

    # Each unordered pair is computed once and stored in both directions.
    leaf_distances = {leaf.name: {} for leaf in leaves}

    sys.stderr.write("Iterating over the leaves\n")
    for leaf1, leaf2 in combinations(leaves, 2):
        # Nodes on exactly one of the two root paths separate the leaves.
        separating = paths[leaf1] ^ paths[leaf2]
        distance = sum(node.dist for node in separating)
        leaf_distances[leaf1.name][leaf2.name] = distance
        leaf_distances[leaf2.name][leaf1.name] = distance

    names = sorted(leaf_distances.keys())
    sys.stdout.write("\t".join([""] + names) + "\n")
    for row_name in names:
        cells = []
        for col_name in names:
            if col_name == row_name:
                cells.append("0")
            else:
                cells.append("{}".format(leaf_distances[row_name][col_name]))
        sys.stdout.write(row_name + "\t" + "".join(c + "\t" for c in cells))
        sys.stdout.write("\n")
def make_dists(treefile, printone, verbose):
    """
    Print the pairwise leaf distances of a tree, one pair per line

    Each line is "name<TAB>name<TAB>distance" on stdout.

    :param treefile: the tree file to parse
    :param printone: if true each pair is printed once, smaller name first.
                     If false both A->B and B->A are printed
    :param verbose: write progress messages to stderr
    :return: None
    """

    tree = Tree(treefile)
    leaves = tree.get_leaves()

    # For each leaf, collect the nodes met while climbing to the root
    # (the root itself is left out).
    if verbose:
        sys.stderr.write("Precalculating distances\n")
    paths = {}
    for leaf in leaves:
        climbed = set()
        cursor = leaf
        while not cursor.is_root():
            climbed.add(cursor)
            cursor = cursor.up
        paths[leaf] = climbed

    if verbose:
        sys.stderr.write("Iterating over the leaves\n")

    line = "{}\t{}\t{}"
    for leaf1, leaf2 in combinations(leaves, 2):
        # Nodes on exactly one of the two root paths separate the leaves.
        separating = paths[leaf1] ^ paths[leaf2]
        distance = sum(node.dist for node in separating)
        name1, name2 = leaf1.name, leaf2.name
        if not printone:
            print(line.format(name1, name2, distance))
            print(line.format(name2, name1, distance))
            continue
        if name1 < name2:
            print(line.format(name1, name2, distance))
        else:
            print(line.format(name2, name1, distance))
def make_matrix(treefile):
    """
    Write the pairwise leaf distance matrix of a tree to stdout

    Rows and columns follow the sorted leaf names, cells are tab-terminated
    and the diagonal is printed as "0". Progress is reported on stderr.

    :param treefile: the tree file to read
    :return: None
    """

    def root_path(leaf):
        # Nodes from the leaf up to, but not including, the root.
        visited = set()
        node = leaf
        while not node.is_root():
            visited.add(node)
            node = node.up
        return visited

    tree = Tree(treefile)
    leaves = tree.get_leaves()

    sys.stderr.write("Precalculating distances\n")
    paths = {leaf: root_path(leaf) for leaf in leaves}

    # Only A-B pairs are enumerated; the B-A entry is filled in alongside.
    leaf_distances = {leaf.name: {} for leaf in leaves}

    sys.stderr.write("Iterating over the leaves\n")
    for first, second in combinations(leaves, 2):
        # Sum the branches that belong to only one of the two root paths.
        unshared = paths[first] ^ paths[second]
        distance = sum(branch.dist for branch in unshared)
        leaf_distances[first.name][second.name] = distance
        leaf_distances[second.name][first.name] = distance

    ordered = sorted(leaf_distances.keys())
    emit = sys.stdout.write
    emit("\t".join([""] + ordered) + "\n")
    for row in ordered:
        emit(row + "\t")
        for col in ordered:
            cell = "0" if col == row else "{}".format(leaf_distances[row][col])
            emit(cell + "\t")
        emit("\n")
示例#34
0
def is_proper_newick(newick_data, dont_raise=False):
    """Check that `newick_data` parses as a tree with unique leaf names.

    Any exception raised while parsing, as well as the duplicate-leaf error
    raised here, is handled the same way: `False` is returned when
    `dont_raise` is set, otherwise a `FilesNPathsError` carrying the original
    message is raised.

    Returns `True` when the tree parses and no leaf name is repeated.
    """
    try:
        leaves = Tree(newick_data, format=1).get_leaves()

        seen, duplicates = set(), set()
        for name in (leaf.name for leaf in leaves):
            if name in seen:
                duplicates.add(name)
            else:
                seen.add(name)

        if duplicates:
            raise Exception("Your newick tree contains duplicate leaves, here is a list of them: %s" % ", ".join(duplicates))

    except Exception as e:
        if not dont_raise:
            raise FilesNPathsError("Your tree doesn't seem to be properly formatted. Here is what ETE had\
                                    to say about this: '%s'. Pity :/" % e)
        return False

    return True
def make_matrix(treefile, outputf):
    """
    Create a leaf-to-leaf distance matrix from a tree file

    The matrix is written to `outputf` as tab-separated text with the leaf
    names sorted; every cell, including the last of a row, is followed by a
    tab. Progress messages go to stderr and progress dots to stdout (one dot
    per batch of pairs, a space after every five dots).

    :param treefile: the tree file to read
    :param outputf: the file to write the matrix to
    :return: None
    """

    tree = Tree(treefile, quoted_node_names=True, format=1)

    leaves = tree.get_leaves()

    # get the paths going up the tree
    # we get all the nodes up to (but excluding) the root and store them in a set
    sys.stderr.write("Precalculating distances\n")
    paths = {}
    for leaf in leaves:
        climbed = set()
        movingnode = leaf
        while not movingnode.is_root():
            climbed.add(movingnode)
            movingnode = movingnode.up
        paths[leaf] = climbed

    # now we want to get all pairs of nodes using itertools combinations. We need AB AC etc but don't need BA CA

    leaf_distances = {x.name: {} for x in leaves}

    sys.stderr.write("Iterating over the leaves\n")
    sys.stderr.write("THere are {} leaves\n".format(len(leaves)))

    # Count the pairs arithmetically: combinations() is a one-shot iterator,
    # so measuring it with len(list(...)) would leave nothing to loop over.
    n_leaves = len(leaves)
    n_pairs = n_leaves * (n_leaves - 1) // 2
    # At least one pair per dot, so the modulo below never divides by zero.
    combidef = max(1, n_pairs // 500)
    sys.stderr.write("There are {} combinations. Each dot is {} combinations\n".format(n_pairs, combidef))

    dots_in_group = 0
    for done, (leaf1, leaf2) in enumerate(combinations(leaves, 2)):
        if done % combidef == 0:
            if dots_in_group == 5:
                sys.stdout.write(" ")
                dots_in_group = 0
            sys.stdout.write(".")
            dots_in_group += 1

        # figure out the unique nodes in the path
        uniquenodes = paths[leaf1] ^ paths[leaf2]
        distance = sum(x.dist for x in uniquenodes)
        leaf_distances[leaf1.name][leaf2.name] = distance
        leaf_distances[leaf2.name][leaf1.name] = distance

    sys.stdout.write("\n")

    allleaves = sorted(leaf_distances.keys())

    with open(outputf, 'w') as out:
        out.write("\t".join([""] + allleaves) + "\n")
        for n in allleaves:
            out.write(n + "\t")
            for m in allleaves:
                if m == n:
                    out.write("0\t")
                else:
                    out.write("{}\t".format(leaf_distances[n][m]))
            out.write("\n")
示例#36
0
# Read the first line of the file named on the command line; it is expected
# to hold the whole tree.
with open(sys.argv[1]) as inh:
    treestring = inh.readline()

# Drop every ';' and the surrounding whitespace. The terminating ';' is added
# back only after the emptiness check, otherwise the check could never fire.
treestr = treestring.replace(';', '').strip()

if len(treestr) == 0:
    print(sys.argv[1] + "\tEmpty tree")
    sys.exit()

t = Tree(treestr + ";")

# Define the basic tree style: show leaf names and branch support values.
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_support = True

# Annotations: leaves whose name contains 'Eukaryota' are drawn as large
# red balls.
for leaf in t.get_leaves():
    if 'Eukaryota' in leaf.name:
        leaf_style = NodeStyle()
        leaf_style["fgcolor"] = "red"
        leaf_style["size"] = 15
        leaf.set_style(leaf_style)
t.show(tree_style=ts)
示例#37
0
文件: iterators.py 项目: abdo3a/ete
import time
from ete3 import Tree

# Benchmark: time the search for the first leaf whose name contains "aw",
# once through iter_leaves() and once through get_leaves().
# NOTE: this script uses Python 2 print statements.

# Creates a random tree with 10,000 leaf nodes
tree = Tree()
tree.populate(10000)
# Variant 1, iter_leaves(): expected to be the faster one (see the sample
# results at the end). Presumably leaves are produced lazily, so the `break`
# ends the traversal early -- TODO confirm against the ete3 documentation.
t1 = time.time()
for leaf in tree.iter_leaves():
    if "aw" in leaf.name:
        # The trailing comma suppresses the newline, so the timing below is
        # printed on the same line as the match.
        print "found a match:", leaf.name,
        break
print "Iterating: ellapsed time:", time.time() - t1
# Variant 2, get_leaves(): expected to be slower. Presumably the complete
# list of leaves is built before the loop starts -- TODO confirm.
t1 = time.time()
for leaf in tree.get_leaves():
    if "aw" in leaf.name:
        print "found a match:", leaf.name,
        break
print "Getting: ellapsed time:", time.time() - t1
# Sample output (the "secs" suffix is not printed by the code above):
# found a match: guoaw Iterating: ellapsed time: 0.00436091423035 secs
# found a match: guoaw Getting: ellapsed time: 0.124316930771 secs
    for fasta in rec_dict[strain]:
        concat_dict[strain]=concat_dict[strain]+str(fasta.seq)

# Write out the concatenated FASTA: one record per key of concat_dict, with
# the key as the header and the concatenated sequence as the body.
with open("all_concat.fasta", "w") as handle:
    for rec in concat_dict:
        handle.write(">"+rec+"\n"+concat_dict[rec]+"\n")
#SeqIO.write(list(SeqIO.parse(open("all_concat.fasta"), "fasta")), "all_concat.phy", "phylip")

# Now write out the tree. Every 'p' in a leaf name becomes 'plate'.
for node in tree_old.traverse():
    if node.is_leaf():
        node.name = node.name.replace('p', 'plate')

# Keep only the selected strains, then save in the same Newick format used
# for the collapsed tree below.
tree_old.prune(strains, preserve_branch_length=True)
tree_old.write(outfile="all_concat.newick", format=1)

# Collapsed tree: same renaming, then keep only the text before the first '_'.
for leaf in collapsed.get_leaves():
    leaf.name = leaf.name.replace('p', 'plate').split("_")[0]
collapsed.write(outfile="concat_107.newick", format=1)

# Run ClonalFrameML on the collapsed tree and the concatenated alignment and
# capture its standard output.
test = subprocess.Popen(
    [
        "/ebio/abt6_projects9/Pseudomonas_diversity/Programs/bin/ClonalFrameML",
        "/ebio/abt6_projects9/Pseudomonas_diversity/data/post_assembly_analysis/pan_genome/data/vis/clonalframe/concat_107.newick",
        "/ebio/abt6_projects9/Pseudomonas_diversity/data/post_assembly_analysis/pan_genome/data/vis/clonalframe/all_concat.fasta",
    ],
    stdout=subprocess.PIPE)
output = test.communicate()[0]