Example #1
import itertools
import os
import pickle

from ete3 import Tree


class NTBDB(object):
    def __init__(self, imgs_dir='/storage/imgs/low-res', metadata_dir='/storage/metadata'):
        self.metadata_dir = metadata_dir
        self.imgs_dir = imgs_dir
        # Pickle files must be opened in binary mode.
        with open(os.path.join(self.metadata_dir, 'metadata.pickle'), 'rb') as md:
            self.metadata = pickle.load(md)
        # EXCLUDE_PICS is a module-level blacklist defined elsewhere.
        for img in EXCLUDE_PICS:
            del self.metadata[img]
        self.by_tag = dict()
        for p in self.metadata.values():
            for tag in p['tags']:
                self.by_tag.setdefault(tag, []).append(p)

        self.tags = Tree(os.path.join(self.metadata_dir, 'tags.nw'), format=8)
        self.tag_by_name = {tag.name: tag for tag in self.tags.get_descendants()}
     
    def by_tag_with_children(self, tag_name):
        tag_node = self.tags.search_nodes(name=tag_name)[0]
        all_tags = [tag_node]
        all_tags.extend(tag_node.get_descendants())
        return list(itertools.chain.from_iterable(self.by_tag.get(tag.name, []) for tag in all_tags))
    
    def tag_score(self, tag):
        return len(self.by_tag.get(tag.name, [])) + sum(map(self.tag_score, tag.children))
    
    def top_tags(self, max_children=5, max_depth=2):
        top_tags = self.tags.copy()
        for n in top_tags.traverse():
            n.children = sorted(n.children, key=self.tag_score, reverse=True)[:max_children]
            if n.get_distance(n.get_tree_root()) > max_depth - 1:
                n.children = []
        return top_tags

    def image_path(self, image_index):
        meta = self.metadata[image_index]
        return os.path.join(self.imgs_dir, meta['folder'], meta['filename'] + '.jpg')
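A minimal usage sketch, assuming the default storage layout above and that EXCLUDE_PICS is defined in the same module; the tag name 'nature' is illustrative only:

db = NTBDB()
photos = db.by_tag_with_children('nature')
print(len(photos), "photos tagged 'nature' or one of its descendant tags")
print(db.top_tags().get_ascii())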
Example #2
    def __init__(self, tree: Tree, cluster_feature='accs'):
        """Constructor for EteMplTree.

        Calls self.set_cluster_size() to add the feature .cluster_relsize to each
        leaf node. Also sets several defaults that are configurable public
        properties: orientation, cluster_viz, scale, etc.

        Arguments:
            tree: an ete3 Tree instance

        Keyword Arguments:
            cluster_feature: what feature to use as indication of cluster size:
            None or 'accs' (default='accs')
        """

        self.tree = tree.copy()
        self.orientation = 'left'
        self.scale = 1.0
        self.dashed_leaves = True
        self.cluster_viz = 'triangle'
        self.cviz_symboldict = {'left': '<', 'right': '>', 'top': '^', 'bottom': 'v'}
        self.cviz_hadict = {'left': 'left', 'right': 'right', 'top': 'center', 'bottom': 'center'}
        self.cviz_vadict = {'left': 'center', 'right': 'center', 'top': 'top', 'bottom': 'bottom'}
        self.tree_lw = 3.0
        self.tree_color = 'black'
        self.initial_leafspacing = 0.1
        self.create_leaf_names = False
        self.draw_leaf_names = False

        self.cluster_feature = cluster_feature
        self.set_cluster_size()

        self.plot_coords = [[np.inf, -np.inf], [np.inf, -np.inf]]
        self.ordered_leaves = None
        self.decorated_plot_coords = None
Example #3
import ete3


def collapse_unifurcations(tree: ete3.Tree) -> ete3.Tree:
    """Collapse unifurcations.
    Collapse all unifurcations in the tree, namely any node with only one child
    should be removed and all children should be connected to the parent node.
    Args:
        tree: tree to be collapsed
    Returns:
        A collapsed tree.
    """

    collapse_fn = lambda x: (len(x.children) == 1)

    collapsed_tree = tree.copy()
    to_collapse = [n for n in collapsed_tree.traverse() if collapse_fn(n)]

    for n in to_collapse:
        n.delete()

    return collapsed_tree
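A quick check of the behaviour, building the unifurcation programmatically (node I has exactly one child, A):

t = ete3.Tree()
inner = t.add_child(name="I")  # unifurcation: a single child only
inner.add_child(name="A")
t.add_child(name="B")
collapsed = collapse_unifurcations(t)
print(collapsed.get_ascii())  # A and B now attach directly to the root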
Example #4
def write_newick_tree_with_uncoded_names(infile,
                                         outfile,
                                         tablefile,
                                         quoted_names=False):
    """Take a text file with a newick tree, and write a new one with names
    converted back to original names (given values in a conversion table), and
    optionally put quotation marks around the names in the output tree.
    """
    # Generate a dictionary for converting names.
    conv_dict = get_conversion_dict_from_table(tablefile)

    # Replace each leaf's coded name with its original name, node by node.
    # (An earlier whole-string replacement approach was dropped; note that
    # quoted_names is not applied by this implementation.)
    t1 = Tree(infile, format=1)
    t1 = Tree(infile, format=1)
    t2 = t1.copy()
    for node in t2.traverse():
        if node.is_leaf():
            for x in conv_dict.keys():
                if node.name.strip('\'').replace(' ',
                                                 '_') == x.strip('\'').replace(
                                                     ' ', '_'):
                    node.name = conv_dict[x]

    # Write uncoded tree to output file.
    t2.write(outfile=outfile, format=1)
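A minimal usage sketch; the file names and the two-column conversion table (code, original name) read by get_conversion_dict_from_table are hypothetical:

write_newick_tree_with_uncoded_names('coded.tre',
                                     'uncoded.tre',
                                     'conversion_table.csv')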
Example #5
def write_clustertree_tonewick(ct: Tree, otfpath: str = 'clustertree.nw'):
    """Redefines each node's name according to its child leaf accession codes and
    writes the result out as a newick tree.

    Arguments:
    ct: ete tree clustered using ete_cluster_bysize (i.e. it carries the 'subtrees' feature)

    Keyword Arguments:
    otfpath: path of the output tree (default='clustertree.nw')

    Returns:
    None; the renamed tree is written to otfpath.
    """
    ct = ct.copy()
    for lnode in ct.get_leaves():
        if 'subtrees' in lnode.features:
            lnode.name = ''
            for st in lnode.subtrees:
                for acc in st.get_leaf_names():
                    lnode.name += f'{acc}|'
        else:
            lnode.name += '|'
    ct.write(outfile=otfpath, features=['name'])
    print(f'clusterified newick tree written to {otfpath}')
Example #6
def expand_eteclustertree(ct: Tree,
                          delete_cluster_names=True,
                          delete_cluster_features=True):
    """expands cluster tree to original topology. 
    
    Collapsed sisters are inferred from a cluster node with dist=0 to parent.
    Returned tree will have some differences in node names from parent?
    Arguments:
    ct: the cluster tree

    Keyword Arguments:
    delete_cluster_names: whether to delete cluster names (default=True)
    delete_cluster_features: whether to delete cluster features accs,cluster_numleaves,subtrees (default=True)

    Returns:
    ete tree expanded so that leaves represent single enzymes
    """
    ct = ct.copy()
    for lnode in ct.get_leaves():
        if 'subtrees' in lnode.features:
            #special handling for collapsed sisters
            if lnode.dist == 0:
                print(f'sister node detected for {lnode.name}')
                for st in lnode.subtrees:
                    lnode.up.add_child(st)
                lnode.detach()
            else:
                for st in lnode.subtrees:
                    lnode.add_child(st)
                if delete_cluster_names:
                    lnode.name = ''
                if delete_cluster_features:
                    lnode.del_feature('subtrees')
                    lnode.del_feature('cluster_numleaves')
                    lnode.del_feature('accs')
        #TODO: verify that this also handles collapsed sisters correctly
    return ct
Example #7
def partitionTreeSet(N):
    if N == 1:
        x = Tree(";",format=100)
        x.add_features(value=N, name=str(N))
        
        xFace = styleFace(x.name)
        x.add_face(xFace,column=0,position="branch-top")

        return (x,)
    else:
        y = ()
        base = Tree(";",format=100)
        base.dist = 1

        for k in range(lam(N)):
            left    = partitionTreeSet(N-(k+1))
            right   = partitionTreeSet(k+1)

            for l in left:
                for r in right:
                    l.dist = 1
                    r.dist = 1

                    z = base.copy()
                    z.dist = 1
                    
                    z.add_features(value=N, name=str(N))
                    z.add_child(l.copy())
                    z.add_child(r.copy())

                    zFace = styleFace(z.name)
                    z.add_face(zFace,column=0,position="branch-top")

                    y = y + (z,)
        
        return y
with open("representative_tree/bacteria_random_fam_reps.txt", "w") as handle:
    handle.writelines([k + "\t" + v + "\n" for k, v in fam2rnd.items()])

rnds = set(fam2rnd.values())
nodes = list(tree.iter_leaves())

# ete3's prune() works in place and returns None, so prune a copy instead.
sub_tree = tree.copy()
sub_tree.prune(
    [n for n in sub_tree.iter_leaves() if "mOTU" in n.name or n.name in rnds],
    preserve_branch_length=True)

sub_tree.write(outfile="representative_tree/bacteria_pruned.tree")
motu_md = pandas.read_csv("metadata/mOTU_stats.csv", index_col=0)

gtdbid2fam.update(motu_md.consensus_tax.to_dict())

ptree = tree.copy()
for l in ptree.iter_leaves():
    l.name = gtdbid2fam[l.name]

ptree.write(outfile="representative_tree/bacteria_pretty.tree")

motu_md = pandas.read_csv("metadata/mOTU_stats.csv", index_col=0)
tt = set(list(tree.iter_leaf_names())).difference(set(motu_md.index))
gtdb_md = pandas.read_csv("/home/moritz/data/gtdb/bac120_metadata_r89.tsv",
                          index_col=0,
                          sep="\t")
mag_md = pandas.read_csv("metadata/master_table.csv", index_col=0)

tt2 = mag_md.loc[motu_md.representative_MAGs]
tt2.index = tt2.mOTU
Example #9
    output_file_path = cmdln[2]

    # Define query ID present in filename.
    query_id = os.path.basename(tf)[2:].rsplit('_', 1)[0]

    # Initiate a tree style.
    ts = TreeStyle()
    ts.show_leaf_name = False

    # Parse tree.
    #t1 = Tree(tf, format=3)
    t1 = Tree(tf, format=0)
    #print(t1)

    # Make a copy of the TreeNode object.
    t2 = t1.copy()

    # Root on midpoint.
    t2.set_outgroup(t2.get_midpoint_outgroup())

    # Add node support values as branch labels.
    add_support_to_nodes_as_faces(t2)

    # Customize the node styles generally.
    customize_node_styles_for_visualization(t2)

    #####################################################

    # Write tree to pdf.

    if platform == "linux" or platform == "linux2":
Example #10
def plot_trees_from_traces(input_trace, output_plot, simu_dict, simu_tree):
    axis_trees, axis_filenames = dict(), dict()

    for filepath in input_trace:
        for tree_path in sorted(glob("{0}.*.nhx".format(filepath))):
            feature = remove_units(
                tree_path.replace(filepath + ".", "").replace(".nhx", ""))
            filename = os.path.basename(filepath)
            with open(tree_path, 'r') as tree_file:
                tree_str = remove_units(tree_file.readline())
                if tree_str.count("-nan") > 0:
                    continue
                tree = Tree(tree_str, format=1)

            if simu_tree:
                for n_inf, n_simu in zip(tree.traverse(),
                                         simu_tree.traverse()):
                    assert (sorted(n_simu.get_leaf_names()) == sorted(
                        n_inf.get_leaf_names()))

            if feature not in axis_trees:
                axis_trees[feature] = []
            if feature not in axis_filenames:
                axis_filenames[feature] = []

            axis_filenames[feature].append(filename)
            axis_trees[feature].append(tree)
            if len([n for n in tree.traverse()
                    if feature in n.features]) == len(list(tree.traverse())):
                plot_tree(
                    tree.copy(), feature,
                    "{0}/{1}.{2}.pdf".format(output_plot, filename, feature))

    for feature in axis_trees:
        axis_dict, err_dict, std_dict = dict(), dict(), dict()
        if feature in simu_dict:
            axis_dict["Simulation"] = simu_dict[feature]
        for filename, tree in zip(axis_filenames[feature],
                                  axis_trees[feature]):
            values = np.array([
                to_float(getattr(n, feature)) for n in tree.traverse()
                if feature in n.features
            ])
            min_values = np.array([
                to_float(getattr(n, feature + "_min"))
                for n in tree.traverse() if feature + "_min" in n.features
            ])
            max_values = np.array([
                to_float(getattr(n, feature + "_max"))
                for n in tree.traverse() if feature + "_max" in n.features
            ])
            axis_dict[filename] = values
            err_dict[filename] = np.vstack(
                (np.abs(values - min_values), np.abs(max_values - values)))
            std_dict[filename] = np.array([
                to_float(getattr(n, feature + "_std"))
                for n in tree.traverse() if feature + "_std" in n.features
            ])

        if len(axis_dict) > 1:
            path = '{0}/correlation.{1}.pdf'.format(output_plot, feature)
            plot_correlation(path,
                             axis_dict,
                             err_dict,
                             std_dict=std_dict,
                             global_min_max=False)
Example #11
children, tree_string = cvtNewick(parent)

orig = Tree(tree_string, format=8)

out = open(join(args.out_dir, args.out), "w")
outbase = splitext(args.out)[0]

if args.format == "plink":
    if len(args.sample) != 1:
        sys.exit(
            "When using PLINK fomat there can only be some sample -- the base of a plink data set"
        )
    (bim, fam, bed) = read_plink(args.sample[0], verbose=False)
    N = len(fam) if args.num == 0 else args.num
    for p in range(N):
        tr = orig.copy(method='deepcopy')
        if fam.fid[p] == fam.iid[p]:
            the_id = fam.fid[p]
        else:
            the_id = fam.fid[p] + "_" + fam.iid[p]
        print(p, the_id)
        data = getInd(p, bed, bim)
        processSample(out, the_id, data)
else:
    N = len(args.sample) if args.num == 0 else args.num
    for sample in args.sample[:N]:
        tr = orig.copy(method='deepcopy')
        processSample(out, sample, open(sample))
out.close()

group_names = [g for g in overall.keys() if overall[g][count] > 0]
Example #12
def keep_subsequent_wgd_species(stree, ensembl_tree, missing_leaves_keep,
                                sp_current_wgd, authorized_sp):
    """
        When re-grafting a subtree corrected for species descending from 'WGD1' only, keep
        positions of species with subsequent WGDs consistent in the tree.
        To do so, find the closest 'WGD1-only' species gene in ensembl tree and keep subsequently
        duplicated species genes at the same position (relative to it).

        Modifies `stree` in-place.

        Args:
            stree (ete3.Tree): Tree object for the synteny corrected tree of WGD1
            ensembl_tree (ete3.Tree): Tree object for the full original gene tree
            missing_leaves_keep (list of ete3.TreeNode): Genes of subsequently duplicated species
            sp_current_wgd (list of str): List of WGD1 duplicated species
            authorized_sp (dict): Dict used to keep the tree consistent with the species tree. For
            a 'WGD1' species, a list of WGD1 species that are closer to it than are 4R species.
    """

    #genes in the WGD1 corrected tree
    sleaves = [i.name for i in stree.get_leaves()]
    stree.prune([i for i in stree.get_leaves()])

    #genes in subsequent WGDs
    missing = [i.name for i in missing_leaves_keep]

    #original tree
    enstree = ensembl_tree.copy()

    #find all clades of subsequent WGD genes to replace them at a correct position together
    for leaf in enstree.get_leaves():
        if leaf.name in missing:
            leaf.missing = "Y"
    clades = enstree.get_monophyletic(values=["Y"], target_attr="missing")

    closest_gene = {}

    #for each clade to replace
    for clade in clades:

        #find closest neighbour in the original tree which is in WGD1 duplicated species
        outgr_gene = closest_gene_in_tree(enstree,
                                          clade,
                                          sp_current_wgd,
                                          attr='S')

        if outgr_gene.name in sleaves:
            closest_gene[clade] = outgr_gene

    #if the closest WGD1 gene is in the WGD1 stree
    #we'll keep 4R genes in the family, at a similar position
    for outgr_gene in set(closest_gene.values()):
        clades = [i for i in closest_gene if closest_gene[i] == outgr_gene]
        clades = list(chain.from_iterable(clades))

        subtree_4r = enstree.copy()
        subtree_4r.prune([i.name for i in clades] + [outgr_gene.name])
        outgroup_subtree = stree.copy()

        outgr = stree.get_leaves_by_name(name=outgr_gene.name)[0]

        sister_outgroup_genes = [outgr_gene.name]
        find_sister_of_outgroup(outgr, authorized_sp[outgr_gene.S],
                                sister_outgroup_genes)

        #We keep all sister outgroup genes together in the corrected tree
        if len(sister_outgroup_genes) > 1:

            #stree is modified in-place
            subtree_4r = keep_sis_genes_together(subtree_4r,
                                                 outgr_gene.name,
                                                 sister_outgroup_genes,
                                                 outgroup_subtree,
                                                 node_max='')
            lca = stree.get_common_ancestor(sister_outgroup_genes)
            # cop = stree.copy()
            stree.prune([lca] + [i for i in stree.get_leaves()\
                                 if i.name not in sister_outgroup_genes])
        else:
            lca = outgr

        #in case we do not paste the subtree at a terminal node (cleared above)
        if len(lca.children) == 2:
            lca_cop = lca.copy()
            tmp = Tree()
            lca_cop.prune([i for i in lca_cop.get_leaves()])
            tmp.add_child(lca_cop.copy())
            tmp.add_child(subtree_4r.copy())
            lca.up.add_child(name='here')
            lca.detach()
            lca_new = stree.search_nodes(name="here")[0]
            lca_new.add_child(tmp.copy())
            lca_new.name = ''

        else:
            lca.name = ''
            lca.add_child(subtree_4r)

    #remove potential single-child internal node artefacts
    stree.prune([i for i in stree.get_leaves()])

    #clean up attributes
    for leaf in stree.get_leaves():
        if hasattr(leaf, 'missing'):
            delattr(leaf, 'missing')
Example #13
def create_experiment(prefix, name, sample, replicate, tree_name, cds_list, lht, calibs, intersection, screen, sbatch,
                      nbr_cpu, random_state):
    root_path = os.getcwd() + "/" + name
    tree = Tree("{0}/{1}".format(root_path, tree_name), format=1)
    print("{0} extant species found for the rooted tree/".format(len(tree)))

    if os.path.isfile('{0}/{1}'.format(root_path, cds_list)):
        print("Found list of CDS : " + cds_list)
        genes = pd.read_csv("{0}/{1}".format(root_path, cds_list), header=None)
    else:
        genes = pd.DataFrame(
            [i.replace(".ali", "") for i in os.listdir("{0}/singlegene_alignments".format(root_path)) if ".ali" in i])

    print("{0} CDS provided.".format(len(genes)))

    if replicate == -1:
        replicate = len(genes)

    for rep in range(replicate):
        random_state += 654
        experiment = prefix + "_{0}_{1}_{2}_Sample{3}_Replicates{4}_Id{5}".format(name, tree_name, cds_list, sample,
                                                                                  replicate, rep)
        exp_path = os.getcwd() + '/Experiments/' + experiment
        os.makedirs(exp_path, exist_ok=True)
        os.system('cp config.yaml {0}'.format(exp_path))
        if os.path.exists(exp_path + "/Snakefile"):
            os.remove(exp_path + "/Snakefile")
        os.symlink(os.getcwd() + "/Snakefile", exp_path + "/Snakefile")

        if os.path.isfile('{0}/{1}'.format(root_path, lht)):
            print("Life-History-Traits file provided (" + lht + ")")
            os.system('cp {0}/{1} {2}/life_history_traits.tsv'.format(root_path, lht, exp_path))

        if os.path.isfile('{0}/{1}'.format(root_path, calibs)):
            print("Fossil Calibrations file provided (" + calibs + ")")
            os.system('cp {0}/{1} {2}/calibs.tsv'.format(root_path, calibs, exp_path))

        if os.path.isfile('{0}/known_population_size.tsv'.format(root_path)):
            print("Known population size file provided (known_population_size.tsv)")
            os.system('cp {0}/known_population_size.tsv {1}'.format(root_path, exp_path))

        alignments = []
        taxa = set(tree.get_leaf_names()) if intersection else set()

        if sample == -1:
            vals = [genes.loc[rep, :]]
        else:
            vals = genes.sample(sample, random_state=random_state).values

        pd.DataFrame(vals).to_csv(exp_path + "/CDS.list", index=False, header=None)
        for selected in vals:
            alignments.append(import_ali("{0}/singlegene_alignments/{1}.ali".format(root_path, selected[0])))
            taxa = taxa.intersection(alignments[-1].keys()) if intersection else taxa.union(alignments[-1].keys())

        taxa = taxa.intersection(set(tree.get_leaf_names()))

        merge_alignment = {k: "" for k in taxa}
        for alignment in alignments:
            seq_len_set = set([len(s) for s in alignment.values()])
            assert (len(seq_len_set) == 1)
            size = seq_len_set.pop()
            for taxon in taxa:
                if taxon in alignment:
                    merge_alignment[taxon] += alignment[taxon]
                else:
                    merge_alignment[taxon] += "-" * size

        export_ali(exp_path + "/CDS.ali", merge_alignment)

        trimmed_tree = tree.copy()
        trimmed_tree.prune(taxa, preserve_branch_length=True)
        print("{0} taxa for replicate {1}".format(len(taxa), rep + 1))
        trimmed_tree.write(outfile="{0}/rootedtree.nhx".format(exp_path), format=1)

        assert (set(merge_alignment.keys()) == set(
            Tree("{0}/rootedtree.nhx".format(exp_path), format=1).get_leaf_names()))

        run_file = exp_path + "/snakeslurm.sh"
        with open(run_file, 'w') as w:
            w.write("#!/usr/bin/env bash\n")
            run_str = 'snakemake '
            if sbatch:
                run_str += '-j 99 --cluster "sbatch -J {0} -p long -N 1 ' \
                           '-o {1}/slurm.%x.%j.out -e {1}/slurm.%x.%j.err '.format(experiment, exp_path)
                run_str += '--cpus-per-task={params.threads} --mem={params.mem} -t {params.time}"\n'
            else:
                run_str += "-j {0} --printshellcmds".format(nbr_cpu)
            w.write(run_str)
        os.system("chmod 755 " + run_file)
        cmd = 'cd ' + exp_path + ' && ./snakeslurm.sh'
        screen_cmd = 'screen -dmS ' + "{0}_{1}_{2}".format(prefix, name, rep) + ' bash -c "' + cmd + '"'
        with open(exp_path + "/screen.sh", 'w') as w:
            w.write("#!/usr/bin/env bash\n")
            w.write(screen_cmd)
        if screen:
            print(screen_cmd)
            run(screen_cmd, shell=True)
        else:
            print(cmd)
            run(cmd, shell=True)
Example #14
gl = pd.read_csv(inputGLF, sep="\t")
t = Tree(spTreeF)
t.sort_descendants(attr='O')
ts = TreeStyle()
ts.complete_branch_lines_when_necessary = False
# calculate branch colors
gainL = []  # list with all rates of gain
lossL = []  # list with all rates of loss
gm = gl.rgain.min()
gM = gl.rgain.max()
lm = gl.rloss.min()
lM = gl.rloss.max()
#bcrg = scaleCol(gl.pgain.tolist())  # Branch Colors for Rates of Gain
#bcrl = scaleCol(gl.ploss.tolist())  # Branch Colors for Rates of Loss
# make a "gain" and a "loss" copy of the tree
tg = t.copy()
tl = t.copy()
gcm = cm.ScalarMappable(norm=colors.Normalize(vmin=gm, vmax=gM), cmap="coolwarm")
lcm = cm.ScalarMappable(norm=colors.Normalize(vmin=lm, vmax=lM), cmap="coolwarm")
for node in tg.iter_descendants():  # do not include root
    if node.up.is_root():
        rgain = gl.loc[(gl.fromNode == 0) & (gl.toNode == int(node.ND)), 'rgain']
    else:
        rgain = gl.loc[(gl.fromNode == int(node.up.ND)) & (gl.toNode == int(node.ND)), 'rgain']
    if rgain.empty:
        continue
    rgain = rgain.item()
    style = NodeStyle()
    gainString = "%.2f" % (rgain)
    #pick colors
    ci = colors.rgb2hex(gcm.to_rgba(rgain)[:3])
Example #15
import numpy as np
from ete3 import Tree


def ete_cluster_bysize(t: Tree, cluster_maxsize: int = 50, cluster_minsize: int = 5,
                       collapse_sisters: bool = True, cleanup_merge=True, outgroup: str = None):
    """Takes an ete3 tree, groups sets of leaf nodes and truncates the tree at their
    common ancestor. The collapse_sisters option can be useful purely for
    visualization if the tree has many polytomies.

    Arguments:
    t: ete3 tree object

    Keyword Arguments:
    cluster_maxsize: maximum size for each cluster (default=50)
    cluster_minsize: minimum size for each cluster (default=5)
    collapse_sisters: whether to merge leaves or groups smaller than the threshold
    into a single branch (default=True)
    cleanup_merge: whether to perform a final step that clusters all remaining
    subtrees even if their size is below cluster_minsize (default=True)
    outgroup: name of the outgroup node (default=None)

    Returns:
    ete tree with added features 'subtrees', 'cluster_numleaves' and 'accs' on
    collapsed nodes, corresponding to a list of child nodes, the number of leaves,
    and all (expanded) leaf names in the cluster.
    Collapsed node names: 'm_<numsubtrees>_<numleaves>', 's_<numsubtrees>_<numleaves>',
    'c_<numsubtrees>_<numleaves>'
    """
    t = t.copy()
    if outgroup is not None:
        t.set_outgroup(t & outgroup)

    #stack-based traversal; stop descending once a subtree has fewer than cluster_maxsize leaves
    orphans = []
    tovisit_ = [t]
    print(f'--starting number of leaves: {len(t.get_leaf_names())}--')
    cluster_merges = []
    while (len(tovisit_) > 0):
        node = tovisit_.pop()
        lnames = node.get_leaf_names()
        numleaves = len(lnames)
        if numleaves < cluster_maxsize:
            groupaccs_ = node.get_leaf_names()
            if len(groupaccs_) > cluster_minsize:
                node.add_feature('cluster_numleaves',
                                 len(node.get_leaf_names()))
                node.add_feature('subtrees',
                                 [nc.detach()
                                  for nc in node.get_children()])  #
                node.name = f'm_{len(node.subtrees)}_{node.cluster_numleaves}'
                cluster_merges.append(len(groupaccs_))
            else:
                orphans.append(node)
        else:
            tovisit_.extend(node.children)
    print(f'cluster collapse sizes: {cluster_merges}')

    #merge sister orphans
    sister_merges = []
    if collapse_sisters:
        while (len(orphans) > 0):
            cur_orphan = orphans.pop()
            pnode = cur_orphan.up
            sisters = cur_orphan.get_sisters()
            sis_orphs = []
            for orphos in range(len(orphans) - 1, -1, -1):
                if orphans[orphos] in sisters:
                    sis_orphs.append(orphans.pop(orphos))
            if len(sis_orphs) > 1:
                size_of_merged = len(cur_orphan.get_leaf_names()) + np.sum(
                    [len(x.get_leaf_names()) for x in sis_orphs])
                if size_of_merged > cluster_minsize:
                    newnode = pnode.add_child(dist=0)
                    newnode.add_feature('cluster_numleaves', size_of_merged)
                    newnode.add_feature('subtrees', [cur_orphan.detach()])
                    for so in sis_orphs:
                        newnode.subtrees.append(so.detach())
                    newnode.name = f's_{len(newnode.subtrees)}_{newnode.cluster_numleaves}'
                    sister_merges.append(size_of_merged)
    print(f'sisters collapse sizes: {sister_merges}')
    #now cleanup_merge
    if cleanup_merge:
        visited = set()
        while len(visited) < len(t.get_leaves()):
            for n in set(t.get_leaves()).difference(visited):
                visited.add(n)
                nosubtrees = 'subtrees' not in n.features
                addn = None
                #if no subtrees, see how far we can climb
                while nosubtrees:
                    #if we've climbed once, this is a cluster
                    if len(n.get_descendants()) > 0:
                        addn = n
                    n = n.up
                    #continue only if no descendants have subtree
                    subtree_status = [
                        'subtrees' in x.features for x in n.traverse()
                    ]
                    nosubtrees = True not in subtree_status
                #clusterify the node, then add it and all sub-leaves to the visited set
                if addn is not None:
                    addn.add_feature('cluster_numleaves',
                                     len(addn.get_leaf_names()))
                    addn.add_feature(
                        'subtrees',
                        [nc.detach() for nc in addn.get_children()])  #
                    addn.name = f'c_{len(addn.subtrees)}_{addn.cluster_numleaves}'
                    visited = visited.union([x for x in addn.get_leaves()])
                    break  #break to reset leaf candidates with updated visited set as filter

    #now at end add accs feature (a list of acc under each)
    for lnode in t.get_leaves():
        lnode.add_feature('accs', [])
        if 'subtrees' in lnode.features:
            for st in lnode.subtrees:
                lnode.accs.extend(
                    st.get_leaf_names()
                )  #lnode.accs=[*y for y in [x.get_leaf_names() for x in lnode.subtrees]]
        else:
            lnode.accs.append(lnode.name)

    #final consistency check and readout
    num_leaves = 0
    for lnode in t.get_leaves():
        if 'subtrees' in lnode.features:
            num_leaves += np.sum(
                [len(x.get_leaf_names()) for x in lnode.subtrees])
            #num_leaves+=lnode.cluster_numleaves#np.sum([len(x.get_leaf_names()) for x in lnode.subtrees])
        else:
            num_leaves += 1
    print(f'--total leaves at end: {num_leaves}--')
    return t
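A minimal end-to-end sketch tying this together with write_clustertree_tonewick and expand_eteclustertree (Examples #5 and #6), assuming all three live in the same module; the 200-leaf random tree is purely illustrative:

from ete3 import Tree

t = Tree()
t.populate(200)  # random 200-leaf topology with auto-generated names
ct = ete_cluster_bysize(t, cluster_maxsize=50, cluster_minsize=5)
write_clustertree_tonewick(ct, otfpath='clustertree.nw')
expanded = expand_eteclustertree(ct)
print(len(expanded), 'leaves after re-expansion')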
Example #16
    os.waitpid(p.pid, 0)
    r1 = list(map(float, p1.split()))
    r2 = list(map(float, p2.split()))
    nTriplets = r1[1]
    nUnresolved = r2[-2]
    nResolved = nTriplets - nUnresolved
    if nResolved == 0:
        return np.nan
    nAgree = r1[4]
    return 1. - nAgree/nResolved


results = []
for fam in families:
    print(fam)
    fmGlot = glot.copy()
    tree = Tree('madRooted/' + fam + '.madRooted.tre')
    fmGlot.prune(tree.get_leaf_names())
    fmResults = [len(tree)]
    fmResults.append(gtd(Tree('madRooted/' + fam +
                              '.madRooted.tre'), fmGlot))
    fmResults.append(gtd(Tree('midpointRooted/' + fam +
                              '.midpointRooted.tre'), fmGlot))
    fmResults.append(gtd(Tree('outgroupRooted/' + fam +
                              '.outgroupRooted.tre'), fmGlot))
    fmResults.append(gtd(Tree('yuleRooted/' + fam +
                              '.yuleRooted.tre'), fmGlot))
    results.append(fmResults)

results = pd.DataFrame(results,
                       index=families,
Example #17
count = 0
for i in range(1, 5):
    S[i] = dict()
    for j in range(1, 5):
        S[i][j] = dict()
        for k in range(5, 9):
            total = 0  # renamed from 'sum', which shadowed the built-in
            for s in P[k].keys():
                t = Tree(s, format=9)
                num = 0
                for leaf in t:
                    leaf.name = str(num)
                    num += 1
                n_matches = 0  # renamed from 'count', which clobbered the round counter
                for Q in combinations(range(k), 4):
                    Q = list(map(str, Q))
                    TQ = t.copy()
                    TQ.prune(Q)
                    for Qp in combinations(range(k), 4):
                        Qp = list(map(str, Qp))
                        TQp = t.copy()
                        TQp.prune(Qp)
                        if len(set(Q).intersection(set(Qp))) == 8 - k and shape(TQ) == T4[i] and shape(TQp) == T4[j]:
                            n_matches += 1
                total += simplify(q[i]*q[j]*n_matches*P[k][s])
            S[i][j][k] = simplify(total)
            count += 1
            print("round", count, "of", 64)
            print(i, j, k, S[i][j][k], file=out)

Example #18
class TreeHolder:
    def __init__(self,
                 tree,
                 logger,
                 scale=None,
                 labels_dict=None,
                 node_colors=defaultdict(lambda: 'black')):
        self.tree = Tree(tree)
        self.scale = scale

        for node in self.tree.traverse():
            if len(node.children) == 3:
                logger.info("Trying to root tree by first child of root")
                logger.info(f'Children of root: {node.children}')
                self.tree.set_outgroup(node.children[0])
            break

        for node in self.tree.traverse():
            # Hide node circles
            node.img_style['size'] = 0

            if node.is_leaf():
                try:
                    name_face = TextFace(
                        labels_dict[node.name] if labels_dict else node.name,
                        fgcolor=node_colors[node.name])
                except KeyError:
                    msg = f'There is no label for leaf {node.name} in labels file'
                    logger.error(msg)
                    raise KeyError(msg)
                node.add_face(name_face, column=0)

    def draw_neighbours(self,
                        neighbours,
                        block,
                        colors=('Crimson', 'Teal', 'DarkGreen', 'Purple',
                                'DarkKhaki', 'MediumVioletRed', 'DarkOrange',
                                'Navy', 'RosyBrown', 'DarkGoldenrod', 'Sienna',
                                'Indigo', 'DarkRed', 'Olive', 'SlateGray',
                                'SeaGreen', 'IndianRed', 'BurlyWood')):
        possible_ns = sorted(
            list(
                set(n[:-1] for nss in neighbours.values() for ns in nss
                    for n in ns[:2])))

        ns_colors = {
            possible_ns[i]: colors[i % len(colors)]
            for i in range(len(possible_ns))
        }
        ns_colors[str(block)] = 'grey'

        all_genomes = [node.name for node in self.tree.traverse()]
        aligned_neighbours = align_neighbours(neighbours, all_genomes)
        offsets = get_offsets(aligned_neighbours)

        for node in self.tree.traverse():
            if not node.is_leaf(): continue
            face = generate_neighbour_face(aligned_neighbours[node.name],
                                           ns_colors, block, offsets)
            node.add_face(face, 1, "aligned")

    def draw(self,
             file,
             colors,
             color_internal_nodes=True,
             legend_labels=(),
             show_branch_support=True,
             show_scale=True,
             legend_scale=1,
             mode="c",
             neighbours=None,
             neighbours_block=None):
        max_color = len(colors)

        used_colors = set()
        for node in self.tree.traverse():
            if not (color_internal_nodes or node.is_leaf()): continue
            color = colors[min(node.color, max_color - 1)]
            node.img_style['bgcolor'] = color
            used_colors.add(color)

        ts = TreeStyle()
        ts.mode = mode
        ts.scale = self.scale
        # Disable the default tip names config
        ts.show_leaf_name = False
        ts.show_branch_support = show_branch_support

        # ts.branch_vertical_margin = 20
        ts.show_scale = show_scale
        cur_max_color = max(v.color for v in self.tree.traverse())
        current_colors = colors[0:cur_max_color + 1]

        for i, (label, color_) in enumerate(zip(legend_labels,
                                                current_colors)):
            if color_ not in used_colors: continue
            rf = RectFace(20 * legend_scale, 16 * legend_scale, color_, color_)
            rf.inner_border.width = 1
            rf.margin_right = 14
            rf.margin_left = 14

            tf = TextFace(label, fsize=26 * legend_scale)
            tf.margin_right = 14

            ts.legend.add_face(rf, column=0)
            ts.legend.add_face(tf, column=1)

        if neighbours:
            old_tree = self.tree.copy()
            self.draw_neighbours(neighbours, neighbours_block)

        self.tree.render(file, w=1000, tree_style=ts)

        if neighbours:
            self.tree = old_tree

    def get_all_leafs(self):
        return {node.name for node in self.tree.get_leaves()}

    def count_innovations_fitch(self, leaf_colors, count_second_color=True):
        def assign_colorset_feature(v):
            if v.is_leaf():
                v.add_features(colorset={leaf_colors[v.name]},
                               color=leaf_colors[v.name])
            else:
                try:
                    child1, child2 = v.children
                except ValueError:
                    print(v.children)
                    raise ValueError('Tree must be binary')
                cs1 = assign_colorset_feature(child1)
                cs2 = assign_colorset_feature(child2)
                v.add_features(
                    colorset=(cs1 & cs2) if len(cs1 & cs2) > 0 else cs1 | cs2)

            return v.colorset

        def choose_color(colorset):
            return sorted(colorset,
                          key=lambda c: color_counter[c],
                          reverse=True)[0]

        def down_to_leaves(v, color):
            if v.is_leaf(): return
            v.add_features(color=color if color in
                           v.colorset else choose_color(v.colorset))
            for child in v.children:
                down_to_leaves(child, v.color)

        def count_innovations(v, innovations):
            for child in v.children:
                if v.color != child.color and not (not count_second_color and
                                                   (v.color == 2) or
                                                   (child.color == 2)):
                    innovations[child.color].append(child)
                count_innovations(child, innovations)

        color_counter = Counter(leaf_colors.values())

        # get colorsets for internal nodes
        root = self.tree.get_tree_root()
        assign_colorset_feature(root)

        # get color for internal nodes
        root_color = choose_color(root.colorset)
        down_to_leaves(root, root_color)

        # get inconsistent colors
        self.innovations = defaultdict(list)
        count_innovations(root, self.innovations)

    def count_parallel_rearrangements(self, skip_grey):
        score, count, count_all = 0, 0, 0
        for color, nodes in self.innovations.items():
            if len(nodes) <= 1 or (skip_grey and color == 1): continue
            count += 1
            count_all += len(nodes)
            for n1, n2 in combinations(nodes, 2):
                score += n1.get_distance(n2)
        return score, count, count_all

    def count_parallel_breakpoints(self):
        count = sum(map(len, self.innovations.values()))
        score = sum(
            n1.get_distance(n2)
            for n1, n2 in combinations((n for ns in self.innovations.values()
                                        for n in ns), 2))
        return score, count

    def draw_coloring(self, file):
        # Note: relies on self.colors being assigned beforehand by the caller.
        for node in self.tree.traverse():
            node.img_style['bgcolor'] = self.colors[node.color]
        ts = TreeStyle()
        ts.show_leaf_name = False
        self.tree.render(file, w=1000, tree_style=ts)

    def prune(self, ls):
        self.tree.prune(list(ls))
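A minimal usage sketch, assuming the module-level imports this class relies on (ete3's Tree/TextFace, collections); the newick string and the two-colour leaf classification are illustrative only:

import logging

logging.basicConfig()
demo_logger = logging.getLogger('treeholder-demo')

th = TreeHolder('((A:1,B:1):1,(C:1,D:1):1);', demo_logger)
th.count_innovations_fitch({'A': 0, 'B': 0, 'C': 1, 'D': 1})
score, count, count_all = th.count_parallel_rearrangements(skip_grey=False)
print(score, count, count_all)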
Example #19
			spID += 1
			node.sp = spID
			for leaf in node:
				try:
					leaf.sp = spID
				except AttributeError:
					leaf.add_features(sp=1)

if (ms_input == True) and (ms_islands[-1] > largest_id):
	sys.exit("2. The MS island structure you provided through -I does not fit the number of tips in the demography. The option might have been mispecified. Check it out!")

sys.stdout.write('S') # SpreadSpeciation

# if requested, print speciational tree
if plot_trees:
	tmut = t.copy()
	for leaf in tmut:
		leaf.name = "["+str(leaf.sp)+"]"+leaf.name
	tmut.render(ophylo+"_2MUT.png", w=183, units="mm")


#======================================================#
# CONVERT demography to phylogeny using a traversing method

# __/!\__ to be modified for non dichotomic trees (eg Lambda coalescent)
traversedNodes = set()

for node in t.traverse("preorder"):

	if node not in traversedNodes:
Example #20
def generax2mcmctree(xml_file,
                     stree,
                     gene,
                     dating_o,
                     calibration_file,
                     genome2cog25={}):
    """
    generate files for mcmctree, including 
    1. list of used genomes
    2. used genomes (itol annotation)
    3. constructed species tree topology for dating
    4. target genomes in the complete phylogeny of phylum (itol annotation)
    5. calibration file and tree file with calibrations information
    """
    # xml_file = join(r_odir,f'reconciliations/{gene}_reconciliated.xml')
    # stree = f"./trees/iqtree/{phylum_name}.reroot.newick"

    phylum_name = xml_file.split('/')[-4]
    st = Tree(stree, format=3)
    tmp_name = phylum_name + '_' + gene
    _p2node, _p2node_transfer_receptor = get_p2node(xml_file,
                                                    stree,
                                                    key=tmp_name)
    target_nodes = list(_p2node.values())[0] + list(
        _p2node_transfer_receptor.values())[0]

    must_in_genomes = open(
        "/mnt/home-backup/thliao/cyano_basal/rawdata/assembly_ids.list").read(
        ).strip('\n').split('\n')
    # new calibrations are /mnt/home-backup/thliao/cyano/ref_genomes_list.txt
    cluster2genomes = get_cluster(
        stree.replace('.reroot.newick', '.clusterd.list'))
    g2cluster = {v: c for c, d in cluster2genomes.items() for v in d}
    retained_ids = sampling(st,
                            target_nodes,
                            must_in=must_in_genomes,
                            node2cluster=g2cluster,
                            genome2cog25=genome2cog25)

    text = to_binary_shape({g: ['keep']
                            for g in retained_ids},
                           {"keep": {
                               "color": "#88b719"
                           }})
    text = to_color_range({g: 'keep'
                           for g in retained_ids}, {"keep": "#88b719"})
    with open(join(dating_o, f'id_list/{phylum_name}_{gene}.txt'), 'w') as f1:
        f1.write(text)
    with open(join(dating_o, f'id_list/{phylum_name}_{gene}.list'), 'w') as f1:
        f1.write('\n'.join(retained_ids))
    print(phylum_name, len(st.get_leaf_names()), len(retained_ids))
    st = st.copy()  # the copy's result was previously discarded; prune() works in place
    st.prune(retained_ids)
    with open(join(dating_o, f'species_trees/{phylum_name}_{gene}.newick'),
              'w') as f1:
        f1.write(st.write(format=9))

    # draw target nodes
    LCA_nodes = []
    for name in target_nodes:
        n = [n for n in st.traverse() if n.name == name][0]
        l1 = n.children[0].get_leaf_names()[0]
        l2 = n.children[1].get_leaf_names()[0]
        LCA_nodes.append(f"{l1}|{l2}")

    text = pie_chart({n: {
        'speciation': 1
    }
                      for n in LCA_nodes}, {"speciation": "#ff0000"},
                     dataset_label='GeneRax results')
    with open(join(dating_o, f'target_nodes/{phylum_name}_{gene}.txt'),
              'w') as f1:
        f1.write(text)

    # new set file set14
    # set14_f = './dating/calibration_sets/scheme1/cal_set14.txt'

    c = 'GCA_000011385.1'
    n = [_ for _ in st.children if c not in _.get_leaf_names()][0]
    final_text = open(calibration_file).read().replace('GCA_002239005.1',
                                                       n.get_leaf_names()[0])
    with open(join(dating_o, f'calibrations/{phylum_name}_{gene}_set14.txt'),
              'w') as f1:
        f1.write(final_text)
Example #21
import random

import matplotlib.pyplot as plt
import xlrd
from ete3 import Tree


class ASRTree:
    #Attributes
    __tree = None  #Actual tree
    __sim_tree = None  #Simulation tree
    __transition_prob_anad = None
    __transition_prob_aqp3 = None
    __sim_effect_sizes = []  #List containing simulation effect sizes
    __p_value_count = 0  #Number of times an effect size is simulated => actual
    __effect_size = 0  #Actual effect size of model
    __num_of_branches = __num_anad = __num_aqp3 = __num_anad_and_aqp3 = __num_taxa = __p_value = 0
    __anadromy_lookup = dict()  #Dictionary matching FASTA file names (key) to a list of taxa names and character states
    SCIENTIFIC_INDEX = 0
    COMMON_INDEX = 1
    ANAD_INDEX = 2
    AQP3_INDEX = 3
    EPSILON = 0.00000000000000000001  #Number being added to anadromy/aqp3 variables to avoid division by 0 in effect size

    #Public Methods

    #--------------------------constructor--------------------------------------
    # Description: Constructs ASTree and sets default value for tree, and creates
    #              the 2D list for transition rate matrix, setting initial
    #              values to 0.
    #---------------------------------------------------------------------------
    def __init__(self):
        self.__tree = None
        self.__transition_prob_anad = [[0.0 for x in range(2)]
                                       for y in range(2)]
        self.__transition_prob_aqp3 = [[0.0 for x in range(2)]
                                       for y in range(2)]

    #end constructor

    #-----------------------------build_tree------------------------------------
    # Description: Builds phylogenetic tree from newick tree file in RAxML result.
    #---------------------------------------------------------------------------
    def build_tree(self, path):
        rax_file = open(path, "r")
        if rax_file.mode == "r":
            contents = rax_file.read()
            self.__tree = Tree(contents)
            print("\nRAxML tree imported successfully.")
        else:
            print(
                "\nRAxML tree failed to import successfully. Please check the file path and try again."
            )

    #end build_tree

    #-----------------------run_max_parsimony-----------------------------------
    # Description: Calls private functions for Fitch's algorithm of maximum
    #              parsimony.
    #---------------------------------------------------------------------------
    def run_max_parsimony(
        self
    ):  #Calls private functions for Fitch's algorithm of maximum parsimony
        if self.__tree is None:
            print(
                "\n****************Error****************\nTree has not been imported. Please run build_tree method first."
            )
        else:
            self.__tree.resolve_polytomy(
            )  #Transform tree to bifurcating - does nothing if already bifurcating
            self.__down_pass()
            self.__up_pass()
            self.__clean_tree()
            self.__find_char_states()
            self.__find_transition_prob()
            self.__effect_size = self.calc_effect_size(self.__num_anad + self.EPSILON,\
            self.__num_aqp3 + self.EPSILON, self.__num_anad_and_aqp3 + self.EPSILON)

    #end run_max_parsimony

    #-----------------------------get_num_taxa----------------------------------
    # Description: Returns number of taxa.
    #---------------------------------------------------------------------------
    def get_num_taxa(self):
        return self.__num_taxa

    #end get_num_taxa

    #-----------------------------get_p_value-----------------------------------
    # Description: Returns the P-Value of the hypothesis test.
    #---------------------------------------------------------------------------
    def get_p_value(self):
        return self.__p_value

    #end get_p_value

    #--------------------------import_lookup------------------------------------
    # Description: Imports the look-up file for assigning character state
    #              changes and taxa names.
    #---------------------------------------------------------------------------
    def import_lookup(
        self, path
    ):  #Imports the look-up file for assigning character state changes and taxa names
        import_file = xlrd.open_workbook(path)
        file = import_file.sheet_by_index(0)
        values = list()  #Local list for holding cell row information

        for row in range(
                1, file.nrows):  #Nested loops to cover entire spreadsheet
            for col in range(
                    file.ncols
            ):  #Creates a list of the scientific names, common names and character states for each fish in file
                if col == 0:
                    file_name = file.cell_value(row, col)
                    values.append(file_name)
                elif col == 1:
                    scientific_name = file.cell_value(row, col)
                    values.append(scientific_name)
                elif col == 2:
                    common_name = file.cell_value(row, col)
                    values.append(common_name)
                elif col == 3:
                    anadromous = int(file.cell_value(row, col))
                    values.append(anadromous)
                else:
                    aqp3 = int(file.cell_value(row, col))
                    values.append(aqp3)
                    self.__anadromy_lookup[values[0]] = values[1:]
                    values.clear()

        self.__num_taxa = len(self.__anadromy_lookup)

    #end import_lookup

    #----------------------------show_tree--------------------------------------
    # Description: Displays tree in console and opens an external window to
    #              interact with tree and see branch length.
    #---------------------------------------------------------------------------
    def show_tree(self):
        print(
            self.__tree.get_ascii(attributes=["name", "anadromy", "aqp3"],
                                  show_internal=True))
        self.__tree.show()

    #end show_tree

    #----------------------------to_string--------------------------------------
    # Description: Prints to console number of taxa and their names, as well as
    #              the number of character state changes.
    #---------------------------------------------------------------------------
    def to_string(self):
        if self.__tree is None or self.__effect_size == 0:
            return "\n****************Error****************\nTree not constructed, " \
                   "or maximum parsimony not yet run. Please run methods and try again."

        count = 0
        asr_info = "\n\t\tTaxa\n"
        for key in self.__anadromy_lookup:
            count += 1
            asr_info += str(count) + ": " + self.__anadromy_lookup[key][
                self.SCIENTIFIC_INDEX]
            asr_info += " (" + self.__anadromy_lookup[key][
                self.COMMON_INDEX] + ")\n"
        asr_info += "\nAnadromy Character State Changes: " + str(
            self.__num_anad)
        asr_info += "\nAQP3 Character State Changes: " + str(self.__num_aqp3)
        return asr_info

    #end to_string

    #------------------------calc_effect_size-----------------------------------
    # Description: Public method that calculates the effect size of the ASRTree.
    #---------------------------------------------------------------------------
    def calc_effect_size(self, numOfAnad, numOfAqp3, numAnadAndAqp3):
        effect_size = ((numAnadAndAqp3 / self.__num_of_branches) /
                       ((numOfAnad / self.__num_of_branches) *
                        (numOfAqp3 / self.__num_of_branches)))
        return effect_size

    #end calc_effect_size
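    # A worked check of the formula above with hypothetical counts: 10 of 40
    # branches anadromous, 12 of 40 expressing AQP3, and 8 with both gives
    #   (8/40) / ((10/40) * (12/40)) = 0.2 / 0.075 ≈ 2.67,
    # i.e. the traits co-occur ~2.7x more often than expected under independence.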

    #-------------------------monte_carlo_sim-----------------------------------
    # Description: Public method to run n number of Monte Carlo simulations
    #              in order to test the hypothesis. Each simulation checks
    #              the ancestral node in the tree, then refers to the transition
    #              rate matrix for the probability of getting the same or a
    #              different character state.
    #---------------------------------------------------------------------------
    def monte_carlo_sim(self, num_sims):
        #Checks if there already is a simulation tree to avoid unnecessary copies
        self.__p_value_count = 0  #Initialize back to 0
        self.__sim_effect_sizes.clear()  #Initialize back to empty
        if self.__sim_tree is None:
            self.__sim_tree = self.__tree.copy()
        for sim in range(num_sims):
            #Set values of each count back to the EPSILON value to avoid
            #division by 0 in the effect size
            aqp3_count = self.EPSILON
            anad_count = self.EPSILON
            anad_aqp3_count = self.EPSILON
            for node in self.__sim_tree.traverse("preorder"):
                rand_num_1 = random.randint(0, 1001)
                rand_num_2 = random.randint(0, 1001)
                if not node.is_root():
                    #Check each ancestor's character state, and roll a random
                    #number against the probability of going from that state to
                    #the same or a different state based on transition matrix
                    #and assign that character state. Tally all gains
                    if node.up.anadromy == 1:
                        if (self.__transition_prob_anad[1][0] *
                                1000) > rand_num_1:
                            node.add_feature("anadromy", 0)
                        else:
                            node.add_feature("anadromy", 1)
                            anad_count += 1
                    else:
                        if (self.__transition_prob_anad[0][1] *
                                1000) < rand_num_1:
                            node.add_feature("anadromy", 0)
                        else:
                            node.add_feature("anadromy", 1)
                            anad_count += 1
                    if node.up.aqp3 == 1:
                        if (self.__transition_prob_aqp3[1][0] *
                                1000) > rand_num_2:
                            node.add_feature("aqp3", 0)
                        else:
                            node.add_feature("aqp3", 1)
                            aqp3_count += 1
                    else:
                        if (self.__transition_prob_aqp3[0][1] *
                                1000) < rand_num_2:
                            node.add_feature("aqp3", 0)
                        else:
                            node.add_feature("aqp3", 1)
                            aqp3_count += 1
                    if node.anadromy == 1 and node.aqp3 == 1:
                        anad_aqp3_count += 1
            #Calculate the effect size and store the results.
            eff_size = self.calc_effect_size(anad_count, aqp3_count,
                                             anad_aqp3_count)
            self.__sim_effect_sizes.append(eff_size)
            if eff_size >= self.__effect_size:
                self.__p_value_count += 1
        self.__p_value = (self.__p_value_count / num_sims
                          )  #Calculate and store p-value

    #end monte_carlo_sim
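    # The sampling rule above in isolation: an integer roll in [0, 1001] is
    # compared against the transition probability scaled by 1000; e.g. with
    # P(1 -> 0) = 0.2, a parent in state 1 yields a child in state 0 whenever
    # 0.2 * 1000 = 200 exceeds the roll, i.e. with probability ~0.2.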

    #--------------------------plot_histogram-----------------------------------
    # Description: Public method to plot the histogram for testing the null
    #              hypothesis.
    #---------------------------------------------------------------------------
    def plot_histogram(self):
        plt.style.use('seaborn')
        _ = plt.hist(self.__sim_effect_sizes, bins=100)
        plt.axvline(self.__effect_size,
                    color='k',
                    linestyle='dashed',
                    linewidth=1)
        plt.text(self.__effect_size + .05, 200,
                 '   Actual Effect Size:{:.3f}'.format(self.__effect_size))
        plt.xlabel('Effect Size')
        plt.ylabel('Effect Frequency')
        plt.title('Monte Carlo Simulation Distribution')
        plt.show()

    #end plot_histogram

    #--------------------__find_transition_prob---------------------------------
    # Description: Private method that determines the transition probability
    #              of each character trait change.
    #---------------------------------------------------------------------------
    def __find_transition_prob(self):
        #Establish counter variables and traverse tree
        zero_to_one_anad = zero_to_zero_anad = one_to_zero_anad = one_to_one_anad = 0.0
        zero_to_one_aqp3 = zero_to_zero_aqp3 = one_to_zero_aqp3 = one_to_one_aqp3 = 0.0
        for node in self.__tree.traverse("postorder"):
            if not node.is_root():
                #Find Anadromy transitions
                if (node.up.anadromy == 0 and node.anadromy == 0):
                    zero_to_zero_anad += 1
                elif (node.up.anadromy == 0 and node.anadromy == 1):
                    zero_to_one_anad += 1
                elif (node.up.anadromy == 1 and node.anadromy == 0):
                    one_to_zero_anad += 1
                else:
                    one_to_one_anad += 1
                #Find AQP3 transitions
                if (node.up.aqp3 == 0 and node.aqp3 == 0):
                    zero_to_zero_aqp3 += 1
                elif (node.up.aqp3 == 0 and node.aqp3 == 1):
                    zero_to_one_aqp3 += 1
                elif (node.up.aqp3 == 1 and node.aqp3 == 0):
                    one_to_zero_aqp3 += 1
                else:
                    one_to_one_aqp3 += 1

        #Insert the probability into the appropriate matrix
        self.__transition_prob_anad[0][0] = (zero_to_zero_anad /
                                             self.__num_of_branches)
        self.__transition_prob_anad[0][1] = (zero_to_one_anad /
                                             self.__num_of_branches)
        self.__transition_prob_anad[1][1] = (one_to_one_anad /
                                             self.__num_of_branches)
        self.__transition_prob_anad[1][0] = (one_to_zero_anad /
                                             self.__num_of_branches)

        self.__transition_prob_aqp3[0][0] = (zero_to_zero_aqp3 /
                                             self.__num_of_branches)
        self.__transition_prob_aqp3[0][1] = (zero_to_one_aqp3 /
                                             self.__num_of_branches)
        self.__transition_prob_aqp3[1][1] = (one_to_one_aqp3 /
                                             self.__num_of_branches)
        self.__transition_prob_aqp3[1][0] = (one_to_zero_aqp3 /
                                             self.__num_of_branches)

    #end __find_transition_prob
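
    #A compact sketch (illustrative only, not from the original) of the same
    #tally-and-normalize idea for a single trait: count parent->child state
    #pairs over every non-root branch and divide by the branch total, exactly
    #as the method above does. Assumes the 0/1 integer states set elsewhere.
    def _transition_matrix_sketch(self, feature):
        matrix = [[0.0, 0.0], [0.0, 0.0]]
        branches = 0
        for node in self.__tree.traverse("postorder"):
            if node.is_root():
                continue
            branches += 1
            matrix[getattr(node.up, feature)][getattr(node, feature)] += 1
        return [[cell / branches for cell in row] for row in matrix]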

    #Private Methods
    #---------------------------__down_pass-------------------------------------
    # Description: Private method to perform down-pass to assign character state
    #              to tips and internal nodes.
    #---------------------------------------------------------------------------
    def __down_pass(self):
        for node in self.__tree.traverse("postorder"):
            #Check for internal nodes that have been visted - marked as "Ancestor"
            if node.name == "Ancestor":
                if not node.is_root():
                    #If the parent node of the current ancestor node is unvisited,
                    #attach the character state of this node to its ancestor
                    if node.up.name == "":
                        node.up.add_feature("anadromy", node.anadromy)
                        node.up.add_feature("aqp3", node.aqp3)
                        node.up.name = "Ancestor"
                    #If one state set contains the other, keep the intersection
                    if node.aqp3.issubset(
                            node.up.aqp3) or node.aqp3.issuperset(
                                node.up.aqp3):
                        node.up.add_feature(
                            "aqp3", node.up.aqp3.intersection(node.aqp3))
                    else:  #Otherwise, it's a union of two states
                        node.up.add_feature("aqp3",
                                            node.up.aqp3.union(node.aqp3))
                    #If one state set contains the other, keep the intersection
                    if node.anadromy.issubset(
                            node.up.anadromy) or node.anadromy.issuperset(
                                node.up.anadromy):
                        node.up.add_feature(
                            "anadromy",
                            node.up.anadromy.intersection(node.anadromy))
                    else:  #Otherwise, it's a union of two states
                        node.up.add_feature(
                            "anadromy", node.up.anadromy.union(node.anadromy))
            else:  #Otherwise, it could be an unnamed internal node, or a terminal node
                #If it's a terminal node, grab its states from the lookup
                if node.name in self.__anadromy_lookup:
                    isAnadromous = set(
                        [self.__anadromy_lookup[node.name][self.ANAD_INDEX]])
                    isAqp3 = set(
                        [self.__anadromy_lookup[node.name][self.AQP3_INDEX]])
                    node.add_feature("anadromy", isAnadromous)
                    node.add_feature("aqp3", isAqp3)

                    if node.up.name == "":  #If the internal node is not yet named, it is unvisited
                        node.up.add_feature("anadromy", isAnadromous)
                        node.up.add_feature("aqp3", isAqp3)
                        node.up.name = "Ancestor"  #Tag internal nodes as Ancestor to easily identify visited nodes

                    if self.__anadromy_lookup[node.name][
                            self.AQP3_INDEX] in node.up.aqp3:
                        node.up.add_feature(
                            "aqp3", node.aqp3.intersection(node.up.aqp3))
                    else:
                        node.up.add_feature("aqp3",
                                            node.up.aqp3.union(node.aqp3))

                    if self.__anadromy_lookup[node.name][
                            self.ANAD_INDEX] in node.up.anadromy:
                        node.up.add_feature(
                            "anadromy",
                            node.anadromy.intersection(node.up.anadromy))
                    else:
                        node.up.add_feature(
                            "anadromy", node.up.anadromy.union(node.anadromy))
                        #Finally, relabel the tip with its common name
                        node.name = self.__anadromy_lookup[node.name][
                            self.COMMON_INDEX]

    #end __down_pass

    #----------------------------__up_pass--------------------------------------
    # Description: Private method to perform up-pass to clear any union in
    #              ancestor nodes by finding the intersection of the
    #              ancestor and its parent node.
    #---------------------------------------------------------------------------
    def __up_pass(self):  #Up-pass to clear any union in ancestor nodes
        for node in self.__tree.traverse("preorder"):
            if node.name is "Ancestor":
                if not node.is_root():
                    if len(node.anadromy) > 1:
                        node.add_feature(
                            "anadromy",
                            node.anadromy.intersection(node.up.anadromy))
                    if len(node.aqp3) > 1:
                        node.add_feature("aqp3",
                                         node.aqp3.intersection(node.up.aqp3))

    #end __up_pass

    #--------------------------__clean_tree-------------------------------------
    # Description: Private function to clear the sets in the attributes for
    #              anadromy and AQP3 in each node and turn them into integers.
    #---------------------------------------------------------------------------
    def __clean_tree(self):
        for node in self.__tree.traverse("preorder"):
            character_state_anad = next(iter(node.anadromy))
            character_state_aqp3 = next(iter(node.aqp3))
            node.add_feature("anadromy", character_state_anad)
            node.add_feature("aqp3", character_state_aqp3)

    #end __clean_tree
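
    #A compressed sketch (illustrative, not from the original) of the textbook
    #Fitch down-pass that __down_pass and __up_pass implement with set-valued
    #features: intersect child state sets where they agree, union otherwise.
    def _fitch_down_pass_sketch(self, feature):
        for node in self.__tree.traverse("postorder"):
            if node.is_leaf():
                continue
            child_sets = [getattr(child, feature) for child in node.children]
            common = set.intersection(*child_sets)
            #Fitch's rule: intersection if non-empty, otherwise the union
            node.add_feature(feature,
                             common if common else set.union(*child_sets))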

    #-------------------------__find_char_states---------------------------------
    # Description: Private function to count the number of branches and tally
    #              the character states - both individual traits and branches
    #              with both anadromy and AQP3.
    #---------------------------------------------------------------------------
    def __find_char_states(self):
        for node in self.__tree.traverse("preorder"):
            self.__num_of_branches += 1
            if node.anadromy == 1 and node.aqp3 == 1:
                self.__num_anad_and_aqp3 += 1
            if node.anadromy == 1:
                self.__num_anad += 1
            if node.aqp3 == 1:
                self.__num_aqp3 += 1
        self.__num_of_branches -= 1  #Not counting the root as a separate branch
Example #22
0
 all_dists = []
 for othersp_seq in seqids_of_other_species:
     dist = (t & seqid).get_distance(othersp_seq, topology_only=True)
     all_dists.append(dist)
 # find indexes of the three shortest distances
 try:
     idxes_of_3_smallest = np.argpartition(np.array(all_dists), 3)[:3]
 except ValueError:
     # argpartition with kth=3 fails when only 3 distances exist;
     # in that case all 3 sequences are the closest
     idxes_of_3_smallest = np.argpartition(np.array(all_dists), 2)
 closest_seq_ids = [seqid]
 for d in idxes_of_3_smallest:
     closest_seq_ids.append(seqids_of_other_species[d])
 # ete3 has codeml handling implemented!! No need for own functions.
 subtree = t.copy()
 subtree.prune(closest_seq_ids, preserve_branch_length=True)
 subtree.unroot()
 evotree = EvolTree(subtree.write())
 subfasta = make_clean_fasta(closest_seq_ids, seqdatadict)
 if not subfasta:
     omega_list.append("NA")
     continue
 else:
     evotree.link_to_alignment(subfasta)
     workdirname = './codeml_' + "__".join(closest_seq_ids)
     evotree.workdir = workdirname
     list_of_tempdirs.append(workdirname)
     # mark the foreground branch
     foreground_leafnode = evotree & seqid
     #			print (seqid)
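 # The excerpt breaks off here. A plausible continuation (an assumption, not
 # taken from the source) using ete3's EvolTree interface: mark the foreground
 # branch, fit a branch model, and pull its omega. The 'b_free' model choice
 # and the 'w' key follow the ete3 evol tutorial but are assumptions here.
 evotree.mark_tree([foreground_leafnode.node_id], marks=["#1"])
 evotree.run_model("b_free.fg")
 fitted = evotree.get_evol_model("b_free.fg")
 omega_list.append(
     fitted.branches.get(foreground_leafnode.node_id, {}).get("w", "NA"))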
Example #23
0
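# Minimal setup this snippet assumes but does not show (the names t and
# leaves come from the snippet itself; the 12-leaf random tree is purely
# for demonstration):
from random import sample
from ete3 import Tree

t = Tree()
t.populate(12)
leaves = t.get_leaves()
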
# GET 4 RANDOM INDICES TO PRUNE
indices = sample(range(0, len(leaves)), 4)
print "\nRANDOM 4 INDICES: " + ', '.join(str(x) for x in indices)

# USE THOSE INDICES TO GET 4 RANDOM NODES
to_prune = []
for index in indices:
    to_prune.append(leaves[index])

print("\nTO PRUNE ")
print(to_prune)
print("\n")

# COPY THE TREE TO NOT LOSE DATA
c = t.copy()

# PRUNE THE TREE
c.prune(to_prune)
print(c)


# END RESULT
# Old tree still stored in "t"
# Pruned tree stored in "c"





# Reads in a file with a tree structure and returns an ete3 tree object
Example #24
0
true_tree = Tree(mstree.newick(node_labels=labels))

RAX_MIN_BL = 1e-6
#### convert branch lengths to # expected substitutions
for node in true_tree.traverse("postorder"):
    node.dist = node.dist * mutation_rate
    # clip the min branch length to make it workable with raxml-ng
    node.dist = max(RAX_MIN_BL, node.dist)

#### print the true_tree as newick
with open(os.path.join(out_dir, "true_tree.newick"), "w+") as f:
    f.write(true_tree.write(format=5, dist_formatter="%.12f"))

# prep the copy to work on
ref_tree = true_tree.copy()

#### randomly select one individual per population for the reference set,
# add the rest for the query set
ref_map = defaultdict(list)
qry_map = defaultdict(list)
for k, v in pop_species_map.items():
    ref = rd.randint(len(v), size=1)[0]
    for i in range(len(v)):
        if i == ref:
            ref_map[k].append(v[i])
        else:
            qry_map[k].append(v[i])

# already prune out the query taxa
ref_list = [v[0] for k, v in ref_map.items()]
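
# The excerpt ends here; a presumable next step (an assumption, not shown in
# the source) prunes the copy down to the one reference taxon kept per
# population while preserving branch lengths:
ref_tree.prune(ref_list, preserve_branch_length=True)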
Example #25
0
def main(S, G, number_of_leaves, path, k, running_time, number_of_planted_vertices):
    global random_for_precentage, all_edges, TH_edges_in_subtree, compare_subtrees, \
        TH_pattern_in_subtree, TH_compare_subtrees, both, TH_both, accur
    starting_time = datetime.now()
    new_G = nx.DiGraph()
    noise = 0
    number_of_HT_under_planted = 10
    S = Tree()
    sigma = {}
    nCr_lookup_table = {}
    fact_lookup_table = {}
    colors = {}
    S_dis_matrix = {}
    names = []
    S_colors = {}
    G_internal_colors = {}
    sol = {}

    for i in range(0, number_of_leaves):
        names.append(sym + str(i))
    S.populate(number_of_leaves, names_library=names)
    count_nodes_and_update_internal_names(S)
    #S = random_again(S, number_of_leaves / 4)
    colors = random_colors(S, colors)

    G = S.copy("newick")
    for leaf in G.iter_leaves():
        if leaf.name[:6] == 'Specie':
            leaf.name = "Gene" + leaf.name[6:]
        else:
            leaf.name = "GeneI" + leaf.name[8:]
    print_tree(G,'G',path)
    print_tree(S,'S',path)

    sigma = create_sigme(number_of_leaves, sigma)
    utils.newick2edgelist.main(path)
    save_edgelist(S_dis_matrix,path)

    S = tr.Tree.get_from_path(path + "/phyliptree(binary,all).phy", schema="newick")
    G = tr.Tree.get_from_path(path + "/GeneTree(binary)_local.txt", schema="newick")

    S = utiles.init_internal_labels(S, 'x', sigma, path)
    G = utiles.init_internal_labels(G, 'u', sigma, path)

    G = tree_operations.collapse_edges(G)
    S = tree_operations.collapse_edges(S)

    S_labels_table, G_labels_table,sigma = inits.init_taxon_to_label_table(S, G, sigma)

    sigma, old_sigma = inits.update_sigma(S, G, 0, sigma, False, path, True, S_labels_table, G_labels_table)
    colors, old_colors = inits.update_colors(S, colors, True)
    max_dis = tree_operations.max_dis(S_dis_matrix)

    flag = True
    j = 0
    all_random_sources_red_to_red = []
    all_random_sources_black_to_black = []
    all_random_neutral = []
    all_random_sources = (all_random_sources_red_to_red, all_random_sources_black_to_black, all_random_neutral)
    new_G = tree_operations.weight_G_based_on_same_color_HT(G, new_G, [],
                                                            [],[],[], 0, False,
                                                            'HT', False, k)
    new_G = tree_operations.number_of_edges_in_subtree(new_G)

    S_colors = tree_operations.color_tree(S, 'S', S_colors, colors, sigma)
    G_internal_colors = tree_operations.color_tree(G, 'G', G_internal_colors, colors, sigma)
    if not on_lab:
        draw.draw_S_and_G(S, G, old_sigma, colors, sigma, path, None, '_rand_before')

    if not running_time:
        while j < number_of_planted_vertices:
            print('*****         %sth vertex            ******' % str(j))
            sol[j] = {}
            nCr_lookup_table, fact_lookup_table, (
                sol[j]['Marked'], sol[j]['list_of_couples']), colors = choose_planted_vertex(S_dis_matrix,new_G, S, G,
                                                                                            G_internal_colors,
                                                                                            TH_edges_in_subtree,
                                                                                            compare_subtrees,
                                                                                            TH_compare_subtrees,
                                                                                            sigma,
                                                                                            k,
                                                                                            both,
                                                                                            TH_both, j, sol, accur,
                                                                                            nCr_lookup_table,
                                                                                            fact_lookup_table,
                                                                                            all_random_sources,
                                                                                            colors,
                                                                                            S_colors, max_dis)
            if not sol[j]['Marked']:
                flag = flag and sol[j]['Marked']
            else:
                sigma, old_sigma, y = change_sigma(sigma, old_sigma, S, G, sol[j]['list_of_couples'],
                                                   number_of_HT_under_planted,S_labels_table,G_labels_table)
                S_colors = tree_operations.color_tree(S, 'S', S_colors, colors, sigma)
                G_internal_colors = tree_operations.color_tree(G, 'G', G_internal_colors, colors, sigma)
            j += 1
            if not flag:
                if not on_lab:
                    draw.draw_S_and_G(S, G, old_sigma, colors, sigma, path, None, '_rand')
                old_colors = return_color_to_taxon(S, colors)
                save_data(old_sigma, old_colors, sol, noise, 0, compare,path)
                if not running_time:
                    quit()
            print('Planted vertices:%s' % str(sol))
            if not on_lab:
                draw.draw_S_and_G(S, G, old_sigma, colors, sigma, path, sol, '_rand' + str(noise) + '.' + str(0))
            old_colors = return_color_to_taxon(S, colors)
            save_data(old_sigma, old_colors, sol, noise, 0,compare,path)
            return_planted_nodes_new_name(sol,G,path)
        p = Pool(15)
        parameters = [(noise_level[i],number_of_HT_under_planted,G_internal_colors,S_colors,nCr_lookup_table,fact_lookup_table,number_of_leaves) for i in range(0,len(noise_level))]
        p.map(create_tree_for_HT_and_colors_noise, parameters)
        p.map(create_tree_for_color_noise, parameters)
        p.map(create_tree_for_HT_noise, parameters)


    else:
        save_data(old_sigma, old_colors, sol, noise, 0, compare, path)
    print('Running time: %s' % str(datetime.now() - starting_time))
Example #26
0
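# Helper functions referenced below (intersection, get_unique, number_labels)
# are not shown in this excerpt; plausible minimal versions, offered purely as
# assumptions, might look like this:
def intersection(a, b):
    # labels shared by both leaf lists
    return [x for x in a if x in set(b)]

def get_unique(a, b):
    # count of labels appearing in exactly one of the two lists
    return len(set(a) ^ set(b))

def number_labels(start, end):
    # string labels '0'..'end-1' used to name the input trees
    return [str(i) for i in range(start, end)]
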
def main(arg1, arg2):
    start_time = time.time()
    with open(arg1) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    ###print(content)
    if (arg2 == "common"):
        leaf_lists = {}
        for i in range(len(content)):
            t1 = Tree(content[i])
            leaf_lists[i] = []
            for leaf in t1:
                leaf_lists[i].append(leaf.name)
        ##print(leaf_lists)
        distance_mat = []
        for x in range(0, len(content)):
            distance_mat.append([])
            for y in range(0, x):
                lev_dist = len(intersection(leaf_lists[x], leaf_lists[y]))
                distance_mat[x].append(lev_dist)
        #.pop(0)
        M_labels = number_labels(0, len(content))
        tree, order = UPGMA_inc.UPGMA(distance_mat, M_labels)
        ##print(tree)
        tree = tree + ';'
        t_order = Tree(tree)
        order_list = []
        for node in t_order.traverse("postorder"):
            # Do some analysis on node
            if node.is_leaf():
                order_list.append(node.name)

        ###print(t_order)
        ###print(order_list)
        #min_x=distance_mat.index(min(distance_mat))
        #min_y=distance_mat[min_x].index(min(distance_mat[min_x]))

        ###print(min_x,min_y)
        ###print(distance_mat[min_x][min_y])
        t2 = Tree(content[int(order_list[0])])
        for i in range(0, len(order_list) - 1):

            t1 = Tree(content[int(order_list[i + 1])])
            tree1_copy = t1.copy()
            t2 = Tree(scm(t1, t2))

            #splits1=rf_dist_list.main(tree1_copy.copy(),tree2_copy.copy())
            ###print("splits 1:  ", splits1)
            ###print("splits 2:  ",splits2)
        ###print(t2.write(format=9))
        #t2.show()
    elif (arg2 == "uncommon"):
        leaf_lists = {}
        for i in range(len(content)):
            t1 = Tree(content[i])
            leaf_lists[i] = []
            for leaf in t1:
                leaf_lists[i].append(leaf.name)
        ###print(leaf_lists)
        distance_mat = []
        for x in range(0, len(content)):
            distance_mat.append([])
            for y in range(0, x):
                lev_dist = get_unique(leaf_lists[x], leaf_lists[y])
                distance_mat[x].append(lev_dist)
        #.pop(0)
        M_labels = number_labels(0, len(content))
        tree, order = UPGMA.UPGMA(distance_mat, M_labels)
        ###print(tree)
        tree = tree + ';'
        t_order = Tree(tree)
        order_list = []
        for node in t_order.traverse("postorder"):
            # Do some analysis on node
            if node.is_leaf():
                order_list.append(node.name)

        ###print(t_order)
        ##print(order_list)
        #min_x=distance_mat.index(min(distance_mat))
        #min_y=distance_mat[min_x].index(min(distance_mat[min_x]))

        ###print(min_x,min_y)
        ###print(distance_mat[min_x][min_y])
        t2 = Tree(content[int(order_list[0])])
        for i in range(0, len(order_list) - 1):

            t1 = Tree(content[int(order_list[i + 1])])
            tree1_copy = t1.copy()
            t2 = Tree(scm(t1, t2))

            leaf_list1 = []
            leaf_list2 = []
            for leaf in t1:
                leaf_list1.append(leaf.name)
            for leaf in t2:
                leaf_list2.append(leaf.name)
            ###print(leaf_list1)
            overlap = intersection(leaf_list1, leaf_list2)
            ###print("overlap is: ",overlap)

            tree2_copy = t2.copy()
            ###print(tree1_copy,tree2_copy)

            tree1_copy.prune(overlap)
            tree2_copy.prune(overlap)

            #t.write(format=1
            splits2 = rf_dist_list.main(tree2_copy.copy(), tree1_copy.copy())
            #splits1=rf_dist_list.main(tree1_copy.copy(),tree2_copy.copy())
            ###print("splits 1:  ", splits1)
            ###print("splits 2:  ",splits2)
    else:
        t2 = Tree(content[0])
        for i in range(0, len(content) - 1):

            t1 = Tree(content[i + 1])
            tree1_copy = t1.copy()
            t2 = Tree(scm(t1, t2))

            leaf_list1 = []
            leaf_list2 = []
            for leaf in t1:
                leaf_list1.append(leaf.name)
            for leaf in t2:
                leaf_list2.append(leaf.name)
            ###print(leaf_list1)
            #overlap=intersection(leaf_list1,leaf_list2)
            ###print("overlap is: ",overlap)

            #tree2_copy=t2.copy()
            ###print(tree1_copy,tree2_copy)

            #tree1_copy.prune(overlap)
            #tree2_copy.prune(overlap)

            #t.write(format=1
            #splits2=rf_dist_list.main(tree2_copy.copy(),tree1_copy.copy())
            #splits1=rf_dist_list.main(tree1_copy.copy(),tree2_copy.copy())
            ###print("splits 1:  ", splits1)
            ###print("splits 2:  ",splits2)
    ###print(time.time()-start_time)
    ###print(t2.write(format=9))
    #t2.show()
    return t2
Example #27
0
def train_placement_distances(rank_training_seqs: dict,
                              taxonomic_ranks: dict,
                              ref_fasta_dict: dict,
                              test_fasta: FASTA,
                              ref_pkg: ReferencePackage,
                              leaf_taxa_map: dict,
                              molecule: str,
                              executables: dict,
                              raxml_threads=4):
    """
    Function for iteratively performing leave-one-out analysis for every taxonomic lineage represented in the tree,
    yielding an estimate of placement distances corresponding to taxonomic ranks.

    :param rank_training_seqs: A dictionary storing the sequence names being used to test each taxon within each rank
    :param taxonomic_ranks: A dictionary mapping rank names (e.g. Phylum)
     to rank depth values where Kingdom is 0, Phylum is 1, etc.
    :param ref_fasta_dict: A dictionary with headers as keys and sequences as values containing only reference sequences
    :param test_fasta: A FASTA instance containing the deduplicated training sequences
    :param ref_pkg: A ReferencePackage instance
    :param leaf_taxa_map: A dictionary mapping TreeSAPP numeric sequence identifiers to taxonomic lineages
    :param molecule: Molecule type [prot | dna | rrna]
    :param executables: A dictionary mapping software to a path of their respective executable
    :param raxml_threads: Number of threads to be used by RAxML for parallel computation

    :return: Tuple of the taxonomic_placement_distances dictionary and the list of PQuery instances
    """

    logging.info(
        "\nEstimating branch-length placement distances for taxonomic ranks. Progress:\n"
    )
    taxonomic_placement_distances = dict()
    taxonomy_filtered_query_seqs = dict()
    pruned_ref_fasta_dict = dict()
    query_seq_name_map = dict()
    seq_dict = dict()
    pqueries = list()
    intermediate_files = list()
    aligner = "hmmalign"

    temp_tree_file = "tmp_tree.txt"
    temp_ref_aln_prefix = "taxonomy_filtered_ref_seqs"
    temp_query_fasta_file = "queries.fasta"
    query_multiple_alignment = aligner + "_queries_aligned.phy"

    # Read the tree as ete3 Tree instance
    ref_tree = Tree(ref_pkg.tree)

    bmge_file = executables["BMGE.jar"]
    if not os.path.exists(bmge_file):
        raise FileNotFoundError("Cannot find " + bmge_file)

    num_training_queries = 0
    for rank in rank_training_seqs:
        num_rank_training_seqs = 0
        for taxonomy in rank_training_seqs[rank]:
            num_rank_training_seqs += len(rank_training_seqs[rank][taxonomy])
        if len(rank_training_seqs[rank]) == 0:
            logging.error("No sequences available for estimating " + rank +
                          "-level placement distances.\n")
            return taxonomic_placement_distances, pqueries
        else:
            logging.debug(
                str(num_rank_training_seqs) + " sequences to train " + rank +
                "-level placement distances\n")
        num_training_queries += num_rank_training_seqs

    if num_training_queries < 30:
        logging.error("Too few (" + str(num_training_queries) +
                      ") sequences for training placement distance model.\n")
        return taxonomic_placement_distances, pqueries
    if num_training_queries < 50:
        logging.warning("Only " + str(num_training_queries) +
                        " sequences for training placement distance model.\n")
    step_proportion = setup_progress_bar(num_training_queries)
    acc = 0.0

    # For each rank from Class to Species (Kingdom & Phylum-level classifications to be inferred by LCA):
    for rank in sorted(rank_training_seqs, reverse=True):
        if rank not in taxonomic_ranks:
            logging.error("Rank '" + rank +
                          "' not found in ranks being used for training.\n")
            sys.exit(33)
        taxonomic_placement_distances[rank] = list()
        leaf_trimmed_taxa_map = trim_lineages_to_rank(leaf_taxa_map, rank)

        # Add the lineages to the Tree instance
        for leaf in ref_tree:
            leaf.add_features(
                lineage=leaf_trimmed_taxa_map.get(leaf.name, "none"))

        # Remove all sequences belonging to a taxonomic rank from tree and reference alignment
        for taxonomy in sorted(rank_training_seqs[rank]):
            logging.debug("Testing placements for " + taxonomy + ":\n")
            query_name = re.sub(r"([ /])", '_', taxonomy.split("; ")[-1])
            leaves_excluded = 0

            # Write query FASTA containing sequences belonging to `taxonomy`
            query_seq_decrementor = -1
            for seq_name in rank_training_seqs[rank][taxonomy]:
                query_seq_name_map[query_seq_decrementor] = seq_name
                taxonomy_filtered_query_seqs[str(
                    query_seq_decrementor)] = test_fasta.fasta_dict[seq_name]
                query_seq_decrementor -= 1
            logging.debug("\t" +
                          str(len(taxonomy_filtered_query_seqs.keys())) +
                          " query sequences.\n")
            acc += len(taxonomy_filtered_query_seqs.keys())
            write_new_fasta(taxonomy_filtered_query_seqs,
                            fasta_name=temp_query_fasta_file)
            intermediate_files.append(temp_query_fasta_file)

            for key in ref_fasta_dict.keys():
                node = key.split('_')[0]
                # Node with truncated and/or unclassified lineages are not in `leaf_trimmed_taxa_map`
                if node in leaf_trimmed_taxa_map and not re.match(
                        taxonomy, leaf_trimmed_taxa_map[node]):
                    pruned_ref_fasta_dict[node] = ref_fasta_dict[key]
                else:
                    leaves_excluded += 1

            unique_ref_headers = set([
                re.sub('_' + re.escape(ref_pkg.prefix), '', x)
                for x in pruned_ref_fasta_dict.keys()
            ])
            logging.debug("\t" + str(leaves_excluded) +
                          " sequences pruned from tree.\n")

            # Copy the tree since we are removing leaves of `taxonomy` and don't want this to be permanent
            tmp_tree = ref_tree.copy(method="deepcopy")
            # iteratively detaching the monophyletic clades generates a bad tree, so do it all at once
            tmp_tree.prune(pruned_ref_fasta_dict.keys(),
                           preserve_branch_length=True)
            # Resolve any multifurcations
            tmp_tree.resolve_polytomy()
            logging.debug("\t" + str(len(tmp_tree.get_leaves())) +
                          " leaves in pruned tree.\n")

            # Write the new reference tree with sequences from `taxonomy` removed
            tmp_tree.write(outfile=temp_tree_file, format=5)
            intermediate_files.append(temp_tree_file)

            ##
            # Run hmmalign, BMGE and RAxML to map sequences from the taxonomic rank onto the tree
            ##
            if aligner == "papara":
                temp_ref_phylip_file = temp_ref_aln_prefix + ".phy"
                # Write the reference MSA with sequences of `taxonomy` removed
                phy_dict = utilities.reformat_fasta_to_phy(
                    pruned_ref_fasta_dict)
                utilities.write_phy_file(temp_ref_phylip_file, phy_dict)
                aln_stdout = wrapper.run_papara(executables["papara"],
                                                temp_tree_file,
                                                temp_ref_phylip_file,
                                                temp_query_fasta_file, "prot")
                intermediate_files.append(temp_ref_phylip_file)
                os.rename("papara_alignment.default", query_multiple_alignment)
            elif aligner == "hmmalign":
                temp_ref_fasta_file = temp_ref_aln_prefix + ".fasta"
                temp_ref_profile = temp_ref_aln_prefix + ".hmm"
                sto_file = re.sub(r"\.phy$", ".sto", query_multiple_alignment)
                # Write the pruned reference FASTA file
                write_new_fasta(pruned_ref_fasta_dict, temp_ref_fasta_file)
                # Build the HMM profile that doesn't include pruned reference sequences
                wrapper.build_hmm_profile(executables["hmmbuild"],
                                          temp_ref_fasta_file,
                                          temp_ref_profile)
                # Currently not supporting rRNA references (phylogenetic_rRNA)
                aln_stdout = wrapper.profile_aligner(executables,
                                                     temp_ref_fasta_file,
                                                     temp_ref_profile,
                                                     temp_query_fasta_file,
                                                     sto_file)
                # Reformat the Stockholm format created by cmalign or hmmalign to Phylip
                sto_dict = file_parsers.read_stockholm_to_dict(sto_file)
                for seq_name in sto_dict:
                    try:
                        int(seq_name.split('_')[0])
                        seq_dict[seq_name.split('_')[0]] = sto_dict[seq_name]
                    except ValueError:
                        seq_dict[seq_name] = sto_dict[seq_name]
                write_new_fasta(seq_dict, query_multiple_alignment)
                intermediate_files += [
                    temp_ref_fasta_file, temp_ref_profile, sto_file,
                    query_multiple_alignment
                ]
            else:
                logging.error("Unrecognised alignment tool '" + aligner +
                              "'. Exiting now.\n")
                sys.exit(33)
            logging.debug(str(aln_stdout) + "\n")

            trim_command, query_filtered_multiple_alignment = wrapper.get_msa_trim_command(
                executables, query_multiple_alignment, molecule)
            launch_write_command(trim_command)
            intermediate_files += glob(query_filtered_multiple_alignment + "*")

            # Ensure reference sequences haven't been removed
            msa_dict, failed_msa_files, summary_str = file_parsers.validate_alignment_trimming(
                [query_filtered_multiple_alignment], unique_ref_headers, True)
            nrow, ncolumn = file_parsers.multiple_alignment_dimensions(
                seq_dict=read_fasta_to_dict(query_filtered_multiple_alignment),
                mfa_file=query_filtered_multiple_alignment)
            logging.debug("Columns = " + str(ncolumn) + "\n")
            if query_filtered_multiple_alignment not in msa_dict.keys():
                logging.debug(
                    "Placements for '" + taxonomy +
                    "' are being skipped after failing MSA validation.\n")
                for old_file in intermediate_files:
                    os.remove(old_file)
                intermediate_files.clear()
                continue
            logging.debug("Number of sequences discarded: " + summary_str +
                          "\n")

            # Run RAxML with the parameters specified
            raxml_files = wrapper.raxml_evolutionary_placement(
                executables["raxmlHPC"], temp_tree_file,
                query_filtered_multiple_alignment, ref_pkg.sub_model, "./",
                query_name, raxml_threads)

            # Parse the JPlace file to pull distal_length+pendant_length for each placement
            jplace_data = jplace_parser(raxml_files["jplace"])
            placement_tree = jplace_data.tree
            node_map = map_internal_nodes_leaves(placement_tree)
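            # Per the jplace format, each entry in a placement's 'p' list is
            # [edge_num, likelihood, like_weight_ratio, distal_length,
            # pendant_length], which is what the indexes 0-4 below unpack;
            # the 'n' key holds the query sequence name(s).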
            for pquery in jplace_data.placements:
                top_lwr = 0.1
                top_placement = PQuery(taxonomy, rank)
                for name, info in pquery.items():
                    if name == 'p':
                        for placement in info:
                            # Only record the best placement's distance
                            lwr = float(placement[2])
                            if lwr > top_lwr:
                                top_lwr = lwr
                                top_placement.inode = placement[0]
                                top_placement.likelihood = placement[1]
                                top_placement.lwr = lwr
                                top_placement.distal = round(
                                    float(placement[3]), 6)
                                top_placement.pendant = round(
                                    float(placement[4]), 6)
                                leaf_children = node_map[int(
                                    top_placement.inode)]
                                if len(leaf_children) > 1:
                                    # Reference tree with clade excluded
                                    parent = tmp_tree.get_common_ancestor(
                                        leaf_children)
                                    tip_distances = parent_to_tip_distances(
                                        parent, leaf_children)
                                    top_placement.mean_tip = round(
                                        float(
                                            sum(tip_distances) /
                                            len(tip_distances)), 6)
                    elif name == 'n':
                        top_placement.name = query_seq_name_map[int(
                            info.pop())]
                    else:
                        logging.error("Unexpected variable in pquery keys: '" +
                                      name + "'\n")
                        sys.exit(33)

                if top_placement.lwr >= 0.5:  # The minimum likelihood weight ratio a placement requires to be included
                    pqueries.append(top_placement)
                    taxonomic_placement_distances[rank].append(
                        top_placement.total_distance())

            # Remove intermediate files from the analysis of this taxon
            intermediate_files += list(raxml_files.values())
            for old_file in intermediate_files:
                os.remove(old_file)
            # Clear collections
            taxonomy_filtered_query_seqs.clear()
            intermediate_files.clear()
            pruned_ref_fasta_dict.clear()
            seq_dict.clear()
            query_seq_name_map.clear()

            while acc > step_proportion:
                acc -= step_proportion
                sys.stdout.write('-')
                sys.stdout.flush()

        if len(taxonomic_placement_distances[rank]) == 0:
            logging.debug("No samples available for " + rank + ".\n")
        else:
            stats_string = "RANK: " + rank + "\n"
            stats_string += "\tSamples = " + str(
                len(taxonomic_placement_distances[rank])) + "\n"
            stats_string += "\tMedian = " + str(
                round(utilities.median(taxonomic_placement_distances[rank]),
                      4)) + "\n"
            stats_string += "\tMean = " + str(
                round(
                    float(sum(taxonomic_placement_distances[rank])) /
                    len(taxonomic_placement_distances[rank]), 4)) + "\n"
            logging.debug(stats_string)
    sys.stdout.write("-]\n")
    return taxonomic_placement_distances, pqueries