예제 #1
0
def generate_tree_with_splits_from_tree(t, force_fully_resolved=False):
    """Wrap ``t`` in a ``PhylogeneticTree`` and compute its splits.

    When ``force_fully_resolved`` is true, ``resolve_polytomies`` is run on
    ``t`` (with ``update_splits=False``) before it is wrapped.

    Returns the new ``PhylogeneticTree`` after ``calc_splits()`` has been
    called on it.
    """
    if force_fully_resolved:
        resolve_polytomies(t, update_splits=False)

    phylo_tree = PhylogeneticTree(t)

    _LOG.debug("calculating splits")
    phylo_tree.calc_splits()
    _LOG.debug("end generating tree from string")

    return phylo_tree
예제 #2
0
 def phylogeneticTreeFromFile(self, treefile, file_format):
     """Read the first tree stored in ``treefile`` as a ``PhylogeneticTree``.

     ``treefile`` is a path; ``file_format`` is passed to ``Dataset.read``
     as its ``schema``.  The first tree of the first tree list is wrapped,
     its splits are calculated, and ``delete_outdegree_one`` is then applied
     to the underlying tree.

     Returns the resulting ``PhylogeneticTree``.
     """
     dataset = Dataset()
     # Use a context manager so the file handle is closed even if parsing
     # fails; the original left it for the garbage collector.
     # NOTE(review): the 'U' flag was removed in Python 3.11; it is kept
     # here because this snippet appears to target Python 2 -- confirm.
     with open(treefile, 'rU') as tree_stream:
         dataset.read(tree_stream, schema=file_format)
     dendropy_tree = dataset.tree_lists[0][0]
     tree = PhylogeneticTree(dendropy_tree)
     tree.calc_splits()
     delete_outdegree_one(tree._tree)
     return tree
예제 #3
0
def generate_tree_with_splits_from_tree(t, force_fully_resolved=False):
    """Build a ``PhylogeneticTree`` from ``t`` with its splits calculated.

    If ``force_fully_resolved`` is set, polytomies in ``t`` are resolved
    first via ``resolve_polytomies(t, update_splits=False)``.

    Returns the wrapped tree once ``calc_splits()`` has run.
    """
    if force_fully_resolved:
        resolve_polytomies(t, update_splits=False)

    result = PhylogeneticTree(t)

    _LOG.debug("calculating splits")
    result.calc_splits()
    _LOG.debug("end generating tree from string")

    return result
예제 #4
0
 def phylogeneticTreeFromFile(self, treefile, file_format):
     """Load the first tree found in ``treefile`` as a ``PhylogeneticTree``.

     ``treefile`` is a path and ``file_format`` is forwarded to
     ``Dataset.read`` as ``schema``.  Splits are calculated on the wrapped
     tree, after which ``delete_outdegree_one`` is applied to the
     underlying tree.

     Returns the resulting ``PhylogeneticTree``.
     """
     dataset = Dataset()
     # Close the input file deterministically rather than leaking the
     # handle as the original ``open(...)`` argument expression did.
     # NOTE(review): mode 'rU' is rejected by Python 3.11+; retained on the
     # assumption that this code runs on Python 2 -- confirm.
     with open(treefile, 'rU') as tree_stream:
         dataset.read(tree_stream, schema=file_format)
     dendropy_tree = dataset.tree_lists[0][0]
     tree = PhylogeneticTree(dendropy_tree)
     tree.calc_splits()
     delete_outdegree_one(tree._tree)
     return tree
예제 #5
0
def _nearest_leaf_labels(rooted_tree, new_root, count):
    """Reroot ``rooted_tree`` at ``new_root`` and list its closest leaves.

    Returns the taxon labels of at most ``count`` leaves, ordered by
    increasing ``distance_from_root()`` (ties keep leaf iteration order).
    """
    rooted_tree.reroot_at_node(new_root)
    by_distance = sorted(
        ((leaf.taxon.label, leaf.distance_from_root())
         for leaf in rooted_tree.leaf_nodes()),
        key=lambda pair: pair[1])
    # Slicing (rather than indexing 0..count-1) tolerates trees that have
    # fewer than ``count`` leaves instead of raising IndexError.
    return [label for label, _distance in by_distance[:count]]


def main(args):
    """Write the labels of the leaves nearest each end of the centroid edge.

    Reads a Newick tree from ``args.input_tree_file``, resolves polytomies,
    finds the centroid edge, and writes up to 25 leaf labels from each side
    of that edge (50 in total, one per line) to ``<args.output>/X.lab``.
    """
    leaves_per_side = 25

    # Step 1: Decompose tree
    tree = dendropy.Tree.get(path=args.input_tree_file, schema="newick")
    tree.resolve_polytomies(limit=2, update_bipartitions=True)

    phy_tree = PhylogeneticTree(tree)
    centroid_edge = phy_tree.get_centroid_edge()

    # NOTE(review): the centroid edge is looked up on ``phy_tree`` (built
    # from ``tree``) before ``dd_tree`` is deep-copied, so the edge and its
    # end nodes may not be objects of ``dd_tree`` -- confirm this is intended.
    dd_tree = copy.deepcopy(tree)

    # Give missing branch lengths a value of 0 and record the longest one.
    max_e_w = 0.0
    for e in dd_tree.postorder_edge_iter():
        if e.length is None:
            e.length = 0
        else:
            max_e_w = max(max_e_w, e.length)

    # Stretch the centroid edge to 200x the longest branch.
    # presumably so leaves across the edge sort last -- TODO confirm
    centroid_edge.length = 200 * max_e_w

    # Capture both end nodes before any rerooting takes place.
    a = centroid_edge.head_node
    b = centroid_edge.tail_node

    X = _nearest_leaf_labels(dd_tree, a, leaves_per_side)
    X.extend(_nearest_leaf_labels(dd_tree, b, leaves_per_side))

    with open(args.output + "/X.lab", "w") as f:
        f.write("\n".join(X))
예제 #6
0
def main(args):
    """Decompose the input tree and write one leaf-label file per subtree.

    Reads a Newick tree from ``args.input_tree_file``, resolves polytomies,
    and passes it with ``args.max_subset_size`` to ``decompose_trees``.
    For the i-th resulting tree (counting from 0) the leaf names are
    written, one per line, to ``<args.output>_ctree<i>.lab``.
    """
    # Step 1: Decompose tree
    tree = dendropy.Tree.get(path=args.input_tree_file, schema="newick")
    tree.resolve_polytomies(limit=2, update_bipartitions=True)
    tree = PhylogeneticTree(tree)
    trees = decompose_trees(tree, args.max_subset_size)

    # Step 2: Write out leaf subsets
    for index, subtree in enumerate(trees):
        keep = subtree.leaf_node_names()
        with open(args.output + "_ctree" + str(index) + ".lab", 'w') as f:
            f.write("\n".join(keep))
예제 #7
0
 def testCentroidEdge(self):
     """Run the centroid check on the tree stored in ``100T.tree``.

     Loads the ``100T.fasta`` sequences, reads ``100T.tree`` against that
     dataset, asserts that exactly one tree was read, and hands the wrapped
     tree to ``self._do_test_centroid``.
     """
     sd = SequenceDataset()
     # Both inputs are opened in ``with`` blocks so the handles are closed
     # promptly; the original passed bare ``open(...)`` results and leaked them.
     with open(data_source_path('100T.fasta'), 'rU') as fasta_stream:
         sd.read(fasta_stream, file_format='FASTA', datatype='DNA')
     with open(data_source_path('100T.tree'), "rU") as tree_stream:
         tree_list = read_and_encode_splits(sd.dataset, tree_stream)
     self.assertEqual(len(tree_list), 1)
     t = PhylogeneticTree(tree_list[0])
     self._do_test_centroid(t)
예제 #8
0
def bisect_tree(tree,
                breaking_edge_style='mincluster',
                min_size=0,
                max_size=None,
                max_diam=None):
    """Split ``tree`` into two trees.

    ``breaking_edge_style`` selects the strategy:

    * ``'midpoint'``   -- ``midpoint_bisect`` on the underlying tree using
      ``min_size``; returns ``(None, None)`` if either half is ``None``.
    * ``'mincluster'`` -- ``min_cluster_size_bisect`` on the underlying tree
      using ``max_size``; a falsy half is returned as ``None``.
    * anything else    -- the edge from ``tree.get_breaking_edge`` is used
      with ``tree.bipartition_by_edge``.

    ``max_diam`` is accepted but not used in this function.

    Returns a pair of ``PhylogeneticTree`` objects (or ``None`` entries as
    described above).
    """
    _LOG.debug("bisecting tree...")

    # uym2: midpoint decomposition (not in use for now)
    if breaking_edge_style == 'midpoint':
        _LOG.debug("breaking by midpoint")
        first_half, second_half = midpoint_bisect(tree._tree,
                                                  min_size=min_size)
        if first_half is None or second_half is None:
            return None, None
        return PhylogeneticTree(first_half), PhylogeneticTree(second_half)

    # uym2: min_cluster decomposition
    if breaking_edge_style == 'mincluster':
        _LOG.debug("breaking using min-cluster strategy")
        first_half, second_half = min_cluster_size_bisect(tree._tree,
                                                          max_size)
        wrapped_first = None
        if first_half:
            wrapped_first = PhylogeneticTree(first_half)
        wrapped_second = None
        if second_half:
            wrapped_second = PhylogeneticTree(second_half)
        return wrapped_first, wrapped_second

    # Fallback: break at the edge chosen by the tree itself.
    _LOG.debug("breaking by centroid")
    breaking_edge = tree.get_breaking_edge(breaking_edge_style)
    _LOG.debug("breaking_edge length = %s, %s" %
               (breaking_edge.length, breaking_edge_style))
    leaves_before = tree.n_leaves
    part_one, part_two = tree.bipartition_by_edge(breaking_edge)
    _LOG.debug("Tree 1 has %s nodes, tree 2 has %s nodes" %
               (part_one.n_leaves, part_two.n_leaves))
    # No leaf may be lost or duplicated by the split.
    assert leaves_before == part_one.n_leaves + part_two.n_leaves
    return part_one, part_two
예제 #9
0
def main(args):
    """Bisect the input tree and write the leaf labels of each half.

    Reads a Newick tree from ``args.input_tree_file``, resolves polytomies,
    splits it with ``bisect_tree`` (default arguments), and writes the leaf
    names of the two halves, one per line, to ``<args.output>/A.lab`` and
    ``<args.output>/B.lab`` respectively.
    """
    # Step 1: Decompose tree
    tree = dendropy.Tree.get(path=args.input_tree_file, schema="newick")
    tree.resolve_polytomies(limit=2, update_bipartitions=True)
    tree = PhylogeneticTree(tree)
    t1, t2 = bisect_tree(tree)

    # Step 2: Write out leaf subsets
    for subtree, file_name in ((t1, "/A.lab"), (t2, "/B.lab")):
        keep = subtree.leaf_node_names()
        with open(args.output + file_name, "w") as f:
            f.write("\n".join(keep))
예제 #10
0
파일: pastajob.py 프로젝트: SagesWang/pasta
 def build_subsets_tree(self, curr_tmp_dir_par):
     """Build a tree in which every node stands for one alignment subset job.

     The tree is re-read from ``self.tree_str`` with leaf names translated to
     subset names, where a subset name is the part of the job's
     ``tmp_dir_par`` that follows ``curr_tmp_dir_par`` (plus one separator
     character).  Each leaf starts with a one-element set holding its job;
     internal nodes are labelled from their children, redundant edges are
     collapsed or removed, and edges whose two ends still share a job are
     contracted in sorted ``(length, edge)`` order.

     Side effect: ``self.pasta_team.subsets`` is replaced by a dict mapping
     each remaining node's label to its alignment job.

     Returns the resulting ``PhylogeneticTree``.  An ``assert`` requires
     that every remaining node carries exactly one job.
     """
     # translate: original leaf label -> subset name
     # t2:        subset name -> one-element set holding the alignment job
     translate={}
     t2 = {}
     for node in self.tree._tree.leaf_iter():
         nalsj = self.pasta_team.subsets[node.taxon.label]
         newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]
         translate[node.taxon.label] = newname
         t2[newname] = set([nalsj])
     subsets_tree = PhylogeneticTree(read_newick_with_translate(StringIO(self.tree_str),translate_dict=translate))
     for node in subsets_tree._tree.leaf_iter():
         # NOTE(review): t2 is keyed by subset name but indexed here with
         # node.taxon -- presumably the translated reader stores the name
         # itself as the taxon; confirm.
         node.alignment_subset_job = t2[node.taxon]
     del t2
     del translate
     _LOG.debug("nodes labeled")
     #subsets_tree._tree.infer_taxa()
     #_LOG.debug("fake taxa inferred")
     # Then make sure the tree is rooted at a branch (not at a node):
     # a root with more than two children is rerooted on its first child's edge.
     if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
         subsets_tree._tree.reroot_at_edge(subsets_tree._tree.seed_node.child_nodes()[0].edge)
     _LOG.debug("Subset Labeling (start):\n%s" %str(subsets_tree.compose_newick()))
     # Then label internal branches based on their children, and collapse redundant edges.
     for node in subsets_tree._tree.postorder_internal_node_iter():
         # my label is the intersection of my children,
         # unless the intersection is empty, in which case it is the union
         if not hasattr(node, "alignment_subset_job") or node.alignment_subset_job is None:
             node.alignment_subset_job = set.intersection(*[c.alignment_subset_job for c in node.child_nodes()])
             if not node.alignment_subset_job:
                 node.alignment_subset_job = set.union(*[c.alignment_subset_job for c in node.child_nodes()])
         # Now go ahead and prune any child whose label encompasses my label.
         # Use indexing instead of iteration, because with each collapse,
         # new children can be added, and we want to process them as well.
         i = 0;
         while i < len(node.child_nodes()):
             c = node.child_nodes()[i]
             if node.alignment_subset_job.issubset(c.alignment_subset_job):
                 # Dendropy does not collapse an edge that leads to a tip. Remove it instead.
                 # (i is deliberately not advanced: the child list has changed.)
                 if c.child_nodes():
                     c.edge.collapse()
                 else:
                     node.remove_child(c)
             else:
                 i += 1

     # Now, the remaining edges have multiple labels. These need to
     # be further resolved. Do it by minimum length
     #   First find all candidate edges that we might want to contract
     candidate_edges = set()
     for e in subsets_tree._tree.postorder_edge_iter():
         if e.tail_node and e.head_node.alignment_subset_job.intersection(e.tail_node.alignment_subset_job):
             candidate_edges.add( (e.length,e) )
     #   Then sort the edges, and start removing them one by one
     #   only if an edge is still having intersecting labels at the two ends
     # NOTE(review): sorting (length, edge) tuples compares the edge objects
     # whenever two lengths are equal -- confirm edges are orderable on the
     # targeted Python version.
     candidate_edges = sorted(candidate_edges)
     for (el, edge) in candidate_edges:
         I = edge.tail_node.alignment_subset_job.intersection(edge.head_node.alignment_subset_job)
         if I:
             # The parent keeps only the shared jobs; the child end is merged away.
             edge.tail_node.alignment_subset_job = I
             if edge.head_node.child_nodes():
                 edge.collapse()
             else:
                 edge.tail_node.remove_child(edge.head_node)
     # Make sure the tree is correct, remove the actual jobs
     # from nodes (can cause deep-copy problems), assign a label to each
     # node, and keep a mapping between the labels and actual alignment job objects
     self.pasta_team.subsets = {} # Let this now map from subset labels to the actual alignment jobs
     for node in subsets_tree._tree.postorder_node_iter():
         assert len(node.alignment_subset_job) == 1
         nalsj = node.alignment_subset_job.pop()
         node.alignment_subset_job = None
         node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]
         self.pasta_team.subsets[node.label] = nalsj
         if node.is_leaf():
             # Add a dummy taxon, or else dendropy can get confused
             node.taxon = Taxon(label=node.label)
     #subsets_tree._tree.infer_taxa()
     return subsets_tree
예제 #11
0
    def launch_alignment(self, context_str=None):
        '''Set up the merge jobs for the subsets tree held in ``self.tree``.

        If the tree has exactly two nodes, the two subset jobs looked up in
        ``self.pasta_team.subsets`` (by node label) become ``subjob1`` and
        ``subjob2`` and are linked to this job as children.

        Otherwise ``skip_merge`` is set, the tree is rerooted at a node on
        the centroid edge, and for every child of the new root a
        ``PASTAMergerJob`` is created on a copy of the tree that keeps only
        that child.  Those jobs are stored in ``self.merge_job_list``, each
        has ``launch_alignment()`` called on it, and finally
        ``self._merge_queued_event`` is set.

        ``context_str`` is stored on the job (``None`` becomes ``''``).

        Raises ``RuntimeError`` whenever ``self.killed`` is found to be set.
        '''
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")

        self._reset_jobs()
        self.context_str = context_str
        if self.context_str is None:
            self.context_str = ''
        node_count = self.tree.count_nodes()
        _LOG.debug("Recursive merge on a branch with %d subsets" % (node_count))
        # Only the first 200 characters of the Newick string are logged.
        prefix = "subsets tree: %s" %self.tree.compose_newick()[0:200]
        if node_count == 2:
            # Base case: merge the two subsets directly.
            nodes = self.tree._tree.nodes()
            _LOG.debug("%s ... pairwise merge " % prefix)
            self.skip_merge = False
            self.subjob1 = self.pasta_team.subsets[nodes[0].label]
            self.subjob2 = self.pasta_team.subsets[nodes[1].label]

            self.subjob1.add_parent(self)
            self.add_child(self.subjob1)

            self.subjob2.add_parent(self)
            self.add_child(self.subjob2)
        else:
            _LOG.debug("%s ... recursing further " % prefix)
            self.skip_merge = True

            # Reroot near centroid edge: use the head node of the edge
            # unless it is a leaf, in which case use the tail node.
            ce = self.tree.get_centroid_edge(spanning=True)
            nr = ce.head_node if not ce.head_node.is_leaf() else ce.tail_node
            self.tree._tree.reroot_at_node(nr,delete_outdegree_one=False)
            _LOG.debug("rerooted to: %s ..." % self.tree.compose_newick()[0:200])
            # For each path from root to its children, create a new merge job
            merge_job_list = []
            nr = self.tree._tree.seed_node
            children = nr.child_nodes()
            for keepchild in children:
                # Temporarily detach every other child, copy the tree, then
                # put the detached children back in reverse removal order.
                remchilds = []
                for remchild in children:
                    if remchild != keepchild:
                        remchilds.append(nr.reversible_remove_child(remchild, suppress_deg_two=False))
                t1 = PhylogeneticTree(Tree(self.tree._tree))
                remchilds.reverse()
                for child in remchilds:
                    nr.reinsert_nodes(child)
                _LOG.debug("child = %s ..." % t1.compose_newick()[0:200])
                multilocus_dataset1 = self.multilocus_dataset.new_with_shared_meta()

                # A two-node child tree gets a pairwise temp dir; larger
                # ones reuse this job's base temp dir.
                if t1.count_nodes() == 2:
                    ns = t1._tree.nodes()
                    tmp_dir_par = self.get_pairwise_temp_dir(ns[0].label, ns[1].label)
                else:
                    tmp_dir_par = self.tmp_base_dir
                configuration = self.configuration()
                cj = PASTAMergerJob(multilocus_dataset=multilocus_dataset1,
                                    pasta_team=self.pasta_team,
                                    tree=t1,
                                    tmp_base_dir=self.tmp_base_dir,
                                    tmp_dir_par= tmp_dir_par,
                                    delete_temps2=False,
                                    **configuration)
                cj.add_parent(self)
                self.add_child(cj)
                merge_job_list.append(cj);

            self.merge_job_list = merge_job_list

            # now launch these new merge jobs
            for merge_job in self.merge_job_list:
                if self.killed:
                    raise RuntimeError("PastaAligner Job killed")
                merge_job.launch_alignment()

            self._merge_queued_event.set()

            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
        return
예제 #12
0
파일: pastajob.py 프로젝트: xieduo7/pasta
    def build_subsets_tree(self, curr_tmp_dir_par, build_min_tree=True):
        """Build a tree whose nodes each correspond to one alignment subset job.

        A subset name is the part of a job's ``tmp_dir_par`` that follows
        ``curr_tmp_dir_par`` (plus one separator character).

        With ``build_min_tree`` true (the default) the tree comes from
        ``build_groups_MST`` applied to the underlying tree and a mapping
        from leaf label to subset name, and ``self.pasta_team.subsets`` is
        replaced by a dict from subset name to job.

        Otherwise a heuristic is used: the tree is re-read from
        ``self.tree_str``, leaves are tagged with their job, internal nodes
        are labelled from their children, redundant edges are collapsed or
        removed, and edges whose two ends still share a job are contracted
        in order of increasing length.  ``self.pasta_team.subsets`` is then
        replaced by a dict from node label to job.

        Returns a ``PhylogeneticTree``.  In the heuristic branch an
        ``assert`` requires one job per remaining node, and ``Exception`` is
        raised if two nodes end up with the same label.
        """
        # uym2 added: add option for MST
        if build_min_tree:
            _LOG.debug("START building Minimum Spanning Tree")
            # grouping:          leaf label -> subset name with "/" removed
            # groupName2jobName: subset name (with "/") -> alignment job
            grouping = {}
            groupName2jobName = {}

            for node in self.tree._tree.leaf_node_iter():
                groupName = self.pasta_team.subsets[
                    node.taxon.label].tmp_dir_par[len(curr_tmp_dir_par) + 1:]
                grouping[node.taxon.label] = groupName.replace("/", "")
                groupName2jobName[groupName] = self.pasta_team.subsets[
                    node.taxon.label]

            subsets_tree = build_groups_MST(self.tree._tree, grouping)

            # presumably restores the "/" stripped above so labels match the
            # keys of groupName2jobName -- this relies on every "d" in a
            # name being preceded by "/"; TODO confirm
            for node in subsets_tree.postorder_node_iter():
                if node.is_leaf():
                    node.taxon.label = node.taxon.label.replace("d", "/d")
                node.label = node.label.replace("d", "/d")

            self.pasta_team.subsets = groupName2jobName
            MST = PhylogeneticTree(subsets_tree)
            _LOG.debug("Spanning tree is:\n %s" % MST)
            return MST

    ###################################

        _LOG.debug("START building heuristic spanning tree")

        # translate: original leaf label -> subset name
        # t2:        subset name -> one-element set holding the alignment job
        translate = {}
        t2 = {}
        for node in self.tree._tree.leaf_node_iter():
            nalsj = self.pasta_team.subsets[node.taxon.label]
            newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
            translate[node.taxon.label] = newname
            t2[newname] = set([nalsj])
        subsets_tree = PhylogeneticTree(
            Tree.get(data=self.tree_str, schema='newick'))
        for node in subsets_tree._tree.leaf_node_iter():
            node.alignment_subset_job = t2[translate[node.taxon.label]]
            #node.alignment_subset_job = t2[node.taxon]
        del t2
        del translate
        _LOG.debug("leafs labeled")
        #subsets_tree._tree.infer_taxa()
        #_LOG.debug("fake taxa inferred")
        #Then make sure the tree is rooted at a branch (not at a node).
        if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
            # Pick the first root child reached by an internal edge; if
            # there is none, c is left as the last child.
            for c in subsets_tree._tree.seed_node.child_nodes():
                if c.edge.is_internal():
                    break
            subsets_tree._tree.is_rooted = True
            # The chosen edge's length is split evenly between the two
            # sides of the new root.
            subsets_tree._tree.reroot_at_edge(c.edge,
                                              length1=c.edge.length / 2.,
                                              length2=c.edge.length / 2.,
                                              suppress_unifurcations=False)
        _LOG.debug(
            "Subset Labeling (start):\n%s" %
            str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])
        #_LOG.debug("Subset Labeling (start):\n%s" %str(len(subsets_tree._tree.seed_node.child_nodes())))
        # Then label internal branches based on their children, and collapse redundant edges.
        for node in subsets_tree._tree.postorder_internal_node_iter():
            # my label is the intersection of my children,
            # unless the intersection is empty, in which case it is the union
            if not hasattr(node, "alignment_subset_job"
                           ) or node.alignment_subset_job is None:
                node.alignment_subset_job = set.intersection(
                    *[c.alignment_subset_job for c in node.child_nodes()])
                if not node.alignment_subset_job:
                    node.alignment_subset_job = set.union(
                        *[c.alignment_subset_job for c in node.child_nodes()])
            # Now go ahead and prune any child whose label encompasses my label.
            # Use indexing instead of iteration, because with each collapse,
            # new children can be added, and we want to process them as well.
            i = 0
            while i < len(node.child_nodes()):
                c = node.child_nodes()[i]
                if node.alignment_subset_job.issubset(c.alignment_subset_job):
                    # Dendropy does not collapse an edge that leads to a tip. Remove it instead.
                    # (i is deliberately not advanced: the child list has changed.)
                    if c.child_nodes():
                        c.edge.collapse()
                    else:
                        node.remove_child(c)
                else:
                    i += 1

            # Interim label: the subset names of all jobs on this node,
            # joined with "+".
            node.label = "+".join(nj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
                                  for nj in node.alignment_subset_job)
            # A node that lost all its children above is now a leaf and
            # needs a taxon of its own.
            if node.is_leaf():
                node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                    label=node.label)

        _LOG.debug(
            "Before final round, the tree is:\n %s" %
            str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])
        # Now, the remaining edges have multiple labels. These need to
        # be further resolved. Do it by minimum length
        #   First find all candidate edges that we might want to contract
        candidate_edges = set()
        for e in subsets_tree._tree.postorder_edge_iter():
            if e.tail_node and e.head_node.alignment_subset_job.intersection(
                    e.tail_node.alignment_subset_job):
                candidate_edges.add((e.length, e))
        #   Then sort the edges, and start removing them one by one
        #   only if an edge is still having intersecting labels at the two ends
        # Edges with a falsy length (None or 0) sort first, keyed as -1.
        candidate_edges = sorted(candidate_edges,
                                 key=lambda x: x[0] if x[0] else -1)
        for (el, edge) in candidate_edges:
            I = edge.tail_node.alignment_subset_job.intersection(
                edge.head_node.alignment_subset_job)
            if I:
                # The parent keeps only the shared jobs; the child end is merged away.
                edge.tail_node.alignment_subset_job = I
                if edge.head_node.child_nodes():
                    #edge.collapse(adjust_collapsed_head_children_edge_lengths=True)
                    edge.collapse()
                else:
                    edge.tail_node.remove_child(edge.head_node)
        # Make sure the tree is correct, remove the actual jobs
        # from nodes (can cause deep-copy problems), assign a label to each
        # node, and keep a mapping between the labels and actual alignment job objects
        self.pasta_team.subsets = {
        }  # Let this now map from subset labels to the actual alignment jobs
        for node in subsets_tree._tree.postorder_node_iter():
            assert len(node.alignment_subset_job) == 1
            nalsj = node.alignment_subset_job.pop()
            node.alignment_subset_job = None
            node.label = nalsj.tmp_dir_par[
                len(curr_tmp_dir_par) + 1:]  #only find last part of the name
            self.pasta_team.subsets[node.label] = nalsj
            if node.is_leaf():
                # Add a dummy taxon, or else dendropy can get confused
                node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                    label=node.label)
        #subsets_tree._tree.infer_taxa()
        _LOG.debug("Spanning tree is:\n %s" % subsets_tree)
        # Every node must have ended up with a distinct label.
        labels = [nd.label for nd in subsets_tree._tree.postorder_node_iter()]
        if len(set(labels)) != len(labels):
            import collections
            raise Exception("Duplicate names found %s" % "\n".join(
                item for item, count in collections.Counter(labels).items()
                if count > 1))

        return subsets_tree
예제 #13
0
파일: pastajob.py 프로젝트: smirarab/pasta
    def build_subsets_tree(self, curr_tmp_dir_par,build_min_tree=True):
        """Build a tree whose nodes each correspond to one alignment subset job.

        A subset name is the part of a job's ``tmp_dir_par`` that follows
        ``curr_tmp_dir_par`` (plus one separator character).

        With ``build_min_tree`` true (the default) the tree comes from
        ``build_groups_MST`` applied to the underlying tree and a mapping
        from leaf label to subset name, and ``self.pasta_team.subsets`` is
        replaced by a dict from subset name to job.

        Otherwise a heuristic is used: the tree is re-read from
        ``self.tree_str``, leaves are tagged with their job, internal nodes
        are labelled from their children, redundant edges are collapsed or
        removed, and edges whose two ends still share a job are contracted
        in order of increasing length.  ``self.pasta_team.subsets`` is then
        replaced by a dict from node label to job.

        Returns a ``PhylogeneticTree``.  In the heuristic branch an
        ``assert`` requires one job per remaining node, and ``Exception`` is
        raised if two nodes end up with the same label.
        """
    # uym2 added: add option for MST
        if build_min_tree:
            _LOG.debug("START building Minimum Spanning Tree")
            # grouping:          leaf label -> subset name with "/" removed
            # groupName2jobName: subset name (with "/") -> alignment job
            grouping = {}
            groupName2jobName = {}

            for node in self.tree._tree.leaf_node_iter():
                groupName = self.pasta_team.subsets[node.taxon.label].tmp_dir_par[len(curr_tmp_dir_par)+1:]
                grouping[node.taxon.label] = groupName.replace("/","")
                groupName2jobName[groupName] = self.pasta_team.subsets[node.taxon.label]

            subsets_tree = build_groups_MST(self.tree._tree,grouping)

            # presumably restores the "/" stripped above so labels match the
            # keys of groupName2jobName -- this relies on every "d" in a
            # name being preceded by "/"; TODO confirm
            for node in subsets_tree.postorder_node_iter():
               if node.is_leaf():
                   node.taxon.label = node.taxon.label.replace("d","/d")
               node.label = node.label.replace("d","/d")

            self.pasta_team.subsets = groupName2jobName
            MST = PhylogeneticTree(subsets_tree)
            _LOG.debug("Spanning tree is:\n %s" %MST)
            return MST
    ###################################


        _LOG.debug("START building heuristic spanning tree")

        # translate: original leaf label -> subset name
        # t2:        subset name -> one-element set holding the alignment job
        translate={}
        t2 = {}
        for node in self.tree._tree.leaf_node_iter():
            nalsj = self.pasta_team.subsets[node.taxon.label]
            newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]
            translate[node.taxon.label] = newname
            t2[newname] = set([nalsj])
        subsets_tree = PhylogeneticTree(Tree.get(data=self.tree_str,schema='newick'))
        for node in subsets_tree._tree.leaf_node_iter():
            node.alignment_subset_job = t2[translate[node.taxon.label]]
            #node.alignment_subset_job = t2[node.taxon]
        del t2
        del translate
        _LOG.debug("leafs labeled")
        #subsets_tree._tree.infer_taxa()
        #_LOG.debug("fake taxa inferred")
        #Then make sure the tree is rooted at a branch (not at a node).
        if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
            # Pick the first root child reached by an internal edge; if
            # there is none, c is left as the last child.
            for c in subsets_tree._tree.seed_node.child_nodes():
                if c.edge.is_internal():
                    break
            subsets_tree._tree.is_rooted = True
            # The chosen edge's length is split evenly between the two
            # sides of the new root.
            subsets_tree._tree.reroot_at_edge(c.edge,length1=c.edge.length/2.,
                                              length2=c.edge.length/2., suppress_unifurcations=False)
        _LOG.debug("Subset Labeling (start):\n%s" %str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])
        #_LOG.debug("Subset Labeling (start):\n%s" %str(len(subsets_tree._tree.seed_node.child_nodes())))
        # Then label internal branches based on their children, and collapse redundant edges.
        for node in subsets_tree._tree.postorder_internal_node_iter():
            # my label is the intersection of my children,
            # unless the intersection is empty, in which case it is the union
            if not hasattr(node, "alignment_subset_job") or node.alignment_subset_job is None:
                node.alignment_subset_job = set.intersection(*[c.alignment_subset_job for c in node.child_nodes()])
                if not node.alignment_subset_job:
                    node.alignment_subset_job = set.union(*[c.alignment_subset_job for c in node.child_nodes()])
            # Now go ahead and prune any child whose label encompasses my label.
            # Use indexing instead of iteration, because with each collapse,
            # new children can be added, and we want to process them as well.
            i = 0;
            while i < len(node.child_nodes()):
                c = node.child_nodes()[i]
                if node.alignment_subset_job.issubset(c.alignment_subset_job):
                    # Dendropy does not collapse an edge that leads to a tip. Remove it instead.
                    # (i is deliberately not advanced: the child list has changed.)
                    if c.child_nodes():
                        c.edge.collapse()
                    else:
                        node.remove_child(c)
                else:
                    i += 1

            # Interim label: the subset names of all jobs on this node,
            # joined with "+".
            node.label = "+".join(nj.tmp_dir_par[len(curr_tmp_dir_par)+1:] for nj in node.alignment_subset_job)
            # A node that lost all its children above is now a leaf and
            # needs a taxon of its own.
            if node.is_leaf():
                node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(label=node.label)

        _LOG.debug("Before final round, the tree is:\n %s" %str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])
        # Now, the remaining edges have multiple labels. These need to
        # be further resolved. Do it by minimum length
        #   First find all candidate edges that we might want to contract
        candidate_edges = set()
        for e in subsets_tree._tree.postorder_edge_iter():
            if e.tail_node and e.head_node.alignment_subset_job.intersection(e.tail_node.alignment_subset_job):
                candidate_edges.add( (e.length,e) )
        #   Then sort the edges, and start removing them one by one
        #   only if an edge is still having intersecting labels at the two ends
        # Edges with a falsy length (None or 0) sort first, keyed as -1.
        candidate_edges = sorted(candidate_edges, key=lambda x:  x[0] if x[0] else -1)
        for (el, edge) in candidate_edges:
            I = edge.tail_node.alignment_subset_job.intersection(edge.head_node.alignment_subset_job)
            if I:
                # The parent keeps only the shared jobs; the child end is merged away.
                edge.tail_node.alignment_subset_job = I
                if edge.head_node.child_nodes():
                    #edge.collapse(adjust_collapsed_head_children_edge_lengths=True)
                    edge.collapse()
                else:
                    edge.tail_node.remove_child(edge.head_node)
        # Make sure the tree is correct, remove the actual jobs
        # from nodes (can cause deep-copy problems), assign a label to each
        # node, and keep a mapping between the labels and actual alignment job objects
        self.pasta_team.subsets = {} # Let this now map from subset labels to the actual alignment jobs
        for node in subsets_tree._tree.postorder_node_iter():
            assert len(node.alignment_subset_job) == 1
            nalsj = node.alignment_subset_job.pop()
            node.alignment_subset_job = None
            node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]#only find last part of the name
            self.pasta_team.subsets[node.label] = nalsj
            if node.is_leaf():
                # Add a dummy taxon, or else dendropy can get confused
                node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(label=node.label)
        #subsets_tree._tree.infer_taxa()
        _LOG.debug("Spanning tree is:\n %s" %subsets_tree)
        # Every node must have ended up with a distinct label.
        labels = [nd.label for nd in subsets_tree._tree.postorder_node_iter()]
        if len(set(labels)) != len(labels):
            import collections
            raise Exception("Duplicate names found %s" %"\n".join
                   (item for item, count in
                    collections.Counter(labels).items() if count > 1))

        return subsets_tree
예제 #14
0
def bisect_tree(tree,
                breaking_edge_style='mincluster',
                breaking_constraint='nleaves',
                min_size=0,
                max_size=None,
                max_diam=None,
                max_brlen=None):
    """Partition 'tree' into two parts.

    Args:
        tree: tree wrapper exposing ``_tree``, ``n_leaves``, ``sum_brlen()``,
            ``get_breaking_edge()`` and ``bipartition_by_edge()``.
        breaking_edge_style: decomposition strategy; one of ``'midpoint'``,
            ``'mincluster'`` or ``'centroid'``.
        breaking_constraint: ``'nleaves'`` or ``'brlen'``. For
            ``'mincluster'`` it selects which bisect helper is called; for
            ``'centroid'`` it only enables extra branch-length logging.
        min_size: forwarded to ``midpoint_bisect`` (``'midpoint'`` only).
        max_size: forwarded to ``min_cluster_size_bisect``
            (``'mincluster'`` with ``'nleaves'``).
        max_diam: accepted for interface compatibility; not used here.
        max_brlen: forwarded to ``min_cluster_brlen_bisect``
            (``'mincluster'`` with ``'brlen'``).

    Returns:
        A ``(tree1, tree2)`` pair. With ``'midpoint'`` the pair is
        ``(None, None)`` when either half is ``None``; with ``'mincluster'``
        each element is ``None`` when the helper returned a falsy half.

    Raises:
        ValueError: if ``breaking_edge_style`` is not a supported strategy, or
            if ``breaking_constraint`` is not supported by ``'mincluster'``.
    """
    _LOG.debug("bisecting tree...")
    # uym2: midpoint decomposition (not in use for now)
    if breaking_edge_style == 'midpoint':
        _LOG.debug("breaking by midpoint")
        t1, t2 = midpoint_bisect(tree._tree, min_size=min_size)
        if t1 is None or t2 is None:
            return None, None
        tree1 = PhylogeneticTree(t1)
        tree2 = PhylogeneticTree(t2)
        return tree1, tree2

    # uym2 (2019): min_cluster decomposition
    if breaking_edge_style == 'mincluster':
        _LOG.debug("breaking using min-cluster strategy")

        if breaking_constraint == 'nleaves':
            _LOG.debug(
                "constraining on the maximum number of leaves each subtree can have"
            )
            t1, t2 = min_cluster_size_bisect(tree._tree, max_size)
        elif breaking_constraint == 'brlen':
            _LOG.debug(
                "constraining on the maximum sum of branch lengths each subtree can have"
            )
            t1, t2 = min_cluster_brlen_bisect(tree._tree, max_brlen)
        else:
            # Previously t1/t2 stayed unbound here and the function died with
            # an UnboundLocalError a few lines below; fail with a clear message.
            raise ValueError(
                'Unknown breaking_constraint "%s" for mincluster '
                '(expected "nleaves" or "brlen")' % breaking_constraint)

        tree1 = PhylogeneticTree(t1) if t1 else None
        tree2 = PhylogeneticTree(t2) if t2 else None
        return tree1, tree2

    if breaking_edge_style == 'centroid':
        _LOG.debug("breaking recursively at centroid edge")
        e = tree.get_breaking_edge(breaking_edge_style)
        _LOG.debug("breaking_edge length = %s, %s" %
                   (e.length, breaking_edge_style))

        # Remember the leaf count so the split can be sanity-checked below.
        snl = tree.n_leaves

        if breaking_constraint == 'brlen':
            # NOTE(review): sbrlen is computed but never read afterwards; the
            # call is kept as-is in case sum_brlen() matters to the caller.
            sbrlen = tree.sum_brlen()

        tree1, tree2 = tree.bipartition_by_edge(e)
        _LOG.debug("Tree 1 has %s nodes, tree 2 has %s nodes" %
                   (tree1.n_leaves, tree2.n_leaves))

        if breaking_constraint == 'brlen':
            _LOG.debug(
                "Tree 1 has %s total edge length, tree 2 has %s total edge length"
                % (tree1.sum_brlen(), tree2.sum_brlen()))

        # Every leaf must end up in exactly one of the two halves.
        assert snl == tree1.n_leaves + tree2.n_leaves
        return tree1, tree2

    # Previously an unrecognised style fell off the end and returned a bare
    # None, which callers unpacking two values cannot handle.
    raise ValueError(
        'Unknown breaking_edge_style "%s" '
        '(expected "midpoint", "mincluster" or "centroid")' %
        breaking_edge_style)
예제 #15
0
    def launch_alignment(self, context_str=None):
        """Prepare the merge jobs for this job's subsets tree and launch them.

        If the subsets tree has exactly two nodes, the two matching subset
        jobs from ``self.pasta_team.subsets`` are attached as children of this
        job for a pairwise merge. Otherwise the tree is rerooted next to its
        centroid edge, one child ``PASTAMergerJob`` is created for each child
        of the new root, and each child job is launched recursively.

        Args:
            context_str: stored on ``self.context_str``; ``None`` becomes ``''``.

        Raises:
            RuntimeError: if ``self.killed`` is set when checked.
        """
        if self.killed:
            raise RuntimeError("PastaAligner Job killed")

        self._reset_jobs()
        self.context_str = context_str
        if self.context_str is None:
            self.context_str = ''

        n_nodes = self.tree.count_nodes()
        _LOG.debug("Recursive merge on a branch with %d subsets" % (n_nodes))
        log_prefix = "subsets tree: %s" % self.tree.compose_newick()[0:200]

        # Base case: two subsets left, so merge them directly.
        if n_nodes == 2:
            pair = self.tree._tree.nodes()
            _LOG.debug("%s ... pairwise merge " % log_prefix)
            self.skip_merge = False
            self.subjob1 = self.pasta_team.subsets[pair[0].label]
            self.subjob2 = self.pasta_team.subsets[pair[1].label]
            for subjob in (self.subjob1, self.subjob2):
                subjob.add_parent(self)
                self.add_child(subjob)
            return

        _LOG.debug("%s ... recursing further " % log_prefix)
        self.skip_merge = True

        # Reroot near the centroid edge, choosing its non-leaf end when possible.
        centroid = self.tree.get_centroid_edge(spanning=True)
        if centroid.head_node.is_leaf():
            new_root = centroid.tail_node
        else:
            new_root = centroid.head_node
        self.tree._tree.reroot_at_node(new_root, suppress_unifurcations=False)
        _LOG.debug("rerooted to: %s ..." % self.tree.compose_newick()[0:200])

        # One merge job per root child: temporarily detach the siblings,
        # snapshot the remaining tree, then put the siblings back.
        root = self.tree._tree.seed_node
        root_children = root.child_nodes()
        spawned = []
        for kept in root_children:
            detached = [
                root.reversible_remove_child(other,
                                             suppress_unifurcations=False)
                for other in root_children
                if other != kept
            ]
            subtree = PhylogeneticTree(Tree(self.tree._tree))
            # Reinsert in the opposite order to the removals.
            for removed in reversed(detached):
                root.reinsert_nodes(removed)
            _LOG.debug("child = %s ..." % subtree.compose_newick()[0:200])
            shared_dataset = self.multilocus_dataset.new_with_shared_meta()

            if subtree.count_nodes() == 2:
                ends = subtree._tree.nodes()
                job_dir = self.get_pairwise_temp_dir(ends[0].label,
                                                     ends[1].label)
            else:
                job_dir = self.tmp_base_dir
            job_config = self.configuration()
            child_job = PASTAMergerJob(multilocus_dataset=shared_dataset,
                                       pasta_team=self.pasta_team,
                                       tree=subtree,
                                       tmp_base_dir=self.tmp_base_dir,
                                       tmp_dir_par=job_dir,
                                       delete_temps2=False,
                                       **job_config)
            child_job.add_parent(self)
            self.add_child(child_job)
            spawned.append(child_job)

        self.merge_job_list = spawned

        # Now launch the newly created merge jobs.
        for job in self.merge_job_list:
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            job.launch_alignment()

        self._merge_queued_event.set()

        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
예제 #16
0

# Script entry point: read an alignment and a tree from a data directory,
# decompose the tree, and build one mafft command line per resulting subtree.
# Positional arguments (argv is presumably sys.argv, imported elsewhere -- confirm):
#   1: main data directory, 2: number of processes,
#   3: max_size and 4: min_size passed to decompose_phylogeny.
if __name__ == '__main__':
    # main_dir = '/Users/esayyari/UCSD/oasis/viruses/all/corona_overall.April7/'
    main_dir = argv[1]
    # NOTE(review): num_processes is parsed but not used in the visible part
    # of this script.
    num_processes = int(argv[2])
    max_size = int(argv[3])
    min_size = int(argv[4])

    # Full alignment; only used here to report the number of taxa.
    ca = CompactAlignment()
    ca.read_filepath(join(main_dir, 'dna-sequences.fasta'))
    print("The total number of taxa is", ca.get_num_taxa())
    # Newick tree read from <main_dir>/sequences/fastme_tree.nwk.
    tree = dendropy.Tree.get(path=join(main_dir, 'sequences',
                                       'fastme_tree.nwk'),
                             schema="newick")
    phy = PhylogeneticTree(tree)
    # Copy taken before decomposition (presumably because decompose_phylogeny
    # modifies phy -- confirm); not used in the visible part of this script.
    orig_phy = deepcopy(phy)
    trees_map = decompose_phylogeny(phy, max_size=max_size, min_size=min_size)
    # Second alignment read from dna-sequences.core.fasta; its keys are
    # collected into IDs.
    core_ca = CompactAlignment()
    core_ca.read_filepath(join(main_dir, 'dna-sequences.core.fasta'))
    IDs = list(core_ca.keys())
    tmp_dir = join(main_dir, 'sub_process')
    # Build one mafft command per entry of trees_map. Input files are numbered
    # from 1: <tmp_dir>/dna-sequences.<i>.fa. The commands are only collected
    # here; nothing in the visible part executes them.
    i = 0
    commands = []
    for tmp_tre in trees_map:
        i += 1
        command = [
            'mafft', '--reorder', '--nomemsave', '--thread', '1', '--auto',
            join(tmp_dir, 'dna-sequences.' + str(i) + '.fa')
        ]
        commands.append(command)
예제 #17
0
    def build_subsets_tree(self, curr_tmp_dir_par):
        """Build a tree in which every node stands for one alignment subset job.

        The tree in ``self.tree_str`` is re-read with its leaf names translated
        to subset labels, and each leaf is tagged with the set holding its
        alignment job. Internal nodes are then labelled from their children,
        children whose label covers the parent's label are collapsed or
        removed, and the remaining edges whose two ends still share a job are
        contracted in order of increasing length. Afterwards every node is
        asserted to carry exactly one job.

        Side effect: ``self.pasta_team.subsets`` is replaced by a dict mapping
        each node label to its alignment job.

        Args:
            curr_tmp_dir_par: path prefix; the first
                ``len(curr_tmp_dir_par) + 1`` characters of each job's
                ``tmp_dir_par`` are stripped to obtain the subset label.

        Returns:
            The ``PhylogeneticTree`` of subsets. Job objects are removed from
            its nodes; each node carries a ``label`` instead.
        """
        translate = {}
        t2 = {}
        for node in self.tree._tree.leaf_iter():
            nalsj = self.pasta_team.subsets[node.taxon.label]
            newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
            translate[node.taxon.label] = newname
            t2[newname] = {nalsj}
        subsets_tree = PhylogeneticTree(
            read_newick_with_translate(StringIO(self.tree_str),
                                       translate_dict=translate))
        for node in subsets_tree._tree.leaf_iter():
            node.alignment_subset_job = t2[node.taxon]
        del t2
        del translate
        _LOG.debug("nodes labeled")
        #subsets_tree._tree.infer_taxa()
        #_LOG.debug("fake taxa inferred")
        # Then make sure the tree is rooted at a branch (not at a node).
        if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
            subsets_tree._tree.reroot_at_edge(
                subsets_tree._tree.seed_node.child_nodes()[0].edge)
        _LOG.debug("Subset Labeling (start):\n%s" %
                   str(subsets_tree.compose_newick()))
        # Then label internal branches based on their children, and collapse
        # redundant edges.
        for node in subsets_tree._tree.postorder_internal_node_iter():
            # my label is the intersection of my children,
            # unless the intersection is empty, in which case it is the union
            if not hasattr(node, "alignment_subset_job"
                           ) or node.alignment_subset_job is None:
                node.alignment_subset_job = set.intersection(
                    *[c.alignment_subset_job for c in node.child_nodes()])
                if not node.alignment_subset_job:
                    node.alignment_subset_job = set.union(
                        *[c.alignment_subset_job for c in node.child_nodes()])
            # Now go ahead and prune any child whose label encompasses my label.
            # Use indexing instead of iteration, because with each collapse,
            # new children can be added, and we want to process them as well.
            i = 0
            while i < len(node.child_nodes()):
                c = node.child_nodes()[i]
                if node.alignment_subset_job.issubset(c.alignment_subset_job):
                    # An edge that leads to a tip is not collapsed; the tip is
                    # removed from its parent instead.
                    if c.child_nodes():
                        c.edge.collapse()
                    else:
                        node.remove_child(c)
                else:
                    i += 1

        # Now, the remaining edges have multiple labels. These need to
        # be further resolved. Do it by minimum length
        #   First find all candidate edges that we might want to contract
        candidate_edges = set()
        for e in subsets_tree._tree.postorder_edge_iter():
            if e.tail_node and e.head_node.alignment_subset_job.intersection(
                    e.tail_node.alignment_subset_job):
                candidate_edges.add((e.length, e))
        #   Then sort the edges, and start removing them one by one
        #   only if an edge is still having intersecting labels at the two ends.
        #   Sort on the length alone: comparing whole (length, edge) tuples
        #   falls back to comparing Edge objects on ties, and a missing (None)
        #   length cannot be ordered against a number in Python 3. Missing
        #   or zero lengths sort first.
        candidate_edges = sorted(candidate_edges,
                                 key=lambda item: item[0] if item[0] else -1)
        for (el, edge) in candidate_edges:
            I = edge.tail_node.alignment_subset_job.intersection(
                edge.head_node.alignment_subset_job)
            if I:
                edge.tail_node.alignment_subset_job = I
                if edge.head_node.child_nodes():
                    edge.collapse()
                else:
                    edge.tail_node.remove_child(edge.head_node)
        # Make sure the tree is correct, remove the actual jobs
        # from nodes (can cause deep-copy problems), assign a label to each
        # node, and keep a mapping between the labels and actual alignment job objects
        # Let this now map from subset labels to the actual alignment jobs
        self.pasta_team.subsets = {}
        for node in subsets_tree._tree.postorder_node_iter():
            assert len(node.alignment_subset_job) == 1
            nalsj = node.alignment_subset_job.pop()
            node.alignment_subset_job = None
            # Only the last part of the job's directory name is used as label.
            node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
            self.pasta_team.subsets[node.label] = nalsj
            if node.is_leaf():
                # Add a dummy taxon, or else dendropy can get confused
                node.taxon = Taxon(label=node.label)
        #subsets_tree._tree.infer_taxa()
        return subsets_tree