def generate_tree_with_splits_from_tree(t, force_fully_resolved=False):
    """Wrap ``t`` in a PhylogeneticTree and compute its splits.

    When ``force_fully_resolved`` is true, ``resolve_polytomies`` is applied
    to ``t`` (with ``update_splits=False``) before it is wrapped.

    Returns the PhylogeneticTree after ``calc_splits()`` has been called.
    """
    if force_fully_resolved:
        resolve_polytomies(t, update_splits=False)

    wrapped = PhylogeneticTree(t)

    _LOG.debug("calculating splits")
    wrapped.calc_splits()
    _LOG.debug("end generating tree from string")

    return wrapped
def phylogeneticTreeFromFile(self, treefile, file_format):
    """Load the first tree stored in ``treefile`` as a PhylogeneticTree.

    Parameters:
        treefile: path of the tree file to read.
        file_format: schema name handed to ``Dataset.read``.

    Returns:
        A PhylogeneticTree built from ``dataset.tree_lists[0][0]``, with
        splits calculated and ``delete_outdegree_one`` applied to its
        underlying tree.
    """
    dataset = Dataset()
    # Use a context manager so the handle is always closed.  The 'U' flag is
    # dropped because it was removed in Python 3.11; text mode already
    # performs universal-newline translation on Python 3.
    with open(treefile, 'r') as tree_stream:
        dataset.read(tree_stream, schema=file_format)
    dendropy_tree = dataset.tree_lists[0][0]
    tree = PhylogeneticTree(dendropy_tree)
    tree.calc_splits()
    delete_outdegree_one(tree._tree)
    return tree
def main(args):
    """Write the labels of the leaves nearest each end of the centroid edge.

    The input tree is read from ``args.input_tree_file`` (newick), fully
    resolved, and its centroid edge is located.  The tree copy is rerooted
    at the head node and then at the tail node of that edge; after each
    rerooting the leaves are ranked by ``distance_from_root()`` and the
    labels of the closest ones are collected.  All collected labels are
    written, one per line, to ``args.output + "/X.lab"``.
    """
    n_per_side = 25      # number of labels taken for each end of the edge
    weight_factor = 200  # centroid edge is stretched to this multiple of the longest edge

    # Step 1: Decompose tree
    tree = dendropy.Tree.get(path=args.input_tree_file, schema="newick")
    tree.resolve_polytomies(limit=2, update_bipartitions=True)
    phy_tree = PhylogeneticTree(tree)
    centroid_edge = phy_tree.get_centroid_edge()

    dd_tree = copy.deepcopy(tree)
    max_e_w = 0.0
    for e in dd_tree.postorder_edge_iter():
        if e.length is None:
            # Missing branch lengths are treated as zero.
            e.length = 0
        else:
            max_e_w = max(max_e_w, e.length)
    centroid_edge.length = weight_factor * max_e_w

    # Capture both end nodes before any rerooting takes place.
    a = centroid_edge.head_node
    b = centroid_edge.tail_node

    # NOTE(review): centroid_edge is obtained through phy_tree (built from
    # `tree`), while the rerooting below is performed on the deep copy
    # `dd_tree` -- confirm that mixing the two is intended.
    X = []
    for anchor in (a, b):
        dd_tree.reroot_at_node(anchor)
        ranked = sorted(
            ((leaf.taxon.label, leaf.distance_from_root())
             for leaf in dd_tree.leaf_nodes()),
            key=lambda pair: pair[1])
        # Slicing (instead of indexing 0..24) keeps small trees from
        # raising IndexError when fewer than n_per_side leaves exist.
        X.extend(label for label, _ in ranked[:n_per_side])

    with open(args.output + "/X.lab", "w") as f:
        f.write("\n".join(X))
def main(args):
    """Decompose the input tree and write one leaf-label file per subtree.

    The tree at ``args.input_tree_file`` (newick) is fully resolved, wrapped
    in a PhylogeneticTree and passed to ``decompose_trees`` together with
    ``args.max_subset_size``.  For the subtree at zero-based position ``i``
    the leaf names are written, one per line, to
    ``args.output + "_ctree" + str(i) + ".lab"``.
    """
    # Step 1: Decompose tree
    tree = dendropy.Tree.get(path=args.input_tree_file, schema="newick")
    tree.resolve_polytomies(limit=2, update_bipartitions=True)
    subtrees = decompose_trees(PhylogeneticTree(tree), args.max_subset_size)

    # Step 2: Write out leaf subsets
    # (a distinct loop variable avoids shadowing the input tree)
    for index, subtree in enumerate(subtrees):
        keep = subtree.leaf_node_names()
        with open(args.output + "_ctree" + str(index) + ".lab", 'w') as f:
            f.write("\n".join(keep))
def testCentroidEdge(self):
    """Run the centroid check on the single tree of the 100T fixture.

    Reads ``100T.fasta`` into a SequenceDataset, reads ``100T.tree`` with
    ``read_and_encode_splits``, asserts that exactly one tree was loaded and
    hands it (wrapped in a PhylogeneticTree) to ``self._do_test_centroid``.
    """
    sd = SequenceDataset()
    # Context managers close the fixture files deterministically; the 'U'
    # mode flag is omitted because it was removed in Python 3.11.
    with open(data_source_path('100T.fasta'), 'r') as fasta_stream:
        sd.read(fasta_stream, file_format='FASTA', datatype='DNA')
    with open(data_source_path('100T.tree'), 'r') as tree_stream:
        tree_list = read_and_encode_splits(sd.dataset, tree_stream)
    self.assertEqual(len(tree_list), 1)
    t = PhylogeneticTree(tree_list[0])
    self._do_test_centroid(t)
def bisect_tree(tree, breaking_edge_style='mincluster', min_size=0, max_size=None, max_diam=None):
    """Partition 'tree' into two parts.

    ``breaking_edge_style`` selects the strategy:

    * ``'midpoint'``   -- ``midpoint_bisect`` on the underlying tree; returns
      ``(None, None)`` when either half is None.
    * ``'mincluster'`` -- ``min_cluster_size_bisect`` with ``max_size``; a
      falsy half is returned as None.
    * anything else    -- the edge returned by ``tree.get_breaking_edge`` is
      used with ``tree.bipartition_by_edge``.

    ``max_diam`` is accepted but not used in this function.

    Returns a 2-tuple of PhylogeneticTree objects (or None entries as
    described above).
    """
    _LOG.debug("bisecting tree...")

    # uym2: midpoint decomposition (not in use for now)
    if breaking_edge_style == 'midpoint':
        _LOG.debug("breaking by midpoint")
        half_a, half_b = midpoint_bisect(tree._tree, min_size=min_size)
        if half_a is None or half_b is None:
            return None, None
        return PhylogeneticTree(half_a), PhylogeneticTree(half_b)

    # uym2: min_cluster decomposition
    if breaking_edge_style == 'mincluster':
        _LOG.debug("breaking using min-cluster strategy")
        half_a, half_b = min_cluster_size_bisect(tree._tree, max_size)
        wrapped = [PhylogeneticTree(part) if part else None
                   for part in (half_a, half_b)]
        return wrapped[0], wrapped[1]

    _LOG.debug("breaking by centroid")
    edge = tree.get_breaking_edge(breaking_edge_style)
    _LOG.debug("breaking_edge length = %s, %s" % (edge.length, breaking_edge_style))
    leaves_before = tree.n_leaves
    left, right = tree.bipartition_by_edge(edge)
    _LOG.debug("Tree 1 has %s nodes, tree 2 has %s nodes" % (left.n_leaves, right.n_leaves))
    # Every leaf must end up in exactly one of the two parts.
    assert leaves_before == left.n_leaves + right.n_leaves
    return left, right
def main(args):
    """Bisect the input tree and write the leaf labels of each half.

    The tree at ``args.input_tree_file`` (newick) is fully resolved, wrapped
    in a PhylogeneticTree and split with ``bisect_tree`` using its default
    arguments.  Leaf names of the first part go to ``args.output + "/A.lab"``
    and those of the second part to ``args.output + "/B.lab"``, one per line.

    Raises:
        RuntimeError: if ``bisect_tree`` returns None for either part.
    """
    # Step 1: Decompose tree
    tree = dendropy.Tree.get(path=args.input_tree_file, schema="newick")
    tree.resolve_polytomies(limit=2, update_bipartitions=True)
    t1, t2 = bisect_tree(PhylogeneticTree(tree))

    # bisect_tree may hand back None for a part; fail before any file is
    # written rather than with an AttributeError after A.lab already exists.
    if t1 is None or t2 is None:
        raise RuntimeError("bisect_tree did not produce two subtrees")

    # Step 2: Write out leaf subsets
    for suffix, part in (("/A.lab", t1), ("/B.lab", t2)):
        with open(args.output + suffix, "w") as f:
            f.write("\n".join(part.leaf_node_names()))
def build_subsets_tree(self, curr_tmp_dir_par):
    """Build a tree whose nodes each stand for one alignment subset job.

    Leaves of ``self.tree`` are mapped to their subset job through
    ``self.pasta_team.subsets``; the job name is the part of the job's
    ``tmp_dir_par`` that follows ``curr_tmp_dir_par``.  The tree is re-read
    from ``self.tree_str`` with translated leaf names, internal nodes are
    labelled from their children, and edges whose two ends share a label are
    contracted until every node carries exactly one job.

    Side effects:
        ``self.pasta_team.subsets`` is replaced by a mapping from node label
        to alignment job.

    Returns:
        The resulting PhylogeneticTree; each node has ``label`` set and
        ``alignment_subset_job`` reset to None.
    """
    translate = {}
    t2 = {}
    for node in self.tree._tree.leaf_iter():
        nalsj = self.pasta_team.subsets[node.taxon.label]
        newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]
        translate[node.taxon.label] = newname
        t2[newname] = set([nalsj])
    subsets_tree = PhylogeneticTree(
        read_newick_with_translate(StringIO(self.tree_str), translate_dict=translate))
    for node in subsets_tree._tree.leaf_iter():
        node.alignment_subset_job = t2[node.taxon]
    del t2
    del translate
    _LOG.debug("nodes labeled")

    # Then make sure the tree is rooted at a branch (not at a node).
    if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
        subsets_tree._tree.reroot_at_edge(subsets_tree._tree.seed_node.child_nodes()[0].edge)
    _LOG.debug("Subset Labeling (start):\n%s" %str(subsets_tree.compose_newick()))

    # Then label internal branches based on their children, and collapse
    # redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # My label is the intersection of my children's labels, unless the
        # intersection is empty, in which case it is their union.
        if not hasattr(node, "alignment_subset_job") or node.alignment_subset_job is None:
            node.alignment_subset_job = set.intersection(
                *[c.alignment_subset_job for c in node.child_nodes()])
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(
                    *[c.alignment_subset_job for c in node.child_nodes()])
        # Now prune any child whose label encompasses my label.
        # Use indexing instead of iteration, because with each collapse new
        # children can be added, and we want to process them as well.
        i = 0
        while i < len(node.child_nodes()):
            c = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(c.alignment_subset_job):
                # An edge leading to a tip is removed rather than collapsed.
                if c.child_nodes():
                    c.edge.collapse()
                else:
                    node.remove_child(c)
            else:
                i += 1

    # Now, the remaining edges have multiple labels.  These need to be
    # further resolved.  Do it by minimum length.
    # First find all candidate edges that we might want to contract.
    candidate_edges = set()
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.add((e.length, e))
    # Sort on the length only.  Sorting the raw (length, edge) tuples would
    # fall back to comparing edge objects on ties and would compare None
    # with numbers, both of which raise TypeError on Python 3.
    candidate_edges = sorted(candidate_edges, key=lambda x: x[0] if x[0] else -1)
    # Remove the edges one by one, only while an edge still has
    # intersecting labels at its two ends.
    for (el, edge) in candidate_edges:
        I = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if I:
            edge.tail_node.alignment_subset_job = I
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Make sure the tree is correct, remove the actual jobs from nodes (can
    # cause deep-copy problems), assign a label to each node, and keep a
    # mapping between the labels and actual alignment job objects.
    self.pasta_team.subsets = {}  # now maps subset labels to alignment jobs
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused
            node.taxon = Taxon(label=node.label)
    return subsets_tree
def launch_alignment(self, context_str=None):
    """Set up (and recursively launch) the merge jobs for ``self.tree``.

    With exactly two nodes in the subsets tree, the two subset jobs looked
    up in ``self.pasta_team.subsets`` become the children of this job and
    ``skip_merge`` is False.  Otherwise the tree is rerooted near its
    centroid edge, one PASTAMergerJob is created per child of the new root
    (on a copy of the tree that keeps only that child), ``skip_merge`` is
    True, and each new job's ``launch_alignment`` is called.

    Raises:
        RuntimeError: whenever ``self.killed`` is found to be set.
    """
    if self.killed:
        raise RuntimeError("PastaAligner Job killed")
    self._reset_jobs()
    self.context_str = '' if context_str is None else context_str

    node_count = self.tree.count_nodes()
    _LOG.debug("Recursive merge on a branch with %d subsets" % (node_count))
    prefix = "subsets tree: %s" %self.tree.compose_newick()[0:200]

    if node_count == 2:
        nodes = self.tree._tree.nodes()
        _LOG.debug("%s ... pairwise merge " % prefix)
        self.skip_merge = False
        self.subjob1 = self.pasta_team.subsets[nodes[0].label]
        self.subjob2 = self.pasta_team.subsets[nodes[1].label]
        for subjob in (self.subjob1, self.subjob2):
            subjob.add_parent(self)
            self.add_child(subjob)
    else:
        _LOG.debug("%s ... recursing further " % prefix)
        self.skip_merge = True

        # Reroot near centroid edge
        ce = self.tree.get_centroid_edge(spanning=True)
        if ce.head_node.is_leaf():
            new_root = ce.tail_node
        else:
            new_root = ce.head_node
        self.tree._tree.reroot_at_node(new_root, delete_outdegree_one=False)
        _LOG.debug("rerooted to: %s ..." % self.tree.compose_newick()[0:200])

        # For each path from root to its children, create a new merge job
        created_jobs = []
        root = self.tree._tree.seed_node
        children = root.child_nodes()
        for keepchild in children:
            # Temporarily detach every other child, copy the tree, then
            # re-attach the detached children in reverse order.
            detached = [
                root.reversible_remove_child(other, suppress_deg_two=False)
                for other in children if other != keepchild
            ]
            t1 = PhylogeneticTree(Tree(self.tree._tree))
            for removed in reversed(detached):
                root.reinsert_nodes(removed)
            _LOG.debug("child = %s ..." % t1.compose_newick()[0:200])

            multilocus_dataset1 = self.multilocus_dataset.new_with_shared_meta()
            if t1.count_nodes() == 2:
                ns = t1._tree.nodes()
                tmp_dir_par = self.get_pairwise_temp_dir(ns[0].label, ns[1].label)
            else:
                tmp_dir_par = self.tmp_base_dir
            configuration = self.configuration()
            cj = PASTAMergerJob(multilocus_dataset=multilocus_dataset1,
                                pasta_team=self.pasta_team,
                                tree=t1,
                                tmp_base_dir=self.tmp_base_dir,
                                tmp_dir_par=tmp_dir_par,
                                delete_temps2=False,
                                **configuration)
            cj.add_parent(self)
            self.add_child(cj)
            created_jobs.append(cj)
        self.merge_job_list = created_jobs

        # now launch these new merge jobs
        for merge_job in self.merge_job_list:
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            merge_job.launch_alignment()

    # NOTE(review): the nesting of the statements below is not recoverable
    # from the collapsed source; they are kept at function level -- confirm.
    self._merge_queued_event.set()
    if self.killed:
        raise RuntimeError("PastaAligner Job killed")
    return
def build_subsets_tree(self, curr_tmp_dir_par, build_min_tree=True):
    """Build the tree over alignment subsets used to order the merges.

    Parameters:
        curr_tmp_dir_par: directory prefix; the subset name of a job is the
            part of its ``tmp_dir_par`` after this prefix and one separator.
        build_min_tree: when true (default) the tree comes from
            ``build_groups_MST``; otherwise a heuristic that labels and
            contracts the tree parsed from ``self.tree_str`` is used.

    Side effects:
        ``self.pasta_team.subsets`` is replaced by a mapping from subset
        name / node label to alignment job.

    Returns:
        A PhylogeneticTree whose nodes are labelled with subset names.

    Raises:
        Exception: (heuristic path) if two nodes end up with the same label.
    """
    # uym2 added: add option for MST
    if build_min_tree:
        _LOG.debug("START building Minimum Spanning Tree")
        grouping = {}
        groupName2jobName = {}
        for node in self.tree._tree.leaf_node_iter():
            groupName = self.pasta_team.subsets[
                node.taxon.label].tmp_dir_par[len(curr_tmp_dir_par) + 1:]
            grouping[node.taxon.label] = groupName.replace("/", "")
            groupName2jobName[groupName] = self.pasta_team.subsets[
                node.taxon.label]
        subsets_tree = build_groups_MST(self.tree._tree, grouping)
        # presumably puts back the "/" separators stripped above, so labels
        # match the keys of groupName2jobName -- verify against the naming
        # scheme of tmp_dir_par.
        for node in subsets_tree.postorder_node_iter():
            if node.is_leaf():
                node.taxon.label = node.taxon.label.replace("d", "/d")
            node.label = node.label.replace("d", "/d")
        self.pasta_team.subsets = groupName2jobName
        MST = PhylogeneticTree(subsets_tree)
        _LOG.debug("Spanning tree is:\n %s" % MST)
        return MST
    ###################################
    _LOG.debug("START building heuristic spanning tree")
    translate = {}
    t2 = {}
    for node in self.tree._tree.leaf_node_iter():
        nalsj = self.pasta_team.subsets[node.taxon.label]
        newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
        translate[node.taxon.label] = newname
        t2[newname] = set([nalsj])
    subsets_tree = PhylogeneticTree(
        Tree.get(data=self.tree_str, schema='newick'))
    for node in subsets_tree._tree.leaf_node_iter():
        node.alignment_subset_job = t2[translate[node.taxon.label]]
    del t2
    del translate
    _LOG.debug("leafs labeled")

    # Then make sure the tree is rooted at a branch (not at a node).
    if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
        # Prefer an internal edge; if none is found the last child is used.
        for c in subsets_tree._tree.seed_node.child_nodes():
            if c.edge.is_internal():
                break
        subsets_tree._tree.is_rooted = True
        # Split the edge length evenly around the new root.  An edge without
        # a length stays without one instead of raising TypeError on the
        # division.
        root_edge = c.edge
        half = root_edge.length / 2. if root_edge.length is not None else None
        subsets_tree._tree.reroot_at_edge(root_edge,
                                          length1=half,
                                          length2=half,
                                          suppress_unifurcations=False)
    _LOG.debug(
        "Subset Labeling (start):\n%s" %
        str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # Then label internal branches based on their children, and collapse
    # redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # My label is the intersection of my children's labels, unless the
        # intersection is empty, in which case it is their union.
        if not hasattr(node, "alignment_subset_job"
                       ) or node.alignment_subset_job is None:
            node.alignment_subset_job = set.intersection(
                *[c.alignment_subset_job for c in node.child_nodes()])
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(
                    *[c.alignment_subset_job for c in node.child_nodes()])
        # Now prune any child whose label encompasses my label.
        # Use indexing instead of iteration, because with each collapse new
        # children can be added, and we want to process them as well.
        i = 0
        while i < len(node.child_nodes()):
            c = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(c.alignment_subset_job):
                # An edge leading to a tip is removed rather than collapsed.
                if c.child_nodes():
                    c.edge.collapse()
                else:
                    node.remove_child(c)
            else:
                i += 1
        node.label = "+".join(nj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
                              for nj in node.alignment_subset_job)
        if node.is_leaf():
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                label=node.label)
    _LOG.debug(
        "Before final round, the tree is:\n %s" %
        str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # Now, the remaining edges have multiple labels.  These need to be
    # further resolved.  Do it by minimum length.
    # First find all candidate edges that we might want to contract.
    candidate_edges = set()
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.add((e.length, e))
    # Then sort the edges (missing/zero lengths first), and remove them one
    # by one only while an edge still has intersecting labels at both ends.
    candidate_edges = sorted(candidate_edges,
                             key=lambda x: x[0] if x[0] else -1)
    for (el, edge) in candidate_edges:
        I = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if I:
            edge.tail_node.alignment_subset_job = I
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Make sure the tree is correct, remove the actual jobs from nodes (can
    # cause deep-copy problems), assign a label to each node, and keep a
    # mapping between the labels and actual alignment job objects.
    self.pasta_team.subsets = {}  # now maps subset labels to alignment jobs
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        # only keep the last part of the name
        node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                label=node.label)
    _LOG.debug("Spanning tree is:\n %s" % subsets_tree)

    labels = [nd.label for nd in subsets_tree._tree.postorder_node_iter()]
    if len(set(labels)) != len(labels):
        import collections
        raise Exception("Duplicate names found %s" % "\n".join(
            item for item, count in collections.Counter(labels).items()
            if count > 1))
    return subsets_tree
def build_subsets_tree(self, curr_tmp_dir_par,build_min_tree=True):
    """Return a tree over the alignment subsets, used to schedule merges.

    Parameters:
        curr_tmp_dir_par: directory prefix; a job's subset name is its
            ``tmp_dir_par`` with this prefix and one separator removed.
        build_min_tree: if true (default), delegate tree construction to
            ``build_groups_MST``; if false, label and contract the tree
            parsed from ``self.tree_str`` heuristically.

    Side effects:
        Rebinds ``self.pasta_team.subsets`` to a dict keyed by subset name /
        node label whose values are the alignment jobs.

    Returns:
        PhylogeneticTree with one labelled node per subset.

    Raises:
        Exception: (heuristic path) when node labels are not unique.
    """
    # uym2 added: add option for MST
    if build_min_tree:
        _LOG.debug("START building Minimum Spanning Tree")
        grouping = {}
        groupName2jobName = {}
        for node in self.tree._tree.leaf_node_iter():
            groupName = self.pasta_team.subsets[node.taxon.label].tmp_dir_par[len(curr_tmp_dir_par)+1:]
            grouping[node.taxon.label] = groupName.replace("/","")
            groupName2jobName[groupName] = self.pasta_team.subsets[node.taxon.label]
        subsets_tree = build_groups_MST(self.tree._tree,grouping)
        # presumably re-inserts the "/" separators removed when building
        # `grouping`, so labels line up with groupName2jobName keys -- TODO
        # confirm against the tmp_dir_par naming scheme.
        for node in subsets_tree.postorder_node_iter():
            if node.is_leaf():
                node.taxon.label = node.taxon.label.replace("d","/d")
            node.label = node.label.replace("d","/d")
        self.pasta_team.subsets = groupName2jobName
        MST = PhylogeneticTree(subsets_tree)
        _LOG.debug("Spanning tree is:\n %s" %MST)
        return MST
    ###################################
    _LOG.debug("START building heuristic spanning tree")
    translate = {}
    t2 = {}
    for node in self.tree._tree.leaf_node_iter():
        nalsj = self.pasta_team.subsets[node.taxon.label]
        newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:]
        translate[node.taxon.label] = newname
        t2[newname] = set([nalsj])
    subsets_tree = PhylogeneticTree(Tree.get(data=self.tree_str,schema='newick'))
    for node in subsets_tree._tree.leaf_node_iter():
        node.alignment_subset_job = t2[translate[node.taxon.label]]
    del t2
    del translate
    _LOG.debug("leafs labeled")

    #Then make sure the tree is rooted at a branch (not at a node).
    if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
        # Stop at the first internal edge; otherwise the last child is used.
        for c in subsets_tree._tree.seed_node.child_nodes():
            if c.edge.is_internal():
                break
        subsets_tree._tree.is_rooted = True
        # Halve the edge length for the two new root edges; tolerate an edge
        # that has no length (the division used to raise TypeError).
        new_root_edge = c.edge
        if new_root_edge.length is None:
            half_length = None
        else:
            half_length = new_root_edge.length/2.
        subsets_tree._tree.reroot_at_edge(new_root_edge,length1=half_length,
                                          length2=half_length,
                                          suppress_unifurcations=False)
    _LOG.debug("Subset Labeling (start):\n%s" %str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # Then label internal branches based on their children, and collapse
    # redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # my label is the intersection of my children,
        # unless the intersection is empty, in which case it is the union
        if not hasattr(node, "alignment_subset_job") or node.alignment_subset_job is None:
            node.alignment_subset_job = set.intersection(*[c.alignment_subset_job for c in node.child_nodes()])
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(*[c.alignment_subset_job for c in node.child_nodes()])
        # Now go ahead and prune any child whose label encompasses my label.
        # Use indexing instead of iteration, because with each collapse,
        # new children can be added, and we want to process them as well.
        i = 0
        while i < len(node.child_nodes()):
            c = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(c.alignment_subset_job):
                # An edge that leads to a tip is removed instead of collapsed.
                if c.child_nodes():
                    c.edge.collapse()
                else:
                    node.remove_child(c)
            else:
                i += 1
        node.label = "+".join(nj.tmp_dir_par[len(curr_tmp_dir_par)+1:] for nj in node.alignment_subset_job)
        if node.is_leaf():
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(label=node.label)
    _LOG.debug("Before final round, the tree is:\n %s" %str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # Now, the remaining edges have multiple labels. These need to
    # be further resolved. Do it by minimum length
    # First find all candidate edges that we might want to contract
    candidate_edges = set()
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(e.tail_node.alignment_subset_job):
            candidate_edges.add( (e.length,e) )
    # Then sort the edges, and start removing them one by one
    # only if an edge is still having intersecting labels at the two ends
    candidate_edges = sorted(candidate_edges, key=lambda x: x[0] if x[0] else -1)
    for (el, edge) in candidate_edges:
        I = edge.tail_node.alignment_subset_job.intersection(edge.head_node.alignment_subset_job)
        if I:
            edge.tail_node.alignment_subset_job = I
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Make sure the tree is correct, remove the actual jobs
    # from nodes (can cause deep-copy problems), assign a label to each
    # node, and keep a mapping between the labels and actual alignment job objects
    self.pasta_team.subsets = {} # Let this now map from subset labels to the actual alignment jobs
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par)+1:] # only the last part of the name
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(label=node.label)
    _LOG.debug("Spanning tree is:\n %s" %subsets_tree)

    labels = [nd.label for nd in subsets_tree._tree.postorder_node_iter()]
    if len(set(labels)) != len(labels):
        import collections
        raise Exception("Duplicate names found %s" %"\n".join (item for item, count in collections.Counter(labels).items() if count > 1))
    return subsets_tree
def bisect_tree(tree,
                breaking_edge_style='mincluster',
                breaking_constraint='nleaves',
                min_size=0,
                max_size=None,
                max_diam=None,
                max_brlen=None):
    """Partition 'tree' into two parts.

    Parameters:
        tree: PhylogeneticTree to split.
        breaking_edge_style: ``'midpoint'``, ``'mincluster'`` (default), or a
            style understood by ``tree.get_breaking_edge`` (e.g.
            ``'centroid'``).
        breaking_constraint: ``'nleaves'`` (default) or ``'brlen'``; selects
            the min-cluster variant and enables extra branch-length logging
            on the edge-breaking path.
        min_size: forwarded to ``midpoint_bisect``.
        max_size: forwarded to ``min_cluster_size_bisect``.
        max_diam: accepted but not used in this function.
        max_brlen: forwarded to ``min_cluster_brlen_bisect``.

    Returns:
        A 2-tuple of PhylogeneticTree objects.  ``'midpoint'`` returns
        ``(None, None)`` if either half is None; ``'mincluster'`` returns
        None for a falsy half.

    Raises:
        ValueError: for ``'mincluster'`` with an unknown
            ``breaking_constraint`` (previously an unbound-variable error).
    """
    _LOG.debug("bisecting tree...")
    # uym2: midpoint decomposition (not in use for now)
    if breaking_edge_style == 'midpoint':
        _LOG.debug("breaking by midpoint")
        t1, t2 = midpoint_bisect(tree._tree, min_size=min_size)
        if t1 is None or t2 is None:
            return None, None
        tree1 = PhylogeneticTree(t1)
        tree2 = PhylogeneticTree(t2)
        return tree1, tree2
    ###############
    # uym2 (2019): min_cluster decomposition
    if breaking_edge_style == 'mincluster':
        _LOG.debug("breaking using min-cluster strategy")
        if breaking_constraint == 'nleaves':
            _LOG.debug(
                "constraining on the maximum number of leaves each subtree can have"
            )
            t1, t2 = min_cluster_size_bisect(tree._tree, max_size)
        elif breaking_constraint == 'brlen':
            _LOG.debug(
                "constraining on the maximum sum of branch lengths each subtree can have"
            )
            t1, t2 = min_cluster_brlen_bisect(tree._tree, max_brlen)
        else:
            # Without this branch t1/t2 would be unbound below.
            raise ValueError(
                "unsupported breaking_constraint %r for 'mincluster'" %
                (breaking_constraint,))
        tree1 = PhylogeneticTree(t1) if t1 else None
        tree2 = PhylogeneticTree(t2) if t2 else None
        return tree1, tree2

    if breaking_edge_style == 'centroid':
        _LOG.debug("breaking recursively at centroid edge")
    # NOTE(review): the collapsed source does not show whether the code
    # below was nested under the 'centroid' test; it is kept at function
    # level so other edge styles still return a pair -- confirm.
    e = tree.get_breaking_edge(breaking_edge_style)
    _LOG.debug("breaking_edge length = %s, %s" %
               (e.length, breaking_edge_style))
    snl = tree.n_leaves
    tree1, tree2 = tree.bipartition_by_edge(e)
    _LOG.debug("Tree 1 has %s nodes, tree 2 has %s nodes" %
               (tree1.n_leaves, tree2.n_leaves))
    if breaking_constraint == 'brlen':
        _LOG.debug(
            "Tree 1 has %s total edge length, tree 2 has %s total edge length"
            % (tree1.sum_brlen(), tree2.sum_brlen()))
    # Every leaf must end up in exactly one of the two parts.
    assert snl == tree1.n_leaves + tree2.n_leaves
    return tree1, tree2
def launch_alignment(self, context_str=None):
    """Create the child jobs for ``self.tree`` and launch nested merges.

    A two-node subsets tree is handled as a pairwise merge: both subset jobs
    from ``self.pasta_team.subsets`` are attached as children and
    ``skip_merge`` is set to False.  Any other tree is rerooted near its
    centroid edge and split at the root: for every root child a
    PASTAMergerJob is built on a tree copy containing only that child,
    ``skip_merge`` is set to True, and the new jobs are launched in turn.

    Raises:
        RuntimeError: whenever ``self.killed`` is found to be set.
    """
    if self.killed:
        raise RuntimeError("PastaAligner Job killed")
    self._reset_jobs()
    self.context_str = '' if context_str is None else context_str

    node_count = self.tree.count_nodes()
    _LOG.debug("Recursive merge on a branch with %d subsets" % (node_count))
    prefix = "subsets tree: %s" % self.tree.compose_newick()[0:200]

    if node_count == 2:
        nodes = self.tree._tree.nodes()
        _LOG.debug("%s ... pairwise merge " % prefix)
        self.skip_merge = False
        self.subjob1 = self.pasta_team.subsets[nodes[0].label]
        self.subjob2 = self.pasta_team.subsets[nodes[1].label]
        for subjob in (self.subjob1, self.subjob2):
            subjob.add_parent(self)
            self.add_child(subjob)
    else:
        _LOG.debug("%s ... recursing further " % prefix)
        self.skip_merge = True

        # Reroot near centroid edge
        ce = self.tree.get_centroid_edge(spanning=True)
        if ce.head_node.is_leaf():
            new_root = ce.tail_node
        else:
            new_root = ce.head_node
        self.tree._tree.reroot_at_node(new_root, suppress_unifurcations=False)
        _LOG.debug("rerooted to: %s ..." %
                   self.tree.compose_newick()[0:200])

        # For each path from root to its children, create a new merge job
        created_jobs = []
        root = self.tree._tree.seed_node
        children = root.child_nodes()
        for keepchild in children:
            # Temporarily detach every other child, copy the tree, then
            # re-attach the detached children in reverse order.
            detached = [
                root.reversible_remove_child(other,
                                             suppress_unifurcations=False)
                for other in children if other != keepchild
            ]
            t1 = PhylogeneticTree(Tree(self.tree._tree))
            for removed in reversed(detached):
                root.reinsert_nodes(removed)
            _LOG.debug("child = %s ..." % t1.compose_newick()[0:200])

            multilocus_dataset1 = self.multilocus_dataset.new_with_shared_meta()
            if t1.count_nodes() == 2:
                ns = t1._tree.nodes()
                tmp_dir_par = self.get_pairwise_temp_dir(
                    ns[0].label, ns[1].label)
            else:
                tmp_dir_par = self.tmp_base_dir
            configuration = self.configuration()
            cj = PASTAMergerJob(multilocus_dataset=multilocus_dataset1,
                                pasta_team=self.pasta_team,
                                tree=t1,
                                tmp_base_dir=self.tmp_base_dir,
                                tmp_dir_par=tmp_dir_par,
                                delete_temps2=False,
                                **configuration)
            cj.add_parent(self)
            self.add_child(cj)
            created_jobs.append(cj)
        self.merge_job_list = created_jobs

        # now launch these new merge jobs
        for merge_job in self.merge_job_list:
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            merge_job.launch_alignment()

    # NOTE(review): the nesting of the statements below is not recoverable
    # from the collapsed source; they are kept at function level -- confirm.
    self._merge_queued_event.set()
    if self.killed:
        raise RuntimeError("PastaAligner Job killed")
    return
# Script entry point.  Command-line arguments (positional):
#   argv[1]  main_dir       working directory holding the input files
#   argv[2]  num_processes  parsed as int (not used in the visible part)
#   argv[3]  max_size       forwarded to decompose_phylogeny
#   argv[4]  min_size       forwarded to decompose_phylogeny
# NOTE(review): this block looks like an excerpt -- several names assigned
# here (orig_phy, IDs, num_processes, commands) are presumably consumed by
# code that follows; confirm before removing anything.
if __name__ == '__main__':
    # main_dir = '/Users/esayyari/UCSD/oasis/viruses/all/corona_overall.April7/'
    main_dir = argv[1]
    num_processes = int(argv[2])
    max_size = int(argv[3])
    min_size = int(argv[4])
    # Full alignment: only its taxon count is reported here.
    ca = CompactAlignment()
    ca.read_filepath(join(main_dir, 'dna-sequences.fasta'))
    print("The total number of taxa is", ca.get_num_taxa())
    # Guide tree is read from <main_dir>/sequences/fastme_tree.nwk.
    tree = dendropy.Tree.get(path=join(main_dir, 'sequences', 'fastme_tree.nwk'), schema="newick")
    phy = PhylogeneticTree(tree)
    # Deep copy taken before decomposition -- presumably because
    # decompose_phylogeny modifies `phy`; verify.
    orig_phy = deepcopy(phy)
    trees_map = decompose_phylogeny(phy, max_size=max_size, min_size=min_size)
    # Core alignment: its keys are collected as IDs.
    core_ca = CompactAlignment()
    core_ca.read_filepath(join(main_dir, 'dna-sequences.core.fasta'))
    IDs = list(core_ca.keys())
    tmp_dir = join(main_dir, 'sub_process')
    i = 0
    commands = []
    # One single-threaded mafft command line per entry of trees_map; the
    # input file name carries the 1-based index of the entry.
    for tmp_tre in trees_map:
        i += 1
        command = [
            'mafft', '--reorder', '--nomemsave', '--thread', '1', '--auto',
            join(tmp_dir, 'dna-sequences.' + str(i) + '.fa')
        ]
        commands.append(command)
def build_subsets_tree(self, curr_tmp_dir_par):
    """Derive the subsets tree (one node per alignment subset job).

    Each leaf of ``self.tree`` is mapped through ``self.pasta_team.subsets``
    to its job; the job's name is its ``tmp_dir_par`` with the
    ``curr_tmp_dir_par`` prefix and one separator removed.  The newick in
    ``self.tree_str`` is re-read with the translated names, internal nodes
    receive the intersection (or, if empty, the union) of their children's
    job sets, and edges joining nodes with overlapping sets are contracted
    until each node holds a single job.

    Side effects:
        ``self.pasta_team.subsets`` is rebound to a dict from node label to
        alignment job.

    Returns:
        The PhylogeneticTree of subsets; nodes carry ``label`` and have
        ``alignment_subset_job`` set to None.
    """
    translate = {}
    t2 = {}
    for node in self.tree._tree.leaf_iter():
        nalsj = self.pasta_team.subsets[node.taxon.label]
        newname = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
        translate[node.taxon.label] = newname
        t2[newname] = set([nalsj])
    subsets_tree = PhylogeneticTree(
        read_newick_with_translate(StringIO(self.tree_str),
                                   translate_dict=translate))
    for node in subsets_tree._tree.leaf_iter():
        node.alignment_subset_job = t2[node.taxon]
    del t2
    del translate
    _LOG.debug("nodes labeled")

    # Then make sure the tree is rooted at a branch (not at a node).
    if len(subsets_tree._tree.seed_node.child_nodes()) > 2:
        subsets_tree._tree.reroot_at_edge(
            subsets_tree._tree.seed_node.child_nodes()[0].edge)
    _LOG.debug("Subset Labeling (start):\n%s" %
               str(subsets_tree.compose_newick()))

    # Then label internal branches based on their children, and collapse
    # redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # My label is the intersection of my children's labels, unless the
        # intersection is empty, in which case it is their union.
        if not hasattr(node, "alignment_subset_job"
                       ) or node.alignment_subset_job is None:
            node.alignment_subset_job = set.intersection(
                *[c.alignment_subset_job for c in node.child_nodes()])
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(
                    *[c.alignment_subset_job for c in node.child_nodes()])
        # Now prune any child whose label encompasses my label.
        # Use indexing instead of iteration, because with each collapse new
        # children can be added, and we want to process them as well.
        i = 0
        while i < len(node.child_nodes()):
            c = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(c.alignment_subset_job):
                # An edge leading to a tip is removed rather than collapsed.
                if c.child_nodes():
                    c.edge.collapse()
                else:
                    node.remove_child(c)
            else:
                i += 1

    # Now, the remaining edges have multiple labels.  These need to be
    # further resolved.  Do it by minimum length.
    # First find all candidate edges that we might want to contract.
    candidate_edges = set()
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.add((e.length, e))
    # Order by length alone.  Sorting the (length, edge) tuples directly
    # compares edge objects when lengths tie and compares None with numbers,
    # which raises TypeError on Python 3.
    candidate_edges = sorted(candidate_edges,
                             key=lambda x: x[0] if x[0] else -1)
    # Remove the edges one by one, only while an edge still has
    # intersecting labels at its two ends.
    for (el, edge) in candidate_edges:
        I = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if I:
            edge.tail_node.alignment_subset_job = I
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Make sure the tree is correct, remove the actual jobs from nodes (can
    # cause deep-copy problems), assign a label to each node, and keep a
    # mapping between the labels and actual alignment job objects.
    self.pasta_team.subsets = {}  # now maps subset labels to alignment jobs
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[len(curr_tmp_dir_par) + 1:]
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused
            node.taxon = Taxon(label=node.label)
    return subsets_tree