def testRerootSplits(self):
    newick = "((Athrotaxi,(Callitris,(Juniperusc,Libocedrus))),(((((((Basichlsac,(Mougeotisp,Lamprothma)),Thuidium),(Petalaphy,Haplomitr2)),((Botrychbit,(Vittarifle,((Dicksonant,((Polypodapp,Oleandrapi),Dennstasam)),Azollacaro))),Angiopteri)),Isoetesmel),((Sagittari,(Calochort,(Tacca,(Calathea,Ravenala)))),((Nelumbo,((((((Verbena,((Thunbergi,Acanthus),(Proboscid,Harpogoph))),Asclepias),Menyanthe),(Phyllonom,(Chamaedap,Pyrola))),((((Mirabilus,Pisum),Circaea),((Rheinward,Octomeles),Greyia)),Dudleya)),Phoradend)),(((Liriodchi,Annona),Gyrocarpu),Illicium)))),(Pseudotsu,(Agathisova,Agathismac))));"
    d = dataio.trees_from_newick([newick])
    tree = d.trees_blocks[0][0]
    taxa_block = d.taxa_blocks[0]
    ref = dataio.trees_from_newick([newick], taxa_block=taxa_block).trees_blocks[0][0]
    encode_splits(tree)
    encode_splits(ref)
    r = tree.seed_node
    curr_n = r.child_nodes()[1]
    former_mask = curr_n.edge.clade_mask
    tm = r.edge.clade_mask
    nbits = count_bits(tm)
    from dendropy.splits import split_as_string
    tree.reroot_at(curr_n, splits=True, delete_deg_two=False)
    new_root = tree.seed_node
    self.assertEqual(tm, new_root.edge.clade_mask)
    self.assertEqual(True, new_root is curr_n)
    self.assertEqual(True, r.parent_node is curr_n)
    flipped = (~(r.edge.clade_mask)) & tm
    self.assertEqual(True, (former_mask == r.edge.clade_mask) or (flipped == former_mask))

def testChangeTranslate(self):
    f = """#NEXUS
Begin taxa ;
    dimensions ntax = 4;
    taxlabels a b c d ;
end;
begin trees;
    translate
        1 a,
        2 b,
        3 c,
        4 d;
    tree t = (1,2,(3,4));
end;
begin trees;
    translate
        1 d,
        2 b,
        3 c,
        4 a;
    tree t = (4,2,(3,1));
end;
"""
    d = Dataset()
    d.read(StringIO(f), format="NEXUS")
    t = d.trees_blocks[0][0]
    s = d.trees_blocks[1][0]
    self.assertEqual(t.taxa_block, s.taxa_block)
    encode_splits(s)
    encode_splits(t)
    self.assertEqual(treedists.symmetric_difference(t, s), 0)

def testEuclideanDist(self):
    d = dataio.trees_from_newick([
        "((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);",
        "((t5:2.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);",
        "((t5:0.161175,t6:0.161175):0.392293,((t2:0.075411,(t4:0.104381,t1:0.075411):1):0.065840,t3:0.170221):0.383247);",
        "((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);",
    ])
    tree_list = [i[0] for i in d.trees_blocks]
    #print "\n".join([str(i) for i in tree_list])
    for i in tree_list:
        encode_splits(i)
    assert_approx_equal(treedists.euclidean_distance(tree_list[0], tree_list[1]), 2.0)
    assert_approx_equal(treedists.euclidean_distance(tree_list[0], tree_list[2]), math.sqrt(2.0))
    assert_approx_equal(treedists.euclidean_distance(tree_list[0], tree_list[3]), 0.97103099999999998)
    assert_approx_equal(treedists.euclidean_distance(tree_list[1], tree_list[2]), math.sqrt(6.0))
    assert_approx_equal(treedists.euclidean_distance(tree_list[1], tree_list[3]), 2.2232636377544162)
    assert_approx_equal(treedists.euclidean_distance(tree_list[2], tree_list[3]), 1.000419513484718)

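# Worked check of the first expected value above (added illustration, not part of the
# original test suite): trees 0 and 1 differ only in the length of the edge leading to
# t5 (0.161175 vs 2.161175), so the euclidean (branch-length) distance reduces to a
# single term:
#     sqrt((2.161175 - 0.161175)**2) == 2.0
# which is exactly the value asserted for euclidean_distance(tree_list[0], tree_list[1]).
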
def testCollapseClade(self):
    tree = dataio.trees_from_newick(["(t5,t6,((t4,(t2,t1)),t3));"]).trees_blocks[0][0]
    encode_splits(tree)
    root = tree.seed_node
    root_children = root.child_nodes()
    fc = root_children[0]
    collapse_clade(fc)
    tree.debug_check_tree(splits=True)
    self.assertEqual(str(tree), "(t5,t6,((t4,(t2,t1)),t3))")
    fc2 = root_children[2]
    fc2children = fc2.child_nodes()
    t124child = fc2children[0]
    collapse_clade(t124child)
    tree.debug_check_tree(logger_obj=_LOG)
    self.assertEqual(str(tree), "(t5,t6,((t4,t2,t1),t3))")
    collapse_clade(fc2)
    tree.debug_check_tree(logger_obj=_LOG)
    self.assertEqual(str(tree), "(t5,t6,(t4,t2,t1,t3))")
    collapse_clade(root)
    tree.debug_check_tree(logger_obj=_LOG)
    self.assertEqual(str(tree), "(t5,t6,t4,t2,t1,t3)")
    tree = dataio.trees_from_newick(["((t5,t6),((t4,(t2,t1)),t3));"]).trees_blocks[0][0]
    root = tree.seed_node
    collapse_clade(root)
    tree.debug_check_tree(logger_obj=_LOG)
    self.assertEqual(str(tree), "(t5,t6,t4,t2,t1,t3)")

def kernelOfTest(self, trees):
    expected = trees[-1]
    input = trees[:-1]
    output = strict_consensus_merge(input)
    encode_splits(output)
    encode_splits(expected)
    if symmetric_difference(expected, output) != 0:
        self.fail("\n%s\n!=\n%s" % (str(output), str(expected)))

def map_split_support_to_tree(self, tree, split_distribution):
    "Maps split support onto the given tree."
    split_frequencies = split_distribution.split_frequencies
    tree.normalize_taxa(taxa_block=split_distribution.taxa_block)
    assert tree.taxa_block is split_distribution.taxa_block
    splits.encode_splits(tree)
    for split in tree.split_edges:
        if split in split_frequencies:
            split_support = split_frequencies[split]
        else:
            split_support = 0.0
        self.map_split_support_to_node(tree.split_edges[split].head_node, split_support)
    return tree

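# Hedged usage sketch (added illustration, not from the original source). This method is
# written as an instance method; elsewhere in this code the summarizer object is
# treesum.TreeSummarizer, so a typical call is assumed to look like the hypothetical
# helper below, given a target tree and sample trees that share a taxa block.
def _example_map_support(target_tree, sample_trees, taxa_block):
    sd = splits.SplitDistribution(taxa_block=taxa_block)
    for t in sample_trees:
        splits.encode_splits(t)
        sd.count_splits_on_tree(t)
    tsum = treesum.TreeSummarizer()
    # each edge's head node ends up annotated with the frequency of its split in sd
    # (0.0 for splits never seen among the sample trees)
    return tsum.map_split_support_to_tree(target_tree, sd)
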
def testSymmDiff(self):
    newick = "((t5,t6),((t4,(t2,t1)),t3));"
    d = dataio.trees_from_newick([newick])
    ref = d.trees_blocks[0][0]
    taxa_block = d.taxa_blocks[0]
    encode_splits(ref)
    o_newick = "((t1,t2),((t4,(t5,t6)),t3));"
    o_tree = dataio.trees_from_newick([o_newick], taxa_block=taxa_block).trees_blocks[0][0]
    encode_splits(o_tree)
    self.assertEqual(treedists.symmetric_difference(o_tree, ref), 2)

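# Why the expected distance above is 2 (added note, not part of the original test):
# the reference tree's non-trivial splits are {t5,t6}, {t1,t2} and {t1,t2,t4}, while
# o_tree's are {t5,t6}, {t1,t2} and {t4,t5,t6}. Each tree therefore has exactly one
# non-trivial split the other lacks, so the unweighted symmetric difference is 1 + 1 = 2.
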
def check_tree(self, tree_str):
    d1 = datasets.Dataset()
    tree1 = d1.trees_from_string(tree_str, format="newick")[0]
    pa, edge_lens = to_parent_array(tree1, True, False)
    _LOG.info('Original tree: %s' % tree_str)
    cmd = self.prog_path + " " + " ".join(pa)
    stdout, stderr, returncode = run_program(cmd)
    assert returncode == 0, "Program exited with error:\n%s" % stderr
    _LOG.info('Returned tree: %s' % stdout)
    tree2 = d1.trees_from_string(stdout, format="newick")[0]
    splits.encode_splits(tree1)
    splits.encode_splits(tree2)
    d = treedists.symmetric_difference(tree1, tree2)
    assert d == 0, "Symmetric distance = %d:\n%s;\n%s;" % (d, tree_str, stdout)

def score_tree_list(self, full_dataset, inp_trees, stop_gen):
    culled = self._write_garli_input(full_dataset)
    culled_taxa = culled.taxa_blocks[0]
    self.set_active_taxa(culled_taxa)
    rescored = []
    for tree_ind, tree in enumerate(inp_trees):
        tm = self.score_tree(tree, culled, tree_ind, stop_gen=stop_gen)
        encode_splits(tm.tree)
        rescored.append(tm)
    rescored.sort(reverse=True)
    del full_dataset.trees_blocks[:]
    full_dataset.trees_blocks.append([i.tree for i in rescored])
    o = open("incrgarli.tre", "w")
    write_tree_file(o, rescored, culled)
    o.close()
    return rescored

def testRandomlyReorient(self):
    n = '(Basichlsac,(Lamprothma,Mougeotisp),(((Haplomitr2,Petalaphy),((Angiopteri,(((Azollacaro,((Dennstasam,(Oleandrapi,Polypodapp)),Dicksonant)),Vittarifle),Botrychbit)),(Isoetesmel,((((Agathismac,Agathisova),Pseudotsu),(((Libocedrus,Juniperusc),Callitris),Athrotaxi)),((Liriodchi,Nelumbo),Sagittari))))),Thuidium));'
    m = [n, n]
    dataset = dataio.trees_from_newick(m)
    trees = [i[0] for i in dataset.trees_blocks]
    ref = trees[0]
    changing = trees[1]
    rng = DebuggingRandom()
    encode_splits(ref)
    encode_splits(changing)
    for i in xrange(50):
        randomly_reorient_tree(changing, rng=rng, splits=True)
        self.assertNotEqual(str(changing), n)
        changing.debug_check_tree(logger_obj=_LOG, splits=True)
        if symmetric_difference(ref, changing) != 0:
            self.fail("\n%s\n!=\n%s" % (str(ref), str(changing)))

def strict_consensus_merge(trees_to_merge, copy_trees=False, rooted=False, gordons_supertree=False):
    """Returns a tree that is the strict consensus merger of the input trees.

    If `copy_trees` is True, the trees are copied before the merger operation;
    if `copy_trees` is False, the input trees are destroyed by the operation
    (and the modified first tree is returned).
    """
    if copy_trees:
        tree_list = [copy.copy(i) for i in trees_to_merge]
    else:
        tree_list = list(trees_to_merge)
        del trees_to_merge[1:]
    nTrees = len(tree_list)
    _LOG.debug('%d Trees to merge:\n%s\n' % (nTrees, '\n'.join([str(i) for i in tree_list])))
    if nTrees < 2:
        return tree_list[0]
    #tree_iter = iter(tree_list)
    #to_modify = tree_iter.next()
    to_modify = tree_list[0]
    if rooted:
        raise NotImplementedError("Rooted SCM is not implemented")
    else:
        to_modify.deroot()
    encode_splits(to_modify)
    if IS_DEBUG_LOGGING:
        assert to_modify._debug_tree_is_valid(splits=False)
    #for to_consume in tree_iter:
    for to_consume in tree_list[1:]:
        if not rooted:
            to_consume.deroot()
        encode_splits(to_consume)
        if IS_DEBUG_LOGGING:
            assert to_consume._debug_tree_is_valid(splits=True)
        add_to_scm(to_modify, to_consume, rooted, gordons_supertree=gordons_supertree)
        if IS_DEBUG_LOGGING:
            assert to_modify._debug_tree_is_valid(splits=False)
    return to_modify

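# Hedged usage sketch (added illustration, not from the original source), following the
# pattern in kernelOfTest above; the newick strings are purely illustrative and
# _example_scm is a hypothetical helper.
def _example_scm():
    d = dataio.trees_from_newick([
        "((t1,t2),(t3,(t4,t5)));",
        "((t1,t2),(t3,(t4,t6)));",
    ])
    trees = [block[0] for block in d.trees_blocks]
    merged = strict_consensus_merge(trees)  # with copy_trees=False the input list is consumed
    encode_splits(merged)
    return merged
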
def find_best_conflicting(self, starting_tree, split, dataset):
    new_starting = TreeModel(model=starting_tree.model)
    new_starting.tree = copy.deepcopy(starting_tree.tree)
    root = new_starting.tree.seed_node
    e = find_edge_from_split(root, split, root.edge.clade_mask)
    if e:
        e.collapse()
    tmp_tree_filename = ".tmp.tre"
    write_trees_to_filepath([new_starting], dataset, tmp_tree_filename)
    self.cache_settings()
    try:
        tmp_constrain_filename = ".tmpconstrain.tre"
        self.constraintfile = tmp_constrain_filename
        f = open(tmp_constrain_filename, "w")
        f.write("-%s\n" % split_as_string_rev(split, self.curr_n_taxa, '.', '*'))
        f.close()
        self.ofprefix = "negconst%d" % (split)
        # it seems a little odd to call this incompletetreefname rather
        # than streefname, but we'd like to trigger the interactive mode,
        # and this is one way of doing that.
        self.incompletetreefname = tmp_tree_filename
        self.runmode = GARLI_ENUM.INCR_RUNMODE  # NORMAL_RUNMODE
        self.topoweight = self.negcon_topoweight
        self.modweight = self.negcon_modweight
        self.stopgen = self.negcon_stopgen
        invoc = []
        if new_starting.model:
            invoc.append("model = %s" % str(new_starting.model))
        invoc.append("run")
        self.run(invoc, terminate_run=True)
    finally:
        self.restore_settings()
    err_lines = self.stderrThread.lines_between_prompt()
    r = self.parse_igarli_lines(err_lines, dataset)
    r.sort(reverse=True)
    for tm in r:
        encode_splits(tm.tree)
        assert split not in tm.tree.split_edges
    return r

def testSplits(self):
    unrooted = True
    for tc in test_cases:
        tree_filepaths = [dendropy.tests.data_source_path(tc[0])]
        taxa_filepath = dendropy.tests.data_source_path(tc[1])
        paup_sd = paup.get_split_distribution(tree_filepaths,
                                              taxa_filepath,
                                              unrooted=unrooted,
                                              burnin=0)
        taxa_block = paup_sd.taxa_block
        dp_sd = splits.SplitDistribution(taxa_block=taxa_block)
        dp_sd.ignore_edge_lengths = True
        dp_sd.ignore_node_ages = True
        dp_sd.unrooted = unrooted
        taxa_mask = taxa_block.all_taxa_bitmask()
        taxa_block.lock()
        for tree_filepath in tree_filepaths:
            for tree in nexus.iterate_over_trees(open(tree_filepath, "rU"), taxa_block=taxa_block):
                #_LOG.debug("tree = %s" % str(tree))
                splits.encode_splits(tree)
                dp_sd.count_splits_on_tree(tree)
        self.assertEqual(dp_sd.total_trees_counted, paup_sd.total_trees_counted)
        # SplitDistribution counts trivial splits, whereas PAUP*
        # contree does not, so the following will not work:
        # assert len(dp_sd.splits) == len(paup_sd.splits),\
        #        "dp = %d, sd = %d" % (len(dp_sd.splits), len(paup_sd.splits))
        taxa_mask = taxa_block.all_taxa_bitmask()
        for split in dp_sd.splits:
            if not splits.is_trivial_split(split, taxa_mask):
                self.assertTrue(split in paup_sd.splits)
                self.assertEqual(dp_sd.split_counts[split], paup_sd.split_counts[split])
                paup_sd.splits.remove(split)
        # if any splits remain here, they were not in dp_sd
        assert len(paup_sd.splits) == 0

def testReroot(self):
    newick = "((t5,t6),((t4,(t2,t1)),t3));"
    d = dataio.trees_from_newick([newick])
    tree = d.trees_blocks[0][0]
    taxa_block = d.taxa_blocks[0]
    ref = dataio.trees_from_newick([newick], taxa_block=taxa_block).trees_blocks[0][0]
    encode_splits(ref)
    o_newick = "((t2, t1),((t4,(t5,t6)),t3));"
    o_tree = dataio.trees_from_newick([o_newick], taxa_block=taxa_block).trees_blocks[0][0]
    encode_splits(o_tree)
    self.assertEqual(symmetric_difference(o_tree, ref), 2)
    taxa_labels = ["t%d" % i for i in xrange(1, 7)]
    for leaf_name in taxa_labels:
        f = lambda x: x.label == leaf_name
        nd = tree.find_taxon_node(f)
        tree.to_outgroup_position(nd)
        r_newick = str(tree)
        r_tree = dataio.trees_from_newick([r_newick], taxa_block=taxa_block).trees_blocks[0][0]
        encode_splits(r_tree)
        self.assertEqual(symmetric_difference(r_tree, ref), 0)

def gather_neighborhood_commands(tree_list):
    dim = len(tree_list)
    mat = [[0]*dim for i in range(dim)]
    _LOG.debug("tree_list = %s\n" % str(tree_list))
    for row_n, i in enumerate(tree_list[:-1]):
        offset = 1 + row_n
        row_splits = tree_list[row_n].splits
        _LOG.debug("row_splits = %s\n" % str(row_splits))
        row = mat[row_n]
        for col_disp, j in enumerate(tree_list[offset:]):
            col_n = offset + col_disp
            col_splits = tree_list[col_n].splits
            _LOG.debug("col_splits = %s\n" % str(col_splits))
            d = len(row_splits.symmetric_difference(col_splits))
            _LOG.debug("d = %s\n" % str(d))
            row[col_n] = d
            mat[col_n][row_n] = d
    connected_indices = connected_at(mat, 4)
    _LOG.debug("connected_indices = %s\n" % str(connected_indices))
    sc_tr_commands_list = []
    for group_indices in connected_indices:
        trees = [tree_list[i] for i in group_indices]
        first_tree = trees[0]
        cmd_list = ["model = %s\ntree = %s\n" % (first_tree.model, first_tree.tree_string)]
        sc_tr_commands = ScoreConstraintCommands(first_tree.score, None, cmd_list)
        cmd_list.append("clearconstraints = 1\n")
        if len(trees) > 1:
            trees.sort(reverse=True, cmp=lambda x, y: cmp(x.score, y.score))
            c = copy.deepcopy(first_tree.tree)
            encode_splits(c)
            e = find_edge_from_split(c.seed_node, last_split)
            edge_dist = 3
            e.head_node.collapse_neighborhood(edge_dist)
            c.splits = get_norm_nontrivial_split_set(c)
            si = set(c.splits)
            for t in trees[1:]:
                si.intersection_update(t.splits)
            if len(si):
                sd = SplitDistribution(taxa_block=taxa_block, split_set=si)
                ts = TreeSummarizer()
                sc = ts.tree_from_splits(sd, min_freq=None, include_edge_lengths=False)
                encode_splits(sc)
                sc.splits = si
                sc_tr_commands.constr_splits = si
                cmd_list.append("posconstraint = %s\n" % sc.compose_newick(edge_lengths=False))
        else:
            c = copy.deepcopy(first_tree.tree)
            e = find_edge_from_split(c.seed_node, last_split)
            edge_dist = 3
            e.head_node.collapse_neighborhood(edge_dist)
            encode_splits(c)
            sc_tr_commands.constr_splits = get_norm_nontrivial_split_set(c)
            if len(sc_tr_commands.constr_splits) > 0:
                cmd_list.append("posconstraint = %s\n" % c.compose_newick(edge_lengths=False))
        cmd_list.append("run\n")
        sc_tr_commands_list.append(sc_tr_commands)
    return sc_tr_commands_list

def count_splits_on_trees(self, tree_iterator, split_distribution=None, trees_splits_encoded=False):
    """
    Iterates over the trees yielded by `tree_iterator` and returns a
    SplitDistribution object (a new one, or the one passed as
    `split_distribution`) collating the split data from those trees.
    """
    if split_distribution is None:
        split_distribution = splits.SplitDistribution()
    taxa_block = split_distribution.taxa_block
    for tree_idx, tree in enumerate(tree_iterator):
        self.total_trees_counted += 1
        if taxa_block is None:
            assert (split_distribution.taxa_block is None)
            split_distribution.taxa_block = tree.taxa_block
            taxa_block = tree.taxa_block
        else:
            assert (taxa_block is tree.taxa_block)
        if not trees_splits_encoded:
            splits.encode_splits(tree)
        split_distribution.count_splits_on_tree(tree)
    return split_distribution

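# Hedged usage sketch (added illustration, not from the original source), combining this
# method with the NEXUS iteration pattern used in testSplits above; _example_count_splits
# is a hypothetical helper and assumes the method lives on treesum.TreeSummarizer.
def _example_count_splits(tree_filepaths, taxa_block):
    tsum = treesum.TreeSummarizer()
    sd = splits.SplitDistribution(taxa_block=taxa_block)
    for fp in tree_filepaths:
        tsum.count_splits_on_trees(nexus.iterate_over_trees(open(fp, "rU"), taxa_block=taxa_block),
                                   split_distribution=sd)
    return sd
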
def main():
    """
    Main CLI handler.
    """
    parser = OptionParser(usage=_prog_usage,
                          add_help_option=True,
                          version=_prog_version,
                          description=_prog_description)
    parser.add_option('-d', '--database',
                      action='store',
                      dest='db_uri',
                      type='string',  # also 'float', 'string' etc.
                      default=None,
                      metavar='URI',
                      help='[MANDATORY] database URI (e.g. "postgres://*****:*****@localhost/demodb")')
    parser.add_option('-q', '--quiet',
                      action='store_true',
                      dest='quiet',
                      default=False,
                      help='suppress progress messages')
    parser.add_option('-e', '--echo',
                      action='store_true',
                      dest='echo',
                      default=False,
                      help='echo database communications')
    (opts, args) = parser.parse_args()

    if opts.db_uri is None:
        sys.stderr.write('Database URI needs to be specified ("-d" flag; see "--help").\n')
        sys.exit(1)
    if len(args) == 0:
        sys.stderr.write("Tree file(s) not specified.\n")
        sys.exit(1)

    src_fpaths = []
    for a in args:
        f = os.path.expandvars(os.path.expanduser(a))
        # src_fpaths.append(f)
        if not os.path.exists(f):
            sys.stderr.write('File not found: "%s"\n' % f)
            sys.exit(1)
        elif not os.path.isfile(f):
            sys.stderr.write('Directory specified instead of file: "%s"\n' % f)
            sys.exit(1)
        else:
            src_fpaths.append(f)

    for f in src_fpaths:

        ## initial read ##
        if not opts.quiet:
            sys.stderr.write("Pre-import parse ...\n")
        ds1 = datasets.Dataset()
        ds1.read(open(f, "rU"), "nexml")
        tree_list = []
        for trees_block in ds1.trees_blocks:
            for tree in trees_block:
                tree_list.append(tree)

        ## import ##
        cmd = ["python", "biosql-insert.py", '-d %s' % opts.db_uri, '-b %s' % TEST_BIODB]
        if opts.quiet:
            cmd.append("-q")
        if opts.echo:
            cmd.append("-e")
        cmd.append(f)
        cmd = " ".join(cmd)
        if not opts.quiet:
            sys.stderr.write("Executing import: %s\n" % cmd)
        input_p = subprocess.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = input_p.communicate()
        if input_p.returncode:
            sys.stderr.write('*** IMPORT ERROR ***\n')
            sys.stderr.write(stderr)
            sys.exit(1)
        names = stdout.split("\n")
        for idx, name in enumerate(names):
            if name:
                tree_list[idx].name = name

        for idx, model_tree in enumerate(tree_list):

            ## export ##
            cmd = ["python", "biosql-gettree.py", '-d %s' % opts.db_uri, '-b %s' % TEST_BIODB]
            if opts.quiet:
                cmd.append("-q")
            if opts.echo:
                cmd.append("-e")
            cmd.append(model_tree.name)  # fetch this tree by the name assigned during import
            cmd = " ".join(cmd)
            if not opts.quiet:
                sys.stderr.write("Executing export: %s\n" % cmd)
            export_p = subprocess.Popen([cmd], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = export_p.communicate()
            if export_p.returncode:
                sys.stderr.write('*** EXPORT ERROR ***\n')
                sys.stderr.write(stderr)
                sys.exit(1)
            ds2 = datasets.Dataset()
            result_tree = ds2.trees_from_string(stdout, "nexml")[0]

            ## compare ##
            if not opts.quiet:
                sys.stderr.write("Comparing splits ...\n")
            taxa_block = model_tree.taxa_block
            result_tree.normalize_taxa(taxa_block)
            assert model_tree.taxa_block is result_tree.taxa_block
            splits.encode_splits(model_tree)
            splits.encode_splits(result_tree)
            sd = treedists.symmetric_difference(model_tree, result_tree)
            if not opts.quiet:
                sys.stderr.write("Symmetric distance = %d\n" % sd)
            rfd = treedists.robinson_foulds_distance(model_tree, result_tree)
            if not opts.quiet:
                sys.stderr.write("Weighted Robinson-Foulds distance = %d\n" % rfd)
            if abs(rfd) < 0.0001:
                sys.stdout.write("%s (%d/%d): SUCCESS\n" % (f, idx+1, len(tree_list)))
            else:
                sys.stdout.write("%s (%d/%d): FAIL\n" % (f, idx+1, len(tree_list)))

def _do_add_taxon_incremental_step(self, full_dataset, inp_trees):
    culled = self._write_garli_input(full_dataset)
    culled_taxa = culled.taxa_blocks[0]
    self.set_active_taxa(culled_taxa)
    next_round_trees = []
    for tree_ind, tree in enumerate(inp_trees):
        tree_model_list = self.add_to_tree(tree, culled, tree_ind, self.add_tree_stopgen)
        to_save = []
        for tm in tree_model_list:
            print "Tree %d for %d taxa: %f" % (tree_ind, self.curr_n_taxa, tm.score)
            step_add_tree = tm.tree
            encode_splits(step_add_tree)
            split = 1 << (self.curr_n_taxa - 1)
            e = find_edge_from_split(step_add_tree.seed_node, split)
            assert e is not None, "Could not find split %s. Root mask is %s" % (bin(split)[2:], bin(step_add_tree.seed_node.edge.clade_mask)[2:])
            nt_list = self.check_neighborhood_after_addition(tm, e.head_node, self.first_neighborhood, culled, tree_ind)
            deeper_search_start = []
            better_tm = tm
            for nt in nt_list:
                encode_splits(nt.tree)
                if symmetric_difference(nt.tree, step_add_tree) != 0:
                    deeper_search_start.append(nt)
                elif nt.score > better_tm.score:
                    better_tm = nt
            if deeper_search_start:
                entire_neighborhood = [better_tm] + deeper_search_start
                for alt_tm in deeper_search_start:
                    e = find_edge_from_split(alt_tm.tree.seed_node, split)
                    assert e is not None, "Could not find split %s. Root mask is %s" % (bin(split)[2:], bin(alt_tm.tree.seed_node.edge.clade_mask)[2:])
                    nt_list = self.check_neighborhood_after_addition(alt_tm, e.head_node, self.first_neighborhood + self.neighborhood_incr, culled, tree_ind)
                    for nt in nt_list:
                        encode_splits(nt.tree)
                        entire_neighborhood.append(nt)
                entire_neighborhood.sort(reverse=True)
                to_add = []
                for nt in entire_neighborhood:
                    found = False
                    for x in to_add:
                        if symmetric_difference(x.tree, nt.tree) == 0:
                            found = True
                            break
                    if not found:
                        to_add.append(nt)
                to_save.extend(to_add)
            else:
                to_save.append(better_tm)
        # this is where we should evaluate which trees need to be maintained for the next round.
        next_round_trees.extend(to_save)
    next_round_trees = self.select_trees_for_next_round(culled, next_round_trees)
    del full_dataset.trees_blocks[:]
    full_dataset.trees_blocks.append([i.tree for i in next_round_trees])
    o = open("incrgarli.tre", "w")
    write_tree_file(o, next_round_trees, culled)
    o.close()
    return next_round_trees

d.read(open(data_file, "rU"), format="NEXUS")
taxa = d.taxa_blocks[0]
full_taxa_mask = taxa.all_taxa_bitmask()
for n, taxon in enumerate(taxa):
    TAXON_TO_TRANSLATE[taxon] = str(n + 1)
_LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))
assert (len(d.taxa_blocks) == 1)
characters = d.char_blocks[0]
assert (len(d.char_blocks) == 1)
assert (len(characters) == len(taxa))

inp_trees = d.read_trees(open(intree_file, "rU"), format="NEXUS")
assert (inp_trees)
current_taxon_mask = None
for tree in inp_trees:
    assert tree.taxa_block is taxa
    encode_splits(tree)
    if current_taxon_mask is None:
        current_taxon_mask = tree.seed_node.edge.clade_mask
        _LOG.debug("%s = current_taxon_mask" % bin(current_taxon_mask))
        assert ((current_taxon_mask | full_taxa_mask) == full_taxa_mask)
        toadd_taxon_mask = current_taxon_mask ^ full_taxa_mask
    else:
        assert (current_taxon_mask == tree.seed_node.edge.clade_mask)

next_toadd = lowest_bit_only(current_taxon_mask ^ full_taxa_mask)
if (next_toadd - 1) != current_taxon_mask:
    _LOG.debug("%s = next_toadd" % format_split(next_toadd, taxa=taxa))
    _LOG.debug("%s = current_taxon_mask\n(next_toadd - 1) != current_taxon_mask" % format_split(current_taxon_mask, taxa=taxa))
    sys.exit("In this version, taxa must be added to the tree in the order that they appear in the matrix")

full_taxa_mask = taxa.all_taxa_bitmask()
for n, taxon in enumerate(taxa):
    TAXON_TO_TRANSLATE[taxon] = str(n + 1)
_LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))

garli.datafname = os.path.join("data.nex")

raw_trees = full_dataset.read_trees(open(intree_file, "rU"), format="NEXUS")
assert(raw_trees)
current_taxon_mask = None

# read initial trees and verify that they have the correct set of taxa
for tree in raw_trees:
    assert tree.taxa_block is taxa
    encode_splits(tree)
    if current_taxon_mask is None:
        current_taxon_mask = tree.seed_node.edge.clade_mask
        _LOG.debug("%s = current_taxon_mask" % bin(current_taxon_mask))
        assert((current_taxon_mask | full_taxa_mask) == full_taxa_mask)
        toadd_taxon_mask = current_taxon_mask ^ full_taxa_mask
    else:
        assert(current_taxon_mask == tree.seed_node.edge.clade_mask)

next_toadd = lowest_bit_only(current_taxon_mask ^ full_taxa_mask)
if (next_toadd - 1) != current_taxon_mask:
    _LOG.debug("%s = next_toadd" % format_split(next_toadd, taxa=taxa))
    _LOG.debug("%s = current_taxon_mask\n(next_toadd - 1) != current_taxon_mask" % format_split(current_taxon_mask, taxa=taxa))
    sys.exit("In this version, taxa must be added to the tree in the order that they appear in the matrix")

inp_trees = [TreeModel(tree=i) for i in raw_trees]

def tree_from_splits(self, split_distribution, min_freq=0.5, include_edge_lengths=True):
    "Returns a consensus tree from splits in `split_distribution`."
    leaf_to_root_search = True
    taxa_block = split_distribution.taxa_block
    con_tree = treegen.star_tree(taxa_block)
    split_freqs = split_distribution.split_frequencies
    taxa_mask = taxa_block.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    if leaf_to_root_search:
        to_leaf_dict = {}
        for leaf in leaves:
            to_leaf_dict[leaf.edge.clade_mask] = leaf
    include_edge_lengths = self.support_as_labels and include_edge_lengths
    unrooted = split_distribution.unrooted

    to_try_to_add = []
    for s, f in split_freqs.iteritems():
        if (f > min_freq):
            m = s & taxa_mask
            if (m != taxa_mask) and ((m - 1) & m):
                # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c - 1) & c:
                        # not singleton (i.e., one "0")
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        to_try_to_add.append((f, k, m))
                else:
                    to_try_to_add.append((f, m, m))
    to_try_to_add.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge
    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in to_try_to_add:
        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        elif leaf_to_root_search:
            lb = splits.lowest_bit_only(split_to_add)
            one_leaf = to_leaf_dict[lb]
            parent_node = one_leaf
            while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
                parent_node = parent_node.parent_node
        else:
            parent_node = shallowest_containing_node(start_node=con_tree.seed_node,
                                                     split=split_to_add,
                                                     taxa_mask=taxa_mask)
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.
        new_node = trees.Node()
        self.map_split_support_to_node(node=new_node, split_support=freq)
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        # needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            if include_edge_lengths:
                elen = split_distribution.split_edge_lengths[split_in_dict]
                if len(elen) > 0:
                    new_edge.length = float(sum(elen)) / len(elen)
                else:
                    new_edge.length = None
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge

    ## here we add the support values and/or edge lengths for the terminal taxa ##
    for node in leaves:
        if unrooted:
            split = con_tree.split_edges.normalize_key(node.edge.clade_mask)
        else:
            split = node.edge.clade_mask
        self.map_split_support_to_node(node, 1.0)
        if include_edge_lengths:
            elen = split_distribution.split_edge_lengths.get(split, [0.0])
            if len(elen) > 0:
                node.edge.length = float(sum(elen)) / len(elen)
            else:
                node.edge.length = None
    return con_tree

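# Hedged usage sketch (added illustration, not from the original source), mirroring the
# call in gather_neighborhood_commands above; `sd` is assumed to be a SplitDistribution
# that has already had splits counted into it, and _example_majority_rule_consensus is a
# hypothetical helper.
def _example_majority_rule_consensus(sd):
    tsum = treesum.TreeSummarizer()
    # >=50% majority-rule consensus without averaged edge lengths
    return tsum.tree_from_splits(sd, min_freq=0.5, include_edge_lengths=False)
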
taxa_block = TaxaBlock([str(i+1) for i in range(n_tax)])
taxa_blocks = [taxa_block]
dataset = Dataset(taxa_blocks=taxa_blocks)

# setting this > 1.0 means that more trees are retained to the neighborhood search stage
score_diff_multiplier = 1.0

commands = []
if nbhd_tree_groups is None:
    _LOG.debug("Invocation of igarli_neighborhood.py with only one tree file -- need to set up initial neighborhood searches")
    for g in all_tree_groups:
        for el in g:
            newick_string = el.tree_string
            newick_stream = StringIO(newick_string)
            t = dataset.read_trees(newick_stream, format="newick")[0]
            encode_splits(t)
            el.tree = t
        _LOG.debug("len(g) = %d" % len(g))
        opt_tree_el = g[0]
        opt_tree = opt_tree_el.tree
        opt_tree_el.splits = get_norm_nontrivial_split_set(opt_tree)
        _LOG.debug("opt_tree_el.splits = %s" % str(opt_tree_el.splits))
        unopt_score = None
        to_preserve = [opt_tree_el]
        for el in g[1:]:
            other_tree = el.tree
            el.splits = get_norm_nontrivial_split_set(other_tree)
            if unopt_score is None and el.splits == opt_tree_el.splits:
                unopt_score = el.score
            else:
                to_preserve.append(el)

def main_cli():

    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]]"

    parser = OptionParser(usage=usage,
                          add_help_option=True,
                          version=_program_version,
                          description=description)
    parser.add_option('-r', '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")
    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    missing = False
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit("Expecting arguments indicating files that contain sampled trees")
    sampled_file_objs = [open(f, "rU") for f in sampled_filepaths]

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit("A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' % reference_tree_filepath)

    d = Dataset()
    ref_trees = d.read_trees(open(reference_tree_filepath, "rU"), schema="NEXUS")
    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert(len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits

    start_time = datetime.datetime.now()

    comments = []
    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths, core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." % (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." % (tree_source.total_trees_ignored))
    report.append("%d trees considered in total for split support assessment." % (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." % len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted." % (num_unique_splits, num_splits))
    report.append("%d unique non-trivial splits out of %d total non-trivial splits counted." % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))

    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count / n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and ((m - 1) & m):
                # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c - 1) & c:
                        # not singleton (i.e., one "0")
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq - freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)

            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)

            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node

        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.
        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        # needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges in the reference tree" % (len(ref_set)))

    print "freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD"
    for all_el, compat_el in itertools.izip(all_splits_by_freq, compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)
        print "%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn, compat_fp + compat_fn, all_fp, all_fn, all_fp + all_fn)

dataset = Dataset(taxa_blocks=taxa_blocks)

# setting this > 1.0 means that more trees are retained to the neighborhood search stage
score_diff_multiplier = 1.0

commands = []

# first we collect all of the ParsedTree objects into all_parsed_trees and we
# call encode_splits so that we can look up split info on each tree
all_tree_groups.extend(nbhd_tree_groups)
all_parsed_trees = []
for g in all_tree_groups:
    for el in g:
        newick_string = el.tree_string
        newick_stream = StringIO(newick_string)
        t = dataset.read_trees(newick_stream, format="newick")[0]
        encode_splits(t)
        el.tree = t
        all_parsed_trees.append(el)
assert (all_taxa_bitmask == all_parsed_trees[0].tree.seed_node.edge.clade_mask)

########################################
# First, we make sure that there are not duplicate topologies
# Because we reverse sort, we'll be retaining the tree with the
# best score
#####
all_parsed_trees.sort(reverse=True)
set_of_split_sets = set()
unique_topos = []
for tm in all_parsed_trees:
    add_nontriv_splits_attr(tm.tree, all_taxa_bitmask)
    tm.splits, tm.split_set = tm.tree.splits, tm.tree.split_set

def add_to_scm(to_modify, to_consume, rooted=False, gordons_supertree=False):
    """Adds the tree `to_consume` to the tree `to_modify` in a strict consensus merge
    operation. Both trees must have had encode_splits called on them."""
    assert (to_modify.taxa_block is to_consume.taxa_block)
    taxa_block = to_consume.taxa_block
    if rooted:
        raise NotImplementedError("rooted form of add_to_scm not implemented")
    to_mod_root = to_modify.seed_node
    to_mod_split = to_mod_root.edge.clade_mask
    to_consume_root = to_consume.seed_node
    to_consume_split = to_consume_root.edge.clade_mask
    leaf_intersection = to_mod_split & to_consume_split
    if IS_DEBUG_LOGGING:
        _LOG.debug("add_to_scm:\n %s\n + %s\n%s" % (str(to_modify), str(to_consume), format_split(leaf_intersection, taxa=taxa_block)))
    n_common_leaves = count_bits(leaf_intersection)
    if n_common_leaves < 2:
        _LOG.error('trees must have at least 2 common leaves')
        raise ValueError('trees must have at least 2 common leaves')
    if n_common_leaves == 2:
        # SCM with 2 leaves in common results in a polytomy
        collapse_clade(to_mod_root)
        collapse_clade(to_consume_root)
        leaves_to_steal = [c for c in to_consume_root.child_nodes() if not (leaf_intersection & c.edge.clade_mask)]
        for leaf in leaves_to_steal:
            to_mod_root.add_child(leaf)
            to_mod_root.edge.clade_mask |= leaf.edge.clade_mask
        to_modify.split_edges = {to_mod_root.edge.clade_mask: to_mod_root.edge}
        for child in to_mod_root.child_nodes():
            to_modify.split_edges[child.edge.clade_mask] = child.edge
        return

    # at least 3 leaves in common
    tmse = to_modify.split_edges

    to_mod_relevant_splits = {}
    to_consume_relevant_splits = {}
    if not rooted:
        if IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(splits=True, logger_obj=_LOG)

        reroot_on_lowest_common_index_path(to_modify, leaf_intersection)
        reroot_on_lowest_common_index_path(to_consume, leaf_intersection)

        if IS_DEBUG_LOGGING:
            to_modify.debug_check_tree(splits=True, logger_obj=_LOG)
            to_consume.debug_check_tree(splits=True, logger_obj=_LOG)

        to_mod_root = to_modify.seed_node
        assert (to_mod_root.edge.clade_mask == to_mod_split)
        to_consume_root = to_consume.seed_node
        assert (to_consume_root.edge.clade_mask == to_consume_split)

    #for s, e in tmse.iteritems():
    for s, e in tmse.items():
        s = e.clade_mask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_mod_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    #for s, e in to_consume.split_edges.iteritems():
    for s, e in to_consume.split_edges.items():
        s = e.clade_mask
        masked = s & leaf_intersection
        if masked and masked != leaf_intersection:
            e_list = to_consume_relevant_splits.setdefault(masked, [])
            e_list.append((s, e))

    # Because each of these paths radiates away from the root (none of the paths
    # cross the root), the clade_masks for deeper edges will be supersets
    # of the clade_masks for shallower nodes. Thus if we reverse sort we
    # get the edges in the order root->tip
    #for split, path in to_mod_relevant_splits.iteritems():
    for split, path in to_mod_relevant_splits.items():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)

    #for split, path in to_consume_relevant_splits.iteritems():
    for split, path in to_consume_relevant_splits.items():
        path.sort(reverse=True)
        t = [i[1] for i in path]
        del path[:]
        path.extend(t)

    if IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(splits=True, logger_obj=_LOG)

    # first we'll collapse all paths in the common leafset in to_modify that
    # are not in to_consume
    _collapse_paths_not_found(to_mod_relevant_splits, to_consume_relevant_splits, tmse)
    # Now we'll collapse all paths in the common leafset in to_consume that
    # are not in to_modify
    _collapse_paths_not_found(to_consume_relevant_splits, to_mod_relevant_splits)

    # first we'll deal with subtrees that are:
    #   - not in the leaf intersection set, and
    #   - attached to "relevant" nodes
    # We simply move these subtrees from the to_consume tree to the appropriate
    # node in to_modify
    to_steal = [i for i in to_consume_root.child_nodes() if (i.edge.clade_mask & leaf_intersection) == 0]
    for child in to_steal:
        to_mod_root.add_child(child)
        to_mod_root.edge.clade_mask |= child.edge.clade_mask

    #for masked_split, to_consume_path in to_consume_relevant_splits.iteritems():
    for masked_split, to_consume_path in to_consume_relevant_splits.items():
        to_mod_path = to_mod_relevant_splits.get(masked_split)
        if IS_DEBUG_LOGGING and to_mod_path is None:
            _LOG.debug("%s = mask" % format_split(leaf_intersection, taxa=taxa_block))
            _LOG.debug("%s = masked" % format_split(masked_split, taxa=taxa_block))
            _LOG.debug("%s = raw" % format_split(to_consume_path[-1].clade_mask, taxa=taxa_block))
            #for k, v in to_mod_relevant_splits.iteritems():
            for k, v in to_mod_relevant_splits.items():
                _LOG.debug("%s in to_mod_relevant_splits" % format_split(k, taxa=taxa_block))
        assert to_mod_path is not None
        to_mod_head = to_mod_path[-1].head_node
        to_mod_head_edge = to_mod_head.edge
        to_consume_head = to_consume_path[-1].head_node
        for child in to_consume_head.child_nodes():
            if (child.edge.clade_mask & leaf_intersection) == 0:
                # child is the root of a subtree that has no children in the leaf_intersection
                to_mod_head.add_child(child)
                to_mod_head_edge.clade_mask |= child.edge.clade_mask
        if len(to_consume_path) > 1:
            if len(to_mod_path) > 1:
                # collision
                if gordons_supertree:
                    for edge in to_mod_path[2:]:
                        p = edge.tail_node
                        c = edge.head_node
                        sibs = p.child_nodes()
                        for sib in sibs:
                            _LOG.debug("sib is %s" % (sib.compose_newick()))
                            if sib is not c:
                                if not sib.is_leaf():
                                    collapse_clade(sib)
                                    collapse_edge(sib.edge)
                        collapse_edge(p.edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            _LOG.debug("child is %s" % (child.compose_newick()))
                            if child is not avoid:
                                mid_node.add_child(child)
                                collapse_clade(child)
                                if not child.is_leaf():
                                    collapse_edge(child.edge)
                                mid_node.edge.clade_mask |= child.edge.clade_mask
                else:
                    for edge in to_mod_path[1:-1]:
                        collapse_edge(edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            if child is not avoid:
                                mid_node.add_child(child)
                                mid_node.edge.clade_mask |= child.edge.clade_mask
            else:
                # we have to move the subtrees from to_consume to to_modify
                to_mod_edge = to_mod_path[0]
                to_mod_tail, to_mod_head = to_mod_edge.tail_node, to_mod_edge.head_node
                deepest_edge_to_move = to_consume_path[0]
                deepest_node_to_move = deepest_edge_to_move.head_node
                tipmost_edge_to_move = to_consume_path[-1]
                tipmost_node_to_move = tipmost_edge_to_move.tail_node
                prev_head = tipmost_edge_to_move.head_node

                to_mod_tail.add_child(deepest_node_to_move)
                to_mod_tail.remove_child(to_mod_head)

                tipmost_node_to_move.add_child(to_mod_head)
                tipmost_node_to_move.remove_child(prev_head)
    encode_splits(to_modify)