def main_cli():
    """Command-line entry point: compare sampled-tree splits to a reference tree.

    Steps, in order:

    1. Parse options; every positional argument is a path to a file of
       sampled trees, and ``-r/--reference`` names the reference tree file.
    2. Exit (via ``sys.exit``) if any sampled file or the reference file is
       missing, if no sampled files were given, or if the reference file
       does not contain exactly one tree.
    3. Count splits over all sampled trees with a ``TreeSummarizer``.
    4. Add non-trivial splits to a star tree in order of decreasing
       frequency, keeping only those that fit the tree built so far.
    5. Write a tab-separated table to stdout with one row per distinct
       frequency level.  ``FP`` is the number of accumulated splits absent
       from the reference, ``FN`` the number of reference splits not yet
       accumulated, and ``SD`` their sum; each is reported for the
       compatible-only set and for the set of all splits.
    """
    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]"

    parser = OptionParser(usage=usage, add_help_option=True,
                          version=_program_version, description=description)
    parser.add_option('-r', '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    # `quiet` defaults to True; passing -v stores False into it.
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")
    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking
    sampled_filepaths = []
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit("Expecting arguments indicating files that contain sampled trees")
    # NOTE: the sampled files are no longer opened here.  The handles were
    # never read or closed; the tree iterator below is given the paths.

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit("A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(
        os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' % reference_tree_filepath)

    d = Dataset()
    # Mode was 'ru', which is not a documented mode string; use "rU" like the
    # other opens in this file.  The handle is closed once the trees are read.
    # NOTE(review): assumes read_trees() consumes the stream before returning
    # (len() is taken of its result immediately) -- confirm.
    ref_file = open(reference_tree_filepath, 'rU')
    try:
        ref_trees = d.read_trees(ref_file, schema="NEXUS")
    finally:
        ref_file.close()

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert(len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits
    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths,
                                        core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files."
                  % (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total."
                  % (tree_source.total_trees_ignored))
    report.append("%d trees considered in total for split support assessment."
                  % (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees."
                  % len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = \
        split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted."
                  % (num_unique_splits, num_splits))
    report.append("%d unique non-trivial splits out of %d total non-trivial splits counted."
                  % (num_nt_unique_splits, num_nt_splits))
    _LOG.debug("\n".join(report))

    # Start from a star tree and index its leaves by their single-bit mask.
    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()
    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf

    unrooted = True  # hard-coded: splits are always treated as unrooted here
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count / n_read
        if splits.is_trivial_split(split, taxa_mask):
            continue
        m = split & taxa_mask
        # skip the root (all "1's") and singletons (one "1")
        if (m == taxa_mask) or not ((m - 1) & m):
            continue
        if unrooted:
            c = (~m) & taxa_mask
            if (c - 1) & c:  # not singleton (i.e., one "0")
                # Orient the split so that the lowest bit is 0.
                if 1 & m:
                    k = c
                else:
                    k = m
                sp_list.append((freq, k, m))
        else:
            sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root_edge = con_tree.seed_node.edge

    # Splits are bucketed by frequency.  Each bucket is [freq, [splits]];
    # one list tracks every split seen, the other only those added to the tree.
    curr_freq = 1.1  # above any real frequency, so the first split opens a bucket
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended
    # majority-rule consensus tree
    for freq, split_to_add, _split_in_dict in sp_list:
        if abs(curr_freq - freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_all_splits_list = []
            all_splits_by_freq.append([freq, curr_all_splits_list])
            curr_compat_splits_list = []
            compat_splits_by_freq.append([freq, curr_compat_splits_list])
            curr_freq = freq
        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        # Walk from the leaf carrying the split's lowest bit toward the root
        # until reaching a node whose clade contains the whole split.
        lb = splits.lowest_bit_only(split_to_add)
        parent_node = to_leaf_dict[lb]
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)

        # Check to see if we have accumulated all of the bits that we
        # needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    # Reference splits, oriented the same way (lowest bit 0) as sp_list keys.
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()
    _LOG.debug("%d edges is the reference tree" % (len(ref_set)))

    # sys.stdout.write emits the same bytes as the former `print` statements
    # and parses under both Python 2 and Python 3.
    sys.stdout.write("freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD\n")
    for all_el, compat_el in itertools.izip(all_splits_by_freq, compat_splits_by_freq):
        freq = all_el[0]
        # The sets accumulate, so each row covers all splits at or above freq.
        all_set.update(all_el[1])
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_set.update(compat_el[1])
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)
        sys.stdout.write("%f\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
            freq,
            compat_fp, compat_fn, compat_fp + compat_fn,
            all_fp, all_fn, all_fp + all_fn))
def tree_from_splits(self, split_distribution, min_freq=0.5, include_edge_lengths=True):
    """Returns a consensus tree from splits in `split_distribution`.

    Starts from a star tree over `split_distribution.taxa_block` and adds,
    in order of decreasing frequency, each split whose frequency is
    strictly greater than `min_freq` and that fits the tree built so far.
    Support is attached to each new node through
    `self.map_split_support_to_node`.  Edge lengths (the mean of the
    lengths recorded for a split) are set only when both
    `self.support_as_labels` and `include_edge_lengths` are true.
    """

    def _mean_or_none(lengths):
        # Mean of the recorded lengths, or None when none were recorded.
        if len(lengths) > 0:
            return float(sum(lengths)) / len(lengths)
        return None

    leaf_to_root_search = True  # hard-coded choice of search strategy

    taxa_block = split_distribution.taxa_block
    con_tree = treegen.star_tree(taxa_block)
    split_freqs = split_distribution.split_frequencies
    taxa_mask = taxa_block.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    if leaf_to_root_search:
        # Index the leaves by their clade mask.
        to_leaf_dict = dict((leaf.edge.clade_mask, leaf) for leaf in leaves)

    include_edge_lengths = self.support_as_labels and include_edge_lengths
    unrooted = split_distribution.unrooted

    # Collect (frequency, split as keyed in the tree, split as keyed in the
    # distribution) for every candidate split.
    candidates = []
    for raw_split, support in split_freqs.iteritems():
        if not (support > min_freq):
            continue
        masked = raw_split & taxa_mask
        # skip the root (all "1's") and singletons (one "1")
        if masked == taxa_mask or not ((masked - 1) & masked):
            continue
        if not unrooted:
            candidates.append((support, masked, masked))
            continue
        complement = (~masked) & taxa_mask
        if (complement - 1) & complement:  # not singleton (i.e., one "0")
            # Orient unrooted splits so that the lowest bit is 0.
            oriented = masked
            if 1 & masked:
                oriented = complement
            candidates.append((support, oriented, masked))
    candidates = sorted(candidates, reverse=True)

    root_edge = con_tree.seed_node.edge

    # Now when we add splits in order, we will do a greedy, extended
    # majority-rule consensus tree
    for support, wanted, dist_key in candidates:
        if (wanted & root_edge.clade_mask) != wanted:
            continue
        if leaf_to_root_search:
            # Climb from the leaf carrying the lowest bit of the split until
            # a node whose clade contains the whole split is reached.
            attach_to = to_leaf_dict[splits.lowest_bit_only(wanted)]
            while (wanted & attach_to.edge.clade_mask) != wanted:
                attach_to = attach_to.parent_node
        else:
            attach_to = shallowest_containing_node(
                start_node=con_tree.seed_node,
                split=wanted,
                taxa_mask=taxa_mask)
        if attach_to is None or attach_to.edge.clade_mask == wanted:
            continue  # split is not in tree, or already in tree.

        inserted = trees.Node()
        self.map_split_support_to_node(node=inserted, split_support=support)
        to_move = []
        inserted_edge = inserted.edge
        inserted_edge.clade_mask = 0
        for kid in attach_to.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            kid_mask = kid.edge.clade_mask
            if kid_mask & wanted:
                assert kid_mask != wanted
                inserted_edge.clade_mask |= kid_mask
                to_move.append(kid)

        # Check to see if we have accumulated all of the bits that we
        # needed, but none that we don't need.
        if inserted_edge.clade_mask != wanted:
            continue
        if include_edge_lengths:
            inserted_edge.length = _mean_or_none(
                split_distribution.split_edge_lengths[dist_key])
        for kid in to_move:
            attach_to.remove_child(kid)
            inserted.add_child(kid)
        attach_to.add_child(inserted)
        con_tree.split_edges[wanted] = inserted_edge

    ## here we add the support values and/or edge lengths for the terminal taxa ##
    for tip in leaves:
        if unrooted:
            tip_split = con_tree.split_edges.normalize_key(tip.edge.clade_mask)
        else:
            tip_split = tip.edge.clade_mask
        self.map_split_support_to_node(tip, 1.0)
        if include_edge_lengths:
            tip.edge.length = _mean_or_none(
                split_distribution.split_edge_lengths.get(tip_split, [0.0]))
    return con_tree
# Script fragment: read the starting trees, check that they all cover the same
# leading subset of taxa, then hand them to garli for incremental building.
# NOTE(review): `garli`, `full_dataset`, `intree_file`, `taxa` and
# `full_taxa_mask` are defined outside this fragment.

# os.path.join with a single argument returns that argument unchanged.
garli.datafname = os.path.join("data.nex")
# NOTE(review): the file object passed to read_trees is never closed here.
raw_trees = full_dataset.read_trees(open(intree_file, "rU"), format="NEXUS")
assert(raw_trees)
current_taxon_mask = None
# read initial trees and verify that they have the correct set of taxa
for tree in raw_trees:
    assert tree.taxa_block is taxa
    encode_splits(tree)
    if current_taxon_mask is None:
        # The first tree fixes the mask that every later tree must match.
        current_taxon_mask = tree.seed_node.edge.clade_mask
        _LOG.debug("%s = current_taxon_mask" % bin(current_taxon_mask))
        # The tree's taxa must be a subset of the full taxon set.
        assert( (current_taxon_mask | full_taxa_mask) == full_taxa_mask)
        # Bits of the taxa still to be added (not read again in this fragment).
        toadd_taxon_mask = current_taxon_mask ^ full_taxa_mask
    else:
        assert(current_taxon_mask == tree.seed_node.edge.clade_mask)
# Lowest bit among the taxa not yet in the trees.  If the trees hold exactly
# the lower-indexed taxa, that bit minus one equals the current mask.
next_toadd = lowest_bit_only(current_taxon_mask^full_taxa_mask)
if (next_toadd - 1) != current_taxon_mask:
    _LOG.debug("%s = next_toadd" % format_split(next_toadd, taxa=taxa))
    _LOG.debug("%s = current_taxon_mask\n(next_toadd - 1) != current_taxon_mask" % format_split(current_taxon_mask, taxa=taxa))
    sys.exit("In this version, taxa must be added to the tree in the order that they appear in the matrix")
inp_trees = [TreeModel(tree=i) for i in raw_trees]
garli.incrementally_build_trees(full_dataset, inp_trees)
sys.exit(0)
def main_cli():
    """Command-line entry point: compare sampled-tree splits to a reference tree.

    Steps, in order:

    1. Parse options; every positional argument is a path to a file of
       sampled trees, and ``-r/--reference`` names the reference tree file.
    2. Exit (via ``sys.exit``) if any sampled file or the reference file is
       missing, if no sampled files were given, or if the reference file
       does not contain exactly one tree.
    3. Count splits over all sampled trees with a ``TreeSummarizer``.
    4. Add non-trivial splits to a star tree in order of decreasing
       frequency, keeping only those that fit the tree built so far.
    5. Write a tab-separated table to stdout with one row per distinct
       frequency level.  ``FP`` is the number of accumulated splits absent
       from the reference, ``FN`` the number of reference splits not yet
       accumulated, and ``SD`` their sum; each is reported for the
       compatible-only set and for the set of all splits.
    """
    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]"

    parser = OptionParser(usage=usage, add_help_option=True,
                          version=_program_version, description=description)
    parser.add_option('-r', '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    # `quiet` defaults to True; passing -v stores False into it.
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")
    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking
    sampled_filepaths = []
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit(
            "Expecting arguments indicating files that contain sampled trees")
    # NOTE: the sampled files are no longer opened here.  The handles were
    # never read or closed; the tree iterator below is given the paths.

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit(
            "A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(
        os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' %
                 reference_tree_filepath)

    d = Dataset()
    # Mode was 'ru', which is not a documented mode string; use "rU" like the
    # other opens in this file.  The handle is closed once the trees are read.
    # NOTE(review): assumes read_trees() consumes the stream before returning
    # (len() is taken of its result immediately) -- confirm.
    ref_file = open(reference_tree_filepath, 'rU')
    try:
        ref_trees = d.read_trees(ref_file, schema="NEXUS")
    finally:
        ref_file.close()

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert (len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits
    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths,
                                        core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." %
                  (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." %
                  (tree_source.total_trees_ignored))
    report.append(
        "%d trees considered in total for split support assessment." %
        (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." %
                  len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = \
        split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted." %
                  (num_unique_splits, num_splits))
    report.append(
        "%d unique non-trivial splits out of %d total non-trivial splits counted."
        % (num_nt_unique_splits, num_nt_splits))
    _LOG.debug("\n".join(report))

    # Start from a star tree and index its leaves by their single-bit mask.
    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()
    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf

    unrooted = True  # hard-coded: splits are always treated as unrooted here
    n_read = float(tsum.total_trees_read)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count / n_read
        if splits.is_trivial_split(split, taxa_mask):
            continue
        m = split & taxa_mask
        # skip the root (all "1's") and singletons (one "1")
        if (m == taxa_mask) or not ((m - 1) & m):
            continue
        if unrooted:
            c = (~m) & taxa_mask
            if (c - 1) & c:  # not singleton (i.e., one "0")
                # Orient the split so that the lowest bit is 0.
                if 1 & m:
                    k = c
                else:
                    k = m
                sp_list.append((freq, k, m))
        else:
            sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root_edge = con_tree.seed_node.edge

    # Splits are bucketed by frequency.  Each bucket is [freq, [splits]];
    # one list tracks every split seen, the other only those added to the tree.
    curr_freq = 1.1  # above any real frequency, so the first split opens a bucket
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended
    # majority-rule consensus tree
    for freq, split_to_add, _split_in_dict in sp_list:
        if abs(curr_freq - freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_all_splits_list = []
            all_splits_by_freq.append([freq, curr_all_splits_list])
            curr_compat_splits_list = []
            compat_splits_by_freq.append([freq, curr_compat_splits_list])
            curr_freq = freq
        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        # Walk from the leaf carrying the split's lowest bit toward the root
        # until reaching a node whose clade contains the whole split.
        lb = splits.lowest_bit_only(split_to_add)
        parent_node = to_leaf_dict[lb]
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)

        # Check to see if we have accumulated all of the bits that we
        # needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    # Reference splits, oriented the same way (lowest bit 0) as sp_list keys.
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()
    _LOG.debug("%d edges is the reference tree" % (len(ref_set)))

    # sys.stdout.write emits the same bytes as the former `print` statements
    # and parses under both Python 2 and Python 3.
    sys.stdout.write("freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD\n")
    for all_el, compat_el in itertools.izip(all_splits_by_freq,
                                            compat_splits_by_freq):
        freq = all_el[0]
        # The sets accumulate, so each row covers all splits at or above freq.
        all_set.update(all_el[1])
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_set.update(compat_el[1])
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)
        sys.stdout.write("%f\t%d\t%d\t%d\t%d\t%d\t%d\n" % (
            freq,
            compat_fp, compat_fn, compat_fp + compat_fn,
            all_fp, all_fn, all_fp + all_fn))
# Script fragment: read the starting trees, check that they all cover the same
# leading subset of taxa, then call addToTree on each of them.
# NOTE(review): `d`, `characters`, `taxa`, `intree_file`, `full_taxa_mask`,
# `conf` and `data_file` are defined outside this fragment.
assert (len(d.char_blocks) == 1)
assert (len(characters) == len(taxa))
# NOTE(review): the file object passed to read_trees is never closed here.
inp_trees = d.read_trees(open(intree_file, "rU"), format="NEXUS")
assert (inp_trees)
current_taxon_mask = None
for tree in inp_trees:
    assert tree.taxa_block is taxa
    encode_splits(tree)
    if current_taxon_mask is None:
        # The first tree fixes the mask that every later tree must match.
        current_taxon_mask = tree.seed_node.edge.clade_mask
        _LOG.debug("%s = current_taxon_mask" % bin(current_taxon_mask))
        # The tree's taxa must be a subset of the full taxon set.
        assert ((current_taxon_mask | full_taxa_mask) == full_taxa_mask)
        # Bits of the taxa still to be added (not read again in this fragment).
        toadd_taxon_mask = current_taxon_mask ^ full_taxa_mask
    else:
        assert (current_taxon_mask == tree.seed_node.edge.clade_mask)
# Lowest bit among the taxa not yet in the trees.  If the trees hold exactly
# the lower-indexed taxa, that bit minus one equals the current mask.
next_toadd = lowest_bit_only(current_taxon_mask ^ full_taxa_mask)
if (next_toadd - 1) != current_taxon_mask:
    _LOG.debug("%s = next_toadd" % format_split(next_toadd, taxa=taxa))
    _LOG.debug(
        "%s = current_taxon_mask\n(next_toadd - 1) != current_taxon_mask"
        % format_split(current_taxon_mask, taxa=taxa))
    sys.exit(
        "In this version, taxa must be added to the tree in the order that they appear in the matrix"
    )
conf["datafname"] = data_file
for tree in inp_trees:
    # Each result overwrites the previous one; only the last is still bound
    # when the script exits.
    trees = addToTree(tree, conf, d)
sys.exit(0)
###################################################
def reroot_on_lowest_common_index_path(t, common_mask):
    """Reroot `t` in place on the path to its lowest-index common taxon.

    This operation is only for unrooted trees that are being merged using
    SCM.  The path that separates the lowest index taxon in the leaf set
    intersection is placed as the first descendant path of the "seed_node"
    for the tree.  This assures that all split representations are oriented
    in the same way for subsequent operations.

    `common_mask` is the bitmask of the leaf set intersection.  It must
    contain more than 2 bits (there must be an internal node in the tree
    that has degree > 2 wrt the common leafset).

    Returns None.  Raises AssertionError if a scan over child nodes finds
    no child carrying any of the expected bits, a state the algorithm
    treats as unreachable.
    """
    l = lowest_bit_only(common_mask)
    assert (l > 0)
    assert (count_bits(common_mask) > 2)

    # start at the lowest leaf in common.
    curr_n = t.split_edges[l].head_node
    # walk back toward the root until we find a node that has another bit
    p = curr_n.parent_node
    while p:
        if (p.edge.clade_mask & common_mask) != l:
            break
        curr_n = p
        p = curr_n.parent_node

    without_lowest = common_mask ^ l

    if (curr_n.edge.clade_mask & common_mask) == l:
        # we did not make it to the root.  Make curr_n, the first_child of
        # the root
        t.to_outgroup_position(curr_n, splits=True, delete_deg_two=True)
        avoid = curr_n
        candidates = t.seed_node.child_nodes()
        while True:
            # Find the first child, other than `avoid`, that carries any of
            # the remaining common bits.
            for curr_n in candidates:
                if curr_n is avoid:
                    continue
                cm = curr_n.edge.clade_mask & without_lowest
                if cm:
                    break
            else:
                # The previous form nested a `for` inside `while True` under
                # `except StopIteration`.  A `for` loop never propagates
                # StopIteration, so running out of children would have
                # looped forever; fail loudly instead.
                raise AssertionError("Reaching supposedly unreachable code")
            if cm != without_lowest:
                # The remaining common bits are spread over several
                # children, so the current root is already in place.
                return
            # Every remaining common bit lies below curr_n: make it the new
            # root and scan its children next, skipping the old root.
            r = t.seed_node
            assert curr_n.parent_node is r
            t.reroot_at(curr_n, splits=True, delete_deg_two=True)
            t.to_outgroup_position(r, splits=True, delete_deg_two=True)
            candidates = curr_n.child_nodes()
            avoid = r

    # we hit the root, now we walk up the tree, to find the a relevant
    # internal
    lowest_on_path_to_l = curr_n
    while True:
        children = curr_n.child_nodes()
        assert (len(children) > 1)
        # First child carrying any common bit.
        for c in children:
            masked_cm = c.edge.clade_mask & common_mask
            if masked_cm:
                break
        else:
            raise AssertionError("Reaching supposedly unreachable code")
        if masked_cm != without_lowest:
            break
        # All of the other common leaves sit below c; keep descending.
        curr_n = c

    if curr_n is not t.seed_node:
        # We have found the first relevant internal node, we want to make it
        # the root.  We can do this by putting one of its children into the
        # "outgroup position" and then putting the path to lowest common
        # leaf in the outgroup position (this last operation is just a
        # rearrangement of the order of children in the root.
        children = curr_n.child_nodes()
        assert (len(children) > 1)
        # Was `curr_n.parent`; every other parent lookup in this function
        # uses `parent_node`.
        p = curr_n.parent_node
        t.to_outgroup_position(children[0], splits=True, delete_deg_two=True)
        t.to_outgroup_position(p, splits=True, delete_deg_two=True)
    else:
        # if the root is the first relevant node, then we just make the path
        # leading to the lowest index node the first child of the root
        t.to_outgroup_position(lowest_on_path_to_l, splits=True,
                               delete_deg_two=True)