예제 #1
0
    def testRerootSplits(self):
        """Reroot a tree at the second child of its root and check the masks.

        After ``reroot_at(..., splits=True, delete_deg_two=False)`` the test
        expects that:

        * the new root edge carries the clade mask the old root edge had,
        * the chosen child is now the seed node and the parent of the old
          root, and
        * the old root's edge mask equals the chosen child's former mask,
          either directly or after complementing it within the root mask.
        """
        newick = "((Athrotaxi,(Callitris,(Juniperusc,Libocedrus))),(((((((Basichlsac,(Mougeotisp,Lamprothma)),Thuidium),(Petalaphy,Haplomitr2)),((Botrychbit,(Vittarifle,((Dicksonant,((Polypodapp,Oleandrapi),Dennstasam)),Azollacaro))),Angiopteri)),Isoetesmel),((Sagittari,(Calochort,(Tacca,(Calathea,Ravenala)))),((Nelumbo,((((((Verbena,((Thunbergi,Acanthus),(Proboscid,Harpogoph))),Asclepias),Menyanthe),(Phyllonom,(Chamaedap,Pyrola))),((((Mirabilus,Pisum),Circaea),((Rheinward,Octomeles),Greyia)),Dudleya)),Phoradend)),(((Liriodchi,Annona),Gyrocarpu),Illicium)))),(Pseudotsu,(Agathisova,Agathismac))));"
        d = dataio.trees_from_newick([newick])
        tree = d.trees_blocks[0][0]
        taxa_block = d.taxa_blocks[0]
        # `ref` is not compared below; it is kept so that the second parse and
        # split encoding against the shared taxa block are still exercised.
        ref = dataio.trees_from_newick(
            [newick], taxa_block=taxa_block).trees_blocks[0][0]
        encode_splits(tree)
        encode_splits(ref)
        old_root = tree.seed_node
        target = old_root.child_nodes()[1]
        former_mask = target.edge.clade_mask
        root_mask = old_root.edge.clade_mask

        tree.reroot_at(target, splits=True, delete_deg_two=False)

        new_root = tree.seed_node
        self.assertEqual(root_mask, new_root.edge.clade_mask)
        self.assertTrue(new_root is target)
        self.assertTrue(old_root.parent_node is target)
        # The old root now hangs below `target`, so its edge may describe the
        # same bipartition from the other side: accept the complement too.
        flipped = (~(old_root.edge.clade_mask)) & root_mask
        self.assertTrue(former_mask == old_root.edge.clade_mask
                        or flipped == former_mask)
예제 #2
0
    def testChangeTranslate(self):
        f = """#NEXUS
Begin taxa ;
    dimensions ntax = 4;
    taxlabels a b c d ;
end;
begin trees;
    translate 
        1 a,
        2 b,
        3 c,
        4 d;
    tree t = (1,2,(3,4));
end;
begin trees;
    translate 
        1 d,
        2 b,
        3 c,
        4 a;
    tree t = (4,2,(3,1));
end;
"""
        d = Dataset()
        d.read(StringIO(f), format="NEXUS")
        t = d.trees_blocks[0][0]
        s = d.trees_blocks[1][0]
        self.assertEqual(t.taxa_block, s.taxa_block)
        encode_splits(s)
        encode_splits(t)
        self.assertEqual(treedists.symmetric_difference(t, s), 0)
예제 #3
0
    def testEuclideanDist(self):
        """Compare pairwise Euclidean tree distances against stored values.

        Four Newick trees are parsed (one per trees block), their splits are
        encoded, and every pair is checked with ``assert_approx_equal``.
        """
        newick_strings = [
            "((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);",
            "((t5:2.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):1):0.065840,t3:0.170221):0.383247);",
            "((t5:0.161175,t6:0.161175):0.392293,((t2:0.075411,(t4:0.104381,t1:0.075411):1):0.065840,t3:0.170221):0.383247);",
            "((t5:0.161175,t6:0.161175):0.392293,((t4:0.104381,(t2:0.075411,t1:0.075411):0.028969):0.065840,t3:0.170221):0.383247);",
        ]
        dataset = dataio.trees_from_newick(newick_strings)
        trees = [block[0] for block in dataset.trees_blocks]
        for tree in trees:
            encode_splits(tree)

        # (index of first tree, index of second tree, expected distance),
        # listed in the order the comparisons are made.
        expected_distances = (
            (0, 1, 2.0),
            (0, 2, math.sqrt(2.0)),
            (0, 3, 0.97103099999999998),
            (1, 2, math.sqrt(6.0)),
            (1, 3, 2.2232636377544162),
            (2, 3, 1.000419513484718),
        )
        for first, second, expected in expected_distances:
            assert_approx_equal(
                treedists.euclidean_distance(trees[first], trees[second]),
                expected)
예제 #4
0
 def testCollapseClade(self):
     """Collapse clades one at a time and compare the resulting tree strings.

     Starting from ``(t5,t6,((t4,(t2,t1)),t3))`` the test calls
     ``collapse_clade`` on the first child of the root (the tree string is
     expected to stay the same), then on two nested nodes of the third
     child, and finally on the root.  After every step the tree is
     validated with ``debug_check_tree`` and ``str(tree)`` is compared with
     the expected string.  A second tree with a two-child root is then
     collapsed at its root.
     """
     tree = dataio.trees_from_newick(["(t5,t6,((t4,(t2,t1)),t3));"
                                      ]).trees_blocks[0][0]
     encode_splits(tree)
     root = tree.seed_node
     # Taken once, before any collapse; later lookups index into this list.
     root_children = root.child_nodes()

     first_child = root_children[0]
     collapse_clade(first_child)
     tree.debug_check_tree(splits=True)
     self.assertEqual(str(tree), "(t5,t6,((t4,(t2,t1)),t3))")

     third_child = root_children[2]
     inner_clade = third_child.child_nodes()[0]
     collapse_clade(inner_clade)
     tree.debug_check_tree(logger_obj=_LOG)
     self.assertEqual(str(tree), "(t5,t6,((t4,t2,t1),t3))")

     collapse_clade(third_child)
     tree.debug_check_tree(logger_obj=_LOG)
     self.assertEqual(str(tree), "(t5,t6,(t4,t2,t1,t3))")

     collapse_clade(root)
     # The check used to be issued twice in a row here; once is enough.
     tree.debug_check_tree(logger_obj=_LOG)
     self.assertEqual(str(tree), "(t5,t6,t4,t2,t1,t3)")

     tree = dataio.trees_from_newick(["((t5,t6),((t4,(t2,t1)),t3));"
                                      ]).trees_blocks[0][0]
     root = tree.seed_node
     collapse_clade(root)
     tree.debug_check_tree(logger_obj=_LOG)
     self.assertEqual(str(tree), "(t5,t6,t4,t2,t1,t3)")
예제 #5
0
 def kernelOfTest(self, trees):
     """Merge all but the last tree and compare the result with the last one.

     `trees` is a sequence whose final element is the expected strict
     consensus merger of the preceding elements.  The test fails, showing
     both trees, when the symmetric difference between the merged tree and
     the expected tree is not 0.
     """
     expected = trees[-1]
     # Named `to_merge` rather than `input` so the builtin is not shadowed.
     to_merge = trees[:-1]
     output = strict_consensus_merge(to_merge)
     encode_splits(output)
     encode_splits(expected)
     if symmetric_difference(expected, output) != 0:
         self.fail("\n%s\n!=\n%s" % (str(output), str(expected)))
예제 #6
0
 def map_split_support_to_tree(self, tree, split_distribution):
     """Pass a support value for every split edge of `tree` to the node mapper.

     The tree is normalized onto the taxa block of `split_distribution` and
     its splits are encoded.  For each split in ``tree.split_edges`` the
     value stored in ``split_distribution.split_frequencies`` (0.0 when the
     split is not present there) is handed, together with the head node of
     the split's edge, to ``self.map_split_support_to_node``.

     Returns the same `tree` object.
     """
     frequencies = split_distribution.split_frequencies
     tree.normalize_taxa(taxa_block=split_distribution.taxa_block)
     assert tree.taxa_block is split_distribution.taxa_block
     splits.encode_splits(tree)
     for split in tree.split_edges:
         support = frequencies[split] if split in frequencies else 0.0
         head = tree.split_edges[split].head_node
         self.map_split_support_to_node(head, support)
     return tree
예제 #7
0
    def testSymmDiff(self):
        """Check that two fixed six-taxon trees have symmetric difference 2.

        The second tree is parsed against the taxa block of the first so
        that both use the same taxon objects.
        """
        ref_dataset = dataio.trees_from_newick(["((t5,t6),((t4,(t2,t1)),t3));"])
        ref = ref_dataset.trees_blocks[0][0]
        shared_taxa = ref_dataset.taxa_blocks[0]
        encode_splits(ref)

        other_dataset = dataio.trees_from_newick(
            ["((t1,t2),((t4,(t5,t6)),t3));"], taxa_block=shared_taxa)
        other = other_dataset.trees_blocks[0][0]
        encode_splits(other)

        distance = treedists.symmetric_difference(other, ref)
        self.assertEqual(distance, 2)
예제 #8
0
 def check_tree(self, tree_str):
     """Send a tree to the external program and compare what comes back.

     `tree_str` is parsed as Newick and converted with ``to_parent_array``;
     the resulting items are joined with spaces and appended to
     ``self.prog_path`` to form the command line.  The program must exit
     with status 0, and the Newick tree it prints must have a symmetric
     difference of 0 from the original tree.
     """
     dataset = datasets.Dataset()
     original = dataset.trees_from_string(tree_str, format="newick")[0]
     # The second returned value (edge lengths) is not used here.
     parent_array, _edge_lens = to_parent_array(original, True, False)
     _LOG.info('Original tree: %s' % tree_str)
     arguments = " ".join(parent_array)
     command = self.prog_path + " " + arguments
     stdout, stderr, returncode = run_program(command)
     assert returncode == 0, "Program exited with error:\n%s" % stderr
     _LOG.info('Returned tree: %s' % stdout)
     returned = dataset.trees_from_string(stdout, format="newick")[0]
     for tree in (original, returned):
         splits.encode_splits(tree)
     distance = treedists.symmetric_difference(original, returned)
     assert distance == 0, "Symmetric distance = %d:\n%s;\n%s;" % (distance, tree_str, stdout)
예제 #9
0
 def score_tree_list(self, full_dataset, inp_trees, stop_gen):
     """Score every input tree and store the sorted results in the dataset.

     The data written by ``self._write_garli_input(full_dataset)`` supplies
     the active taxa.  Each tree in `inp_trees` is scored with
     ``self.score_tree`` (passing `stop_gen` through) and the splits of the
     returned object's tree are encoded.  The results are sorted in
     descending order using their own comparison, the trees blocks of
     `full_dataset` are replaced by a single block holding the sorted
     trees, and the results are written to ``incrgarli.tre``.

     Returns the sorted list of scored results.
     """
     culled = self._write_garli_input(full_dataset)
     culled_taxa = culled.taxa_blocks[0]
     self.set_active_taxa(culled_taxa)
     rescored = []
     for tree_ind, tree in enumerate(inp_trees):
         tm = self.score_tree(tree, culled, tree_ind, stop_gen=stop_gen)
         encode_splits(tm.tree)
         rescored.append(tm)
     rescored.sort(reverse=True)
     del full_dataset.trees_blocks[:]
     full_dataset.trees_blocks.append([i.tree for i in rescored])
     o = open("incrgarli.tre", "w")
     try:
         write_tree_file(o, rescored, culled)
     finally:
         # Close the handle even when writing fails part-way through.
         o.close()
     return rescored
예제 #10
0
 def score_tree_list(self, full_dataset, inp_trees, stop_gen):
     """Score every input tree and store the sorted results in the dataset.

     The data written by ``self._write_garli_input(full_dataset)`` supplies
     the active taxa.  Each tree in `inp_trees` is scored with
     ``self.score_tree`` (passing `stop_gen` through) and the splits of the
     returned object's tree are encoded.  The results are sorted in
     descending order using their own comparison, the trees blocks of
     `full_dataset` are replaced by a single block holding the sorted
     trees, and the results are written to ``incrgarli.tre``.

     Returns the sorted list of scored results.
     """
     culled = self._write_garli_input(full_dataset)
     culled_taxa = culled.taxa_blocks[0]
     self.set_active_taxa(culled_taxa)
     rescored = []
     for tree_ind, tree in enumerate(inp_trees):
         tm = self.score_tree(tree, culled, tree_ind, stop_gen=stop_gen)
         encode_splits(tm.tree)
         rescored.append(tm)
     rescored.sort(reverse=True)
     del full_dataset.trees_blocks[:]
     full_dataset.trees_blocks.append([i.tree for i in rescored])
     o = open("incrgarli.tre", "w")
     try:
         write_tree_file(o, rescored, culled)
     finally:
         # Close the handle even when writing fails part-way through.
         o.close()
     return rescored
예제 #11
0
 def testRandomlyReorient(self):
     """Reorient a tree at random 50 times and check it against a fixed copy.

     Two trees are parsed from the same Newick string.  After every call to
     ``randomly_reorient_tree`` the changed tree must no longer print as
     the original string, must pass ``debug_check_tree`` with splits, and
     must have a symmetric difference of 0 from the untouched copy.
     """
     newick = '(Basichlsac,(Lamprothma,Mougeotisp),(((Haplomitr2,Petalaphy),((Angiopteri,(((Azollacaro,((Dennstasam,(Oleandrapi,Polypodapp)),Dicksonant)),Vittarifle),Botrychbit)),(Isoetesmel,((((Agathismac,Agathisova),Pseudotsu),(((Libocedrus,Juniperusc),Callitris),Athrotaxi)),((Liriodchi,Nelumbo),Sagittari))))),Thuidium));'
     dataset = dataio.trees_from_newick([newick, newick])
     trees = [block[0] for block in dataset.trees_blocks]
     ref, changing = trees[0], trees[1]
     rng = DebuggingRandom()
     for tree in (ref, changing):
         encode_splits(tree)
     for _ in xrange(50):
         randomly_reorient_tree(changing, rng=rng, splits=True)
         self.assertNotEqual(str(changing), newick)
         changing.debug_check_tree(logger_obj=_LOG, splits=True)
         if symmetric_difference(ref, changing) == 0:
             continue
         self.fail("\n%s\n!=\n%s" % (str(ref), str(changing)))
예제 #12
0
def strict_consensus_merge(trees_to_merge,
                           copy_trees=False,
                           rooted=False,
                           gordons_supertree=False):
    """Return the strict consensus merger of the trees in `trees_to_merge`.

    With ``copy_trees=True`` each input tree is copied with ``copy.copy``
    and the copies are merged.  Otherwise the input trees themselves are
    modified: the first tree absorbs the others and is returned, and
    `trees_to_merge` is truncated in place to its first element.

    When fewer than two trees are supplied the first tree is returned
    without further processing.  ``rooted=True`` with two or more trees
    raises ``NotImplementedError``.  `gordons_supertree` is forwarded to
    ``add_to_scm``.
    """
    if copy_trees:
        tree_list = [copy.copy(tree) for tree in trees_to_merge]
    else:
        tree_list = list(trees_to_merge)
        del trees_to_merge[1:]

    tree_count = len(tree_list)
    listing = '\n'.join([str(tree) for tree in tree_list])
    _LOG.debug('%d Trees to merge:\n%s\n' % (tree_count, listing))
    if tree_count < 2:
        return tree_list[0]
    if rooted:
        raise NotImplementedError("Rooted SCM is not implemented")

    # From here on `rooted` is known to be false, so every tree is derooted.
    merged = tree_list[0]
    merged.deroot()
    encode_splits(merged)
    if IS_DEBUG_LOGGING:
        assert merged._debug_tree_is_valid(splits=False)

    for other in tree_list[1:]:
        other.deroot()
        encode_splits(other)
        if IS_DEBUG_LOGGING:
            assert other._debug_tree_is_valid(splits=True)
        add_to_scm(merged,
                   other,
                   rooted,
                   gordons_supertree=gordons_supertree)
        if IS_DEBUG_LOGGING:
            assert merged._debug_tree_is_valid(splits=False)

    return merged
예제 #13
0
    def find_best_conflicting(self, starting_tree, split, dataset):
        """Search for trees that do not contain `split` and return them sorted.

        A deep copy of the starting tree has the edge for `split` collapsed
        (when such an edge is found) and is written to ``.tmp.tre``.  A
        negative constraint for `split` is written to ``.tmpconstrain.tre``
        and a run is started with the ``negcon_*`` settings.  The settings
        saved by ``self.cache_settings()`` are restored afterwards, whether
        or not the run succeeds.

        The trees parsed from the lines returned by
        ``self.stderrThread.lines_between_prompt()`` are sorted in
        descending order, have their splits encoded, and are asserted not
        to contain `split`.

        Returns the sorted list of parsed results.
        """
        new_starting = TreeModel(model=starting_tree.model)
        new_starting.tree = copy.deepcopy(starting_tree.tree)
        root = new_starting.tree.seed_node
        e = find_edge_from_split(root, split, root.edge.clade_mask)
        if e:
            e.collapse()

        tmp_tree_filename = ".tmp.tre"
        write_trees_to_filepath([new_starting], dataset, tmp_tree_filename)
        self.cache_settings()

        try:

            tmp_constrain_filename = ".tmpconstrain.tre"
            self.constraintfile = tmp_constrain_filename
            f = open(tmp_constrain_filename, "w")
            try:
                f.write("-%s\n" %
                        split_as_string_rev(split, self.curr_n_taxa, '.', '*'))
            finally:
                # Release the handle even if formatting or writing fails.
                f.close()

            self.ofprefix = "negconst%d" % (split)

            # it seems a little odd to call this incompletetreefname rather
            #   than streefname but we'd like to trigger the interactive mode,
            #   and this is one way of doing that.
            self.incompletetreefname = tmp_tree_filename
            self.runmode = GARLI_ENUM.INCR_RUNMODE  # NORMAL_RUNMODE
            self.topoweight = self.negcon_topoweight
            self.modweight = self.negcon_modweight
            self.stopgen = self.negcon_stopgen
            invoc = []
            if new_starting.model:
                invoc.append("model = %s" % str(new_starting.model))
            invoc.append("run")
            self.run(invoc, terminate_run=True)
        finally:
            self.restore_settings()

        err_lines = self.stderrThread.lines_between_prompt()
        r = self.parse_igarli_lines(err_lines, dataset)
        r.sort(reverse=True)
        for tm in r:
            encode_splits(tm.tree)
            assert split not in tm.tree.split_edges
        return r
예제 #14
0
    def find_best_conflicting(self, starting_tree, split, dataset):
        """Search for trees that do not contain `split` and return them sorted.

        A deep copy of the starting tree has the edge for `split` collapsed
        (when such an edge is found) and is written to ``.tmp.tre``.  A
        negative constraint for `split` is written to ``.tmpconstrain.tre``
        and a run is started with the ``negcon_*`` settings.  The settings
        saved by ``self.cache_settings()`` are restored afterwards, whether
        or not the run succeeds.

        The trees parsed from the lines returned by
        ``self.stderrThread.lines_between_prompt()`` are sorted in
        descending order, have their splits encoded, and are asserted not
        to contain `split`.

        Returns the sorted list of parsed results.
        """
        new_starting = TreeModel(model=starting_tree.model)
        new_starting.tree = copy.deepcopy(starting_tree.tree)
        root = new_starting.tree.seed_node
        e = find_edge_from_split(root, split, root.edge.clade_mask)
        if e:
            e.collapse()

        tmp_tree_filename = ".tmp.tre"
        write_trees_to_filepath([new_starting], dataset, tmp_tree_filename)
        self.cache_settings()

        try:

            tmp_constrain_filename = ".tmpconstrain.tre"
            self.constraintfile = tmp_constrain_filename
            f = open(tmp_constrain_filename, "w")
            try:
                f.write("-%s\n" % split_as_string_rev(split, self.curr_n_taxa, '.', '*'))
            finally:
                # Release the handle even if formatting or writing fails.
                f.close()

            self.ofprefix = "negconst%d" % (split)

            # it seems a little odd to call this incompletetreefname rather
            #   than streefname but we'd like to trigger the interactive mode,
            #   and this is one way of doing that.
            self.incompletetreefname = tmp_tree_filename
            self.runmode = GARLI_ENUM.INCR_RUNMODE # NORMAL_RUNMODE
            self.topoweight = self.negcon_topoweight
            self.modweight = self.negcon_modweight
            self.stopgen = self.negcon_stopgen
            invoc = []
            if new_starting.model:
                invoc.append("model = %s" % str(new_starting.model))
            invoc.append("run")
            self.run(invoc, terminate_run=True)
        finally:
            self.restore_settings()

        err_lines = self.stderrThread.lines_between_prompt()
        r = self.parse_igarli_lines(err_lines, dataset)
        r.sort(reverse=True)
        for tm in r:
            encode_splits(tm.tree)
            assert split not in tm.tree.split_edges
        return r
예제 #15
0
    def testSplits(self):
        """Compare split counts computed here with those obtained via `paup`.

        For every entry of ``test_cases`` (a tree file name and a taxa file
        name) a split distribution is obtained from
        ``paup.get_split_distribution`` and a second one is built by
        iterating over the trees with ``nexus.iterate_over_trees``.  Both
        must have counted the same number of trees, every non-trivial split
        counted here must be present with the same count in the PAUP*
        distribution, and no PAUP* split may be left over.
        """
        unrooted = True
        for tc in test_cases:
            tree_filepaths = [dendropy.tests.data_source_path(tc[0])]
            taxa_filepath = dendropy.tests.data_source_path(tc[1])
            paup_sd = paup.get_split_distribution(tree_filepaths,
                                                  taxa_filepath,
                                                  unrooted=unrooted,
                                                  burnin=0)
            taxa_block = paup_sd.taxa_block
            dp_sd = splits.SplitDistribution(taxa_block=taxa_block)
            dp_sd.ignore_edge_lengths = True
            dp_sd.ignore_node_ages = True
            dp_sd.unrooted = unrooted

            taxa_block.lock()
            for tree_filepath in tree_filepaths:
                tree_file = open(tree_filepath, "rU")
                try:
                    for tree in nexus.iterate_over_trees(tree_file,
                                                         taxa_block=taxa_block):
                        splits.encode_splits(tree)
                        dp_sd.count_splits_on_tree(tree)
                finally:
                    # Do not leave one open handle behind per test case.
                    tree_file.close()

            self.assertEqual(dp_sd.total_trees_counted,
                             paup_sd.total_trees_counted)

            # SplitsDistribution counts trivial splits, whereas PAUP*
            # contree does not, so the two collections cannot be compared
            # by length; trivial splits are skipped below instead.
            taxa_mask = taxa_block.all_taxa_bitmask()
            for split in dp_sd.splits:
                if not splits.is_trivial_split(split, taxa_mask):
                    self.assertTrue(split in paup_sd.splits)
                    self.assertEqual(dp_sd.split_counts[split],
                                     paup_sd.split_counts[split])
                    paup_sd.splits.remove(split)

            # Any split still left in paup_sd was never seen in dp_sd.
            self.assertEqual(len(paup_sd.splits), 0)
예제 #16
0
    def testReroot(self):
        """Move each leaf to the outgroup position and check the splits.

        A second tree parsed against the same taxa block must first show a
        symmetric difference of 2 from the reference.  Then, for each of
        t1..t6, the leaf is moved with ``to_outgroup_position`` and the
        tree's string form is re-parsed; the re-parsed tree must have a
        symmetric difference of 0 from the reference.
        """
        newick = "((t5,t6),((t4,(t2,t1)),t3));"
        dataset = dataio.trees_from_newick([newick])
        tree = dataset.trees_blocks[0][0]
        taxa_block = dataset.taxa_blocks[0]

        def parse_encoded(newick_string):
            # Parse against the shared taxa block and encode the splits.
            parsed = dataio.trees_from_newick(
                [newick_string], taxa_block=taxa_block).trees_blocks[0][0]
            encode_splits(parsed)
            return parsed

        ref = parse_encoded(newick)
        other = parse_encoded("((t2, t1),((t4,(t5,t6)),t3));")
        self.assertEqual(symmetric_difference(other, ref), 2)

        for leaf_name in ["t%d" % number for number in xrange(1, 7)]:
            leaf_node = tree.find_taxon_node(
                lambda candidate: candidate.label == leaf_name)
            tree.to_outgroup_position(leaf_node)
            rerooted = parse_encoded(str(tree))
            self.assertEqual(symmetric_difference(rerooted, ref), 0)
예제 #17
0
def gather_neighborhood_commands(tree_list):
    """Build one ScoreConstraintCommands object per group of related trees.

    A symmetric matrix holding, for every pair of entries in `tree_list`,
    the size of the symmetric difference of their ``splits`` sets is
    passed to ``connected_at(mat, 4)``.  For every group of indices that
    call returns, a command list is assembled from the first tree of the
    group (its model and tree string), an optional ``posconstraint`` line
    and a final ``run`` line.

    Returns the list of ScoreConstraintCommands objects, one per group.

    NOTE(review): `last_split` and `taxa_block` are used below but are not
    defined in this function -- presumably module-level names; confirm.
    """
    dim = len(tree_list)
    mat = [[0]*dim for i in range(dim)]
    _LOG.debug("tree_list = %s\n" % str(tree_list))
    # Fill both triangles of the pairwise distance matrix; the diagonal
    # keeps its initial 0.
    for row_n, i in enumerate(tree_list[:-1]):
        offset = 1 + row_n
        row_splits = tree_list[row_n].splits
        _LOG.debug("row_splits = %s\n" % str(row_splits))
        row = mat[row_n]
        for col_disp, j in enumerate(tree_list[offset:]):
            col_n = offset + col_disp
            col_splits = tree_list[col_n].splits
            _LOG.debug("col_splits = %s\n" % str(col_splits))
            d = len(row_splits.symmetric_difference(col_splits))
            _LOG.debug("d = %s\n" % str(d))
            row[col_n] = d
            mat[col_n][row_n] = d
    # presumably groups indices whose pairwise distance is within 4 --
    # TODO confirm against connected_at
    connected_indices = connected_at(mat, 4)
    _LOG.debug("connected_indices = %s\n" % str(connected_indices))
    sc_tr_commands_list = []
    for group_indices in connected_indices:
        trees = [tree_list[i] for i in group_indices]
        first_tree = trees[0]
        cmd_list = ["model = %s\ntree = %s\n" % (first_tree.model, first_tree.tree_string)]
        # cmd_list is handed over here and appended to afterwards, so the
        # later appends are only visible through sc_tr_commands if it keeps
        # a reference to this same list object.
        sc_tr_commands = ScoreConstraintCommands(first_tree.score, None, cmd_list)
        cmd_list.append("clearconstraints = 1\n")

        
        if len(trees) > 1:
            # first_tree was taken before this sort, so it is the first tree
            # in group order, not necessarily the first after sorting.
            trees.sort(reverse=True, cmp=lambda x,y: cmp(x.score,y.score))
            c = copy.deepcopy(first_tree.tree)
            encode_splits(c)
            e = find_edge_from_split(c.seed_node, last_split)
            edge_dist = 3
            e.head_node.collapse_neighborhood(edge_dist)
            c.splits = get_norm_nontrivial_split_set(c)
            # Keep only the splits of the collapsed copy that every other
            # tree in the (sorted) group also has.
            si = set(c.splits)
            for t in trees[1:]:
                si.intersection_update(t.splits)
            if len(si):
                sd = SplitDistribution(taxa_block=taxa_block, split_set=si)
                ts = TreeSummarizer()
                sc = ts.tree_from_splits(sd, min_freq=None, include_edge_lengths=False)
                encode_splits(sc)
                sc.splits = si
                sc_tr_commands.constr_splits = si
                cmd_list.append("posconstraint = %s\n" % sc.compose_newick(edge_lengths=False))
        else:
            # Single-tree group: here the copy is collapsed first and its
            # splits are encoded afterwards (the opposite order from the
            # branch above).
            c = copy.deepcopy(first_tree.tree)
            e = find_edge_from_split(c.seed_node, last_split)
            edge_dist = 3
            e.head_node.collapse_neighborhood(edge_dist)
            encode_splits(c)
            sc_tr_commands.constr_splits = get_norm_nontrivial_split_set(c)
            if len(sc_tr_commands.constr_splits) > 0:
                cmd_list.append("posconstraint = %s\n" % c.compose_newick(edge_lengths=False))
        cmd_list.append("run\n")
        sc_tr_commands_list.append(sc_tr_commands)
    return sc_tr_commands_list
예제 #18
0
 def count_splits_on_trees(self,
                           tree_iterator,
                           split_distribution=None,
                           trees_splits_encoded=False):
     """
     Count the splits of every tree from `tree_iterator` in a split
     distribution and return that distribution.

     A new ``splits.SplitDistribution`` is created when `split_distribution`
     is None.  If the distribution has no taxa block yet, it takes the taxa
     block of the first tree; every tree must then share that same taxa
     block object.  ``self.total_trees_counted`` is incremented once per
     tree.  Splits are encoded on each tree first unless
     `trees_splits_encoded` is true.
     """
     if split_distribution is None:
         split_distribution = splits.SplitDistribution()
     taxa_block = split_distribution.taxa_block
     for tree in tree_iterator:
         self.total_trees_counted += 1
         if taxa_block is not None:
             assert (taxa_block is tree.taxa_block)
         else:
             assert (split_distribution.taxa_block is None)
             taxa_block = tree.taxa_block
             split_distribution.taxa_block = taxa_block
         if not trees_splits_encoded:
             splits.encode_splits(tree)
         split_distribution.count_splits_on_tree(tree)
     return split_distribution
예제 #19
0
def _run_biosql_script(script_name, target, opts, label):
    """Run one of the BioSQL helper scripts and return its standard output.

    The script is started as ``python <script_name> -d <db_uri> -b
    <TEST_BIODB> [-q] [-e] <target>``.  The command is passed as an
    argument list rather than a shell string, so characters in the URI or
    in `target` that are special to a shell reach the script unchanged.

    `label` ("import" or "export") only selects the wording of the progress
    and error messages.  When the script exits with a non-zero status its
    standard error is echoed and the process exits with status 1.
    """
    cmd = ["python", script_name, "-d", opts.db_uri, "-b", str(TEST_BIODB)]
    if opts.quiet:
        cmd.append("-q")
    if opts.echo:
        cmd.append("-e")
    cmd.append(target)
    if not opts.quiet:
        sys.stderr.write("Executing %s: %s\n" % (label, " ".join(cmd)))
    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if proc.returncode:
        sys.stderr.write('*** %s ERROR ***\n' % label.upper())
        sys.stderr.write(stderr)
        sys.exit(1)
    return stdout


def main():
    """
    Main CLI handler.

    Parses the command line (database URI, quiet and echo flags, one or
    more tree files) and exits with status 1 when the URI or the files are
    missing or a path is not a regular file.

    For each file the trees are read as NeXML and the file is imported with
    ``biosql-insert.py``; the non-empty lines that script prints are
    assigned, by position, as the names of the trees.  Each tree is then
    exported again with ``biosql-gettree.py`` and compared with the
    original.  A line ending in SUCCESS or FAIL is written to standard
    output per tree, depending on whether the absolute weighted
    Robinson-Foulds distance is below 0.0001.
    """

    parser = OptionParser(usage=_prog_usage,
        add_help_option=True,
        version=_prog_version,
        description=_prog_description)

    parser.add_option('-d', '--database',
        action='store',
        dest='db_uri',
        type='string', # also 'float', 'string' etc.
        default=None,
        metavar='URI',
        help='[MANDATORY] database URI (e.g. "postgres://*****:*****@localhost/demodb")')

    parser.add_option('-q', '--quiet',
        action='store_true',
        dest='quiet',
        default=False,
        help='suppress progress messages')

    parser.add_option('-e', '--echo',
        action='store_true',
        dest='echo',
        default=False,
        help='echo database communications')

    (opts, args) = parser.parse_args()

    if opts.db_uri is None:
        sys.stderr.write('Database URI needs to be specified ("-d" flag; see "--help").\n')
        sys.exit(1)

    if len(args) == 0:
        sys.stderr.write("Tree file(s) not specified.\n")
        sys.exit(1)

    src_fpaths = []
    for a in args:
        f = os.path.expandvars(os.path.expanduser(a))
        if not os.path.exists(f):
            sys.stderr.write('File not found: "%s"\n' % f)
            sys.exit(1)
        elif not os.path.isfile(f):
            sys.stderr.write('Directory specified instead of file: "%s"\n' % f)
            sys.exit(1)
        else:
            src_fpaths.append(f)

    for f in src_fpaths:

        ## initial read ##
        if not opts.quiet:
            sys.stderr.write("Pre-import parse ...\n")
        ds1 = datasets.Dataset()
        src = open(f, "rU")
        try:
            ds1.read(src, "nexml")
        finally:
            src.close()
        tree_list = []
        for trees_block in ds1.trees_blocks:
            for tree in trees_block:
                tree_list.append(tree)

        ## import ##
        stdout = _run_biosql_script("biosql-insert.py", f, opts, "import")

        # The n-th output line names the n-th tree; empty lines leave the
        # tree at that position untouched.
        names = stdout.split("\n")
        for idx, name in enumerate(names):
            if name:
                tree_list[idx].name = name

        for idx, model_tree in enumerate(tree_list):
            ## export ##
            # Fetch the tree being compared in this iteration.  Using the
            # leftover `tree` variable from the loop above would request
            # the last tree of the file every time.
            stdout = _run_biosql_script("biosql-gettree.py", model_tree.name,
                                        opts, "export")

            ds2 = datasets.Dataset()
            result_tree = ds2.trees_from_string(stdout, "nexml")[0]

            ## compare ##
            if not opts.quiet:
                sys.stderr.write("Comparing splits ...\n")
            taxa_block = model_tree.taxa_block
            result_tree.normalize_taxa(taxa_block)
            assert model_tree.taxa_block is result_tree.taxa_block
            splits.encode_splits(model_tree)
            splits.encode_splits(result_tree)
            sd = treedists.symmetric_difference(model_tree, result_tree)
            if not opts.quiet:
                sys.stderr.write("Symmetric distance = %d\n" % sd)
            rfd = treedists.robinson_foulds_distance(model_tree, result_tree)
            if not opts.quiet:
                # %f rather than %d: the value is compared with 0.0001
                # below, so an integer format would hide what is tested.
                sys.stderr.write("Weighted Robinson-Fould's distance = %f\n" % rfd)
            if abs(rfd) < 0.0001:
                sys.stdout.write("%s (%d/%d): SUCCESS\n" % (f, idx+1, len(tree_list)))
            else:
                sys.stdout.write("%s (%d/%d): FAIL\n" % (f, idx+1, len(tree_list)))
예제 #20
0
    def _do_add_taxon_incremental_step(self, full_dataset, inp_trees):
        """Add a taxon to every input tree and pick the trees for the next round.

        For each tree in `inp_trees`, ``self.add_to_tree`` returns a list of
        scored trees.  For each of those:

        * the edge whose split is the single bit ``1 << (curr_n_taxa - 1)``
          is located (presumably the edge of the newly added taxon --
          confirm against the split encoding);
        * the neighborhood around that edge is checked with
          ``self.first_neighborhood``.  Results with the same splits as the
          step-addition tree only replace it when they score higher;
          results with different splits start a wider search using
          ``first_neighborhood + neighborhood_incr``;
        * when a wider search ran, all trees found are sorted in descending
          order and only the first of each set of trees with identical
          splits is kept.

        The kept trees are filtered by ``self.select_trees_for_next_round``,
        replace the trees blocks of `full_dataset`, and are written to
        ``incrgarli.tre``.

        Returns the list of trees selected for the next round.
        """
        culled = self._write_garli_input(full_dataset)
        culled_taxa = culled.taxa_blocks[0]
        self.set_active_taxa(culled_taxa)
        next_round_trees = []

        for tree_ind, tree in enumerate(inp_trees):
            tree_model_list = self.add_to_tree(tree, culled, tree_ind, self.add_tree_stopgen)
            to_save = []
            for tm in tree_model_list:
                # A single parenthesised argument prints the same text with
                # the print statement and with the print function.
                print("Tree %d for %d taxa: %f" % (tree_ind, self.curr_n_taxa, tm.score))
                step_add_tree = tm.tree
                encode_splits(step_add_tree)
                split = 1 << (self.curr_n_taxa - 1)
                e = find_edge_from_split(step_add_tree.seed_node, split)
                assert e is not None, "Could not find split %s.  Root mask is %s" % (bin(split)[2:], bin(step_add_tree.seed_node.edge.clade_mask)[2:])

                nt_list = self.check_neighborhood_after_addition(tm, e.head_node, self.first_neighborhood, culled, tree_ind)
                deeper_search_start = []
                better_tm = tm
                for nt in nt_list:
                    encode_splits(nt.tree)
                    if symmetric_difference(nt.tree, step_add_tree) != 0:
                        deeper_search_start.append(nt)
                    elif nt.score > better_tm.score:
                        better_tm = nt

                if deeper_search_start:
                    entire_neighborhood = [better_tm] + deeper_search_start
                    for alt_tm in deeper_search_start:
                        e = find_edge_from_split(alt_tm.tree.seed_node, split)

                        assert e is not None, "Could not find split %s.  Root mask is %s" % (bin(split)[2:], bin(alt_tm.tree.seed_node.edge.clade_mask)[2:])

                        nt_list = self.check_neighborhood_after_addition(alt_tm, e.head_node, self.first_neighborhood + self.neighborhood_incr, culled, tree_ind)
                        for nt in nt_list:
                            encode_splits(nt.tree)
                            entire_neighborhood.append(nt)
                    entire_neighborhood.sort(reverse=True)
                    # Keep the first tree of every group with identical
                    # splits; the list was just sorted in descending order.
                    to_add = []
                    for nt in entire_neighborhood:
                        found = False
                        for x in to_add:
                            if symmetric_difference(x.tree, nt.tree) == 0:
                                found = True
                                break
                        if not found:
                            to_add.append(nt)
                    to_save.extend(to_add)
                else:
                    to_save.append(better_tm)

            # this is where we should evaluate which trees need to be maintained for the next round.
            next_round_trees.extend(to_save)

        next_round_trees = self.select_trees_for_next_round(culled, next_round_trees)

        del full_dataset.trees_blocks[:]
        full_dataset.trees_blocks.append([i.tree for i in next_round_trees])
        o = open("incrgarli.tre", "w")
        try:
            write_tree_file(o, next_round_trees, culled)
        finally:
            # Close the handle even when writing fails part-way through.
            o.close()
        return next_round_trees
예제 #21
0
    def _do_add_taxon_incremental_step(self, full_dataset, inp_trees):
        """Add a taxon to every input tree and pick the trees for the next round.

        For each tree in `inp_trees`, ``self.add_to_tree`` returns a list of
        scored trees.  For each of those:

        * the edge whose split is the single bit ``1 << (curr_n_taxa - 1)``
          is located (presumably the edge of the newly added taxon --
          confirm against the split encoding);
        * the neighborhood around that edge is checked with
          ``self.first_neighborhood``.  Results with the same splits as the
          step-addition tree only replace it when they score higher;
          results with different splits start a wider search using
          ``first_neighborhood + neighborhood_incr``;
        * when a wider search ran, all trees found are sorted in descending
          order and only the first of each set of trees with identical
          splits is kept.

        The kept trees are filtered by ``self.select_trees_for_next_round``,
        replace the trees blocks of `full_dataset`, and are written to
        ``incrgarli.tre``.

        Returns the list of trees selected for the next round.
        """
        culled = self._write_garli_input(full_dataset)
        culled_taxa = culled.taxa_blocks[0]
        self.set_active_taxa(culled_taxa)
        next_round_trees = []

        for tree_ind, tree in enumerate(inp_trees):
            tree_model_list = self.add_to_tree(tree, culled, tree_ind,
                                               self.add_tree_stopgen)
            to_save = []
            for tm in tree_model_list:
                # A single parenthesised argument prints the same text with
                # the print statement and with the print function.
                print("Tree %d for %d taxa: %f" % (tree_ind, self.curr_n_taxa,
                                                   tm.score))
                step_add_tree = tm.tree
                encode_splits(step_add_tree)
                split = 1 << (self.curr_n_taxa - 1)
                e = find_edge_from_split(step_add_tree.seed_node, split)
                assert e is not None, "Could not find split %s.  Root mask is %s" % (
                    bin(split)[2:], bin(
                        step_add_tree.seed_node.edge.clade_mask)[2:])

                nt_list = self.check_neighborhood_after_addition(
                    tm, e.head_node, self.first_neighborhood, culled, tree_ind)
                deeper_search_start = []
                better_tm = tm
                for nt in nt_list:
                    encode_splits(nt.tree)
                    if symmetric_difference(nt.tree, step_add_tree) != 0:
                        deeper_search_start.append(nt)
                    elif nt.score > better_tm.score:
                        better_tm = nt

                if deeper_search_start:
                    entire_neighborhood = [better_tm] + deeper_search_start
                    for alt_tm in deeper_search_start:
                        e = find_edge_from_split(alt_tm.tree.seed_node, split)

                        assert e is not None, "Could not find split %s.  Root mask is %s" % (
                            bin(split)[2:],
                            bin(alt_tm.tree.seed_node.edge.clade_mask)[2:])

                        nt_list = self.check_neighborhood_after_addition(
                            alt_tm, e.head_node,
                            self.first_neighborhood + self.neighborhood_incr,
                            culled, tree_ind)
                        for nt in nt_list:
                            encode_splits(nt.tree)
                            entire_neighborhood.append(nt)
                    entire_neighborhood.sort(reverse=True)
                    # Keep the first tree of every group with identical
                    # splits; the list was just sorted in descending order.
                    to_add = []
                    for nt in entire_neighborhood:
                        found = False
                        for x in to_add:
                            if symmetric_difference(x.tree, nt.tree) == 0:
                                found = True
                                break
                        if not found:
                            to_add.append(nt)
                    to_save.extend(to_add)
                else:
                    to_save.append(better_tm)

            # this is where we should evaluate which trees need to be maintained for the next round.
            next_round_trees.extend(to_save)

        next_round_trees = self.select_trees_for_next_round(
            culled, next_round_trees)

        del full_dataset.trees_blocks[:]
        full_dataset.trees_blocks.append([i.tree for i in next_round_trees])
        o = open("incrgarli.tre", "w")
        try:
            write_tree_file(o, next_round_trees, culled)
        finally:
            # Close the handle even when writing fails part-way through.
            o.close()
        return next_round_trees
예제 #22
0
 # Read the character matrix (NEXUS) and use its first taxa block.
 d.read(open(data_file, "rU"), format="NEXUS")
 taxa = d.taxa_blocks[0]
 full_taxa_mask = taxa.all_taxa_bitmask()
 # Map every taxon to its 1-based position in the taxa block, as a string.
 for n, taxon in enumerate(taxa):
     TAXON_TO_TRANSLATE[taxon] = str(n + 1)
 _LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))
 # Exactly one taxa block and one character block are expected, with one
 # character row per taxon.
 assert (len(d.taxa_blocks) == 1)
 characters = d.char_blocks[0]
 assert (len(d.char_blocks) == 1)
 assert (len(characters) == len(taxa))
 inp_trees = d.read_trees(open(intree_file, "rU"), format="NEXUS")
 assert (inp_trees)
 current_taxon_mask = None
 # Every input tree must share the taxa block and cover the same taxa; the
 # first tree's root clade mask defines that set.
 for tree in inp_trees:
     assert tree.taxa_block is taxa
     encode_splits(tree)
     if current_taxon_mask is None:
         current_taxon_mask = tree.seed_node.edge.clade_mask
         _LOG.debug("%s = current_taxon_mask" % bin(current_taxon_mask))
         # The tree's taxa must be a subset of the full taxon set.
         assert ((current_taxon_mask | full_taxa_mask) == full_taxa_mask)
         toadd_taxon_mask = current_taxon_mask ^ full_taxa_mask
     else:
         assert (current_taxon_mask == tree.seed_node.edge.clade_mask)
 # Presumably the lowest set bit among the taxa not yet in the trees.
 next_toadd = lowest_bit_only(current_taxon_mask ^ full_taxa_mask)
 # (next_toadd - 1) has every bit below next_toadd set, so this test passes
 # only when the trees hold exactly the taxa that precede the next one.
 if (next_toadd - 1) != current_taxon_mask:
     _LOG.debug("%s = next_toadd" % format_split(next_toadd, taxa=taxa))
     _LOG.debug(
         "%s = current_taxon_mask\n(next_toadd - 1) != current_taxon_mask" %
         format_split(current_taxon_mask, taxa=taxa))
     sys.exit(
         "In this version, taxa must be added to the tree in the order that they appear in the matrix"
예제 #23
0
    full_taxa_mask = taxa.all_taxa_bitmask()
    # Map every taxon to its 1-based position in the taxa block, as a string.
    for n, taxon in enumerate(taxa):
        TAXON_TO_TRANSLATE[taxon] = str(n + 1)
    _LOG.debug("%s = full_taxa_mask" % bin(full_taxa_mask))


    # os.path.join with a single argument returns that argument unchanged.
    garli.datafname = os.path.join("data.nex")

    raw_trees = full_dataset.read_trees(open(intree_file, "rU"), format="NEXUS")
    assert(raw_trees)
    current_taxon_mask = None

    # read initial trees and verify that they have the correct set of taxa
    for tree in raw_trees:
        assert tree.taxa_block is taxa
        encode_splits(tree)
        if current_taxon_mask is None:
            # The first tree's root clade mask defines the expected taxon set.
            current_taxon_mask = tree.seed_node.edge.clade_mask
            _LOG.debug("%s = current_taxon_mask" % bin(current_taxon_mask))
            # The tree's taxa must be a subset of the full taxon set.
            assert( (current_taxon_mask | full_taxa_mask) == full_taxa_mask)
            toadd_taxon_mask = current_taxon_mask ^ full_taxa_mask
        else:
            assert(current_taxon_mask == tree.seed_node.edge.clade_mask)
    # Presumably the lowest set bit among the taxa not yet in the trees.
    next_toadd = lowest_bit_only(current_taxon_mask^full_taxa_mask)
    # (next_toadd - 1) has every bit below next_toadd set, so this test passes
    # only when the trees hold exactly the taxa that precede the next one.
    if (next_toadd - 1) != current_taxon_mask:
        _LOG.debug("%s = next_toadd" % format_split(next_toadd, taxa=taxa))
        _LOG.debug("%s = current_taxon_mask\n(next_toadd - 1) != current_taxon_mask" % format_split(current_taxon_mask, taxa=taxa))
        sys.exit("In this version, taxa must be added to the tree in the order that they appear in the matrix")

    # Wrap each raw tree in a TreeModel for the incremental search.
    inp_trees = [TreeModel(tree=i) for i in raw_trees]
예제 #24
0
    def tree_from_splits(self,
                         split_distribution,
                         min_freq=0.5,
                         include_edge_lengths=True):
        """Build a consensus tree from the splits in `split_distribution`.

        Starting from a star tree over the distribution's taxa, splits whose
        frequency exceeds `min_freq` are inserted from most to least
        frequent; a split is skipped when the children it would group do not
        add up to exactly that split.  Edge lengths (the mean of the lengths
        recorded for a split) are assigned only when both
        `include_edge_lengths` and `self.support_as_labels` are true.

        Returns the consensus tree.
        """
        use_leaf_walk = True

        taxa_block = split_distribution.taxa_block
        con_tree = treegen.star_tree(taxa_block)
        frequencies = split_distribution.split_frequencies
        all_taxa = taxa_block.all_taxa_bitmask()
        splits.encode_splits(con_tree)
        leaves = con_tree.leaf_nodes()

        if use_leaf_walk:
            # leaf clade mask -> leaf node
            leaf_by_mask = dict(
                (leaf.edge.clade_mask, leaf) for leaf in leaves)
        include_edge_lengths = self.support_as_labels and include_edge_lengths
        unrooted = split_distribution.unrooted

        # Collect (frequency, split to insert, key into the distribution).
        candidates = []
        for raw_split, split_freq in frequencies.iteritems():
            if not (split_freq > min_freq):
                continue
            masked = raw_split & all_taxa
            # Skip the root (all "1's") and singletons (one "1").
            if masked == all_taxa or not ((masked - 1) & masked):
                continue
            if not unrooted:
                candidates.append((split_freq, masked, masked))
                continue
            complement = (~masked) & all_taxa
            if not ((complement - 1) & complement):
                continue  # singleton (one "0")
            # Insert whichever side does not contain the lowest bit.
            key = masked
            if 1 & masked:
                key = complement
            candidates.append((split_freq, key, masked))
        candidates.sort(reverse=True)

        root_edge = con_tree.seed_node.edge
        # Adding splits in this order gives a greedy, extended
        # majority-rule consensus tree.
        for freq, split_to_add, split_in_dict in candidates:
            if (split_to_add & root_edge.clade_mask) != split_to_add:
                continue
            if use_leaf_walk:
                # Climb from one member leaf until the clade covers the split.
                anchor = leaf_by_mask[splits.lowest_bit_only(split_to_add)]
                while (split_to_add
                       & anchor.edge.clade_mask) != split_to_add:
                    anchor = anchor.parent_node
            else:
                anchor = shallowest_containing_node(
                    start_node=con_tree.seed_node,
                    split=split_to_add,
                    taxa_mask=all_taxa)
            if anchor is None or anchor.edge.clade_mask == split_to_add:
                continue  # split is not in tree, or already in tree.

            grouping = trees.Node()
            self.map_split_support_to_node(node=grouping, split_support=freq)
            moved = []
            grouping_edge = grouping.edge
            grouping_edge.clade_mask = 0
            for child in anchor.child_nodes():
                # might need to modify the following if rooted splits
                # are used
                child_mask = child.edge.clade_mask
                if child_mask & split_to_add:
                    assert child_mask != split_to_add
                    grouping_edge.clade_mask |= child_mask
                    moved.append(child)

            # Accept only if the gathered children cover every needed bit
            # and nothing else.
            if grouping_edge.clade_mask != split_to_add:
                continue
            if include_edge_lengths:
                lengths = split_distribution.split_edge_lengths[split_in_dict]
                if len(lengths) > 0:
                    grouping_edge.length = float(sum(lengths)) / len(lengths)
                else:
                    grouping_edge.length = None
            for child in moved:
                anchor.remove_child(child)
                grouping.add_child(child)
            anchor.add_child(grouping)
            con_tree.split_edges[split_to_add] = grouping_edge

        # Support values and/or edge lengths for the terminal taxa.
        for leaf in leaves:
            leaf_split = leaf.edge.clade_mask
            if unrooted:
                leaf_split = con_tree.split_edges.normalize_key(
                    leaf.edge.clade_mask)
            self.map_split_support_to_node(leaf, 1.0)
            if not include_edge_lengths:
                continue
            lengths = split_distribution.split_edge_lengths.get(
                leaf_split, [0.0])
            if len(lengths) > 0:
                leaf.edge.length = float(sum(lengths)) / len(lengths)
            else:
                leaf.edge.length = None
        return con_tree
# Taxa are labelled "1" .. str(n_tax).
taxa_block = TaxaBlock([str(i+1) for i in range(n_tax)])
taxa_blocks = [taxa_block]
dataset = Dataset(taxa_blocks=taxa_blocks)

#setting this > 1.0 means that more trees are retained to the neighborhood search stage
score_diff_multiplier = 1.0
    
commands = []
if nbhd_tree_groups is None:
    _LOG.debug("Invocation of igarli_neighborhood.py with only one tree file -- need to set up initial neighborhood searches") 
    for g in all_tree_groups:
        # Parse each element's newick string, encode the tree's splits, and
        # attach the tree to the element.
        for el in g:
            newick_string = el.tree_string
            newick_stream = StringIO(newick_string)
            t = dataset.read_trees(newick_stream, format="newick")[0]
            encode_splits(t)
            el.tree = t
        _LOG.debug("len(g) = %d" % len(g))
        # The first element of the group is used as the reference
        # ("opt") tree; NOTE(review): assumes g is non-empty.
        opt_tree_el = g[0]
        opt_tree = opt_tree_el.tree
        opt_tree_el.splits = get_norm_nontrivial_split_set(opt_tree)
        _LOG.debug("opt_tree_el.splits = %s" % str(opt_tree_el.splits))
        unopt_score = None
        to_preserve = [opt_tree_el]
        # While unopt_score is still None, an element whose split set equals
        # the reference tree's supplies unopt_score and is not preserved;
        # every other element is kept.
        for el in g[1:]:
            other_tree = el.tree
            el.splits = get_norm_nontrivial_split_set(other_tree)
            if unopt_score is None and el.splits == opt_tree_el.splits:
                unopt_score = el.score
            else:
                to_preserve.append(el)
def main_cli():
    """Report how well frequency-ranked splits recover a reference tree.

    Command line: one or more sampled-tree files, plus ``-r`` naming a file
    that holds exactly one reference tree.  Splits are counted across the
    sampled trees and ranked by frequency.  For each distinct frequency a
    tab-separated row is printed with the false positives (FP), false
    negatives (FN) and their sum (SD) measured against the reference tree's
    non-trivial splits, cumulatively for (a) the splits that could be added
    to a greedy consensus tree ("compat") and (b) every split seen ("all").

    Exits through ``sys.exit`` with a message when a sampled-tree file is
    missing, no sampled-tree file is given, no reference tree is given, the
    reference file is missing, or it does not hold exactly one tree.
    """
    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]"

    parser = OptionParser(usage=usage, add_help_option=True, version=_program_version, description=description)
    parser.add_option('-r', '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    # -v stores False into `quiet`, which defaults to True.
    parser.add_option('-v', '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")

    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit("Expecting arguments indicating files that contain sampled trees")

    # Only the paths are handed to MultiFileTreeIterator below, so no file
    # objects are opened here (they would never be used or closed).

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit("A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' % reference_tree_filepath)

    d = Dataset()
    # "rU" is the universal-newline read mode; close the handle once the
    # trees have been read, even if parsing fails.
    ref_file = open(reference_tree_filepath, "rU")
    try:
        ref_trees = d.read_trees(ref_file, schema="NEXUS")
    finally:
        ref_file.close()

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert(len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits

    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths, core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." % (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." % (tree_source.total_trees_ignored))
    report.append("%d trees considered in total for split support assessment." % (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." % len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered()
    report.append("%d unique splits out of %d total splits counted." % (num_unique_splits, num_splits))
    report.append("%d unique non-trivial splits out of %d total non-trivial splits counted." % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))

    # The consensus tree starts as a star tree; splits are inserted below.
    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    # leaf clade mask -> leaf node
    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    # (frequency, split to insert, split as stored in the distribution)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count/n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and ((m-1) & m): # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c-1) & c: # not singleton (i.e., one "0")
                        # Use whichever side does not contain the lowest bit.
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    # 1.1 is above any possible frequency, so the first split always opens
    # a new frequency bucket.
    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq-freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)
            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)
            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        # Climb from one member leaf until the clade covers the split.
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        #   needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    # Reference splits, normalized the same way as the sampled splits.
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges is the reference tree" % (len(ref_set)))

    # Parenthesised single-argument print: same output on Python 2.
    print("freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD")
    for all_el, compat_el in itertools.izip(all_splits_by_freq, compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        # The sets accumulate across rows, so each row covers every split at
        # this frequency or higher.
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)

        print("%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn, compat_fp + compat_fn, all_fp, all_fn, all_fp + all_fn))
예제 #27
0
def main_cli():
    """Report how well frequency-ranked splits recover a reference tree.

    Command line: one or more sampled-tree files, plus ``-r`` naming a file
    that holds exactly one reference tree.  Splits are counted across the
    sampled trees and ranked by frequency.  For each distinct frequency a
    tab-separated row is printed with the false positives (FP), false
    negatives (FN) and their sum (SD) measured against the reference tree's
    non-trivial splits, cumulatively for (a) the splits that could be added
    to a greedy consensus tree ("compat") and (b) every split seen ("all").

    Exits through ``sys.exit`` with a message when a sampled-tree file is
    missing, no sampled-tree file is given, no reference tree is given, the
    reference file is missing, or it does not hold exactly one tree.
    """
    description = '%s %s ' % (_program_name, _program_version)
    usage = "%prog [options] <TREES FILE> [<TREES FILE> [<TREES FILE> [...]]"

    parser = OptionParser(usage=usage,
                          add_help_option=True,
                          version=_program_version,
                          description=description)
    parser.add_option('-r',
                      '--reference',
                      dest='reference_tree_filepath',
                      default=None,
                      help="path to file containing the reference (true) tree")
    # -v stores False into `quiet`, which defaults to True.
    parser.add_option('-v',
                      '--verbose',
                      action='store_false',
                      dest='quiet',
                      default=True,
                      help="Verbose mode")

    (opts, args) = parser.parse_args()

    ###################################################
    # Support file idiot checking

    sampled_filepaths = []
    for fpath in args:
        fpath = os.path.expanduser(os.path.expandvars(fpath))
        if not os.path.exists(fpath):
            sys.exit('Sampled trees file not found: "%s"' % fpath)
        sampled_filepaths.append(fpath)
    if not sampled_filepaths:
        sys.exit(
            "Expecting arguments indicating files that contain sampled trees")

    # Only the paths are handed to MultiFileTreeIterator below, so no file
    # objects are opened here (they would never be used or closed).

    ###################################################
    # Lots of other idiot-checking ...

    # target tree
    if opts.reference_tree_filepath is None:
        sys.exit(
            "A reference tree must be specified (use -h to see all options)")
    reference_tree_filepath = os.path.expanduser(
        os.path.expandvars(opts.reference_tree_filepath))
    if not os.path.exists(reference_tree_filepath):
        sys.exit('Reference tree file not found: "%s"\n' %
                 reference_tree_filepath)

    d = Dataset()
    # "rU" is the universal-newline read mode; close the handle once the
    # trees have been read, even if parsing fails.
    ref_file = open(reference_tree_filepath, "rU")
    try:
        ref_trees = d.read_trees(ref_file, schema="NEXUS")
    finally:
        ref_file.close()

    if len(ref_trees) != 1:
        sys.exit("Expecting one reference tree")
    ref_tree = ref_trees[0]
    splits.encode_splits(ref_tree)
    assert (len(d.taxa_blocks) == 1)
    taxa = d.taxa_blocks[0]

    ###################################################
    # Main work begins here: Count the splits

    tsum = treesum.TreeSummarizer()
    tsum.burnin = 0
    if opts.quiet:
        tsum.verbose = False
        tsum.write_message = None
    else:
        tsum.verbose = True
        tsum.write_message = sys.stderr.write

    _LOG.debug("### COUNTING SPLITS ###\n")
    split_distribution = splits.SplitDistribution(taxa_block=taxa)
    tree_source = MultiFileTreeIterator(filepaths=sampled_filepaths,
                                        core_iterator=nexus.iterate_over_trees)
    tsum.count_splits_on_trees(tree_source, split_distribution)

    report = []
    report.append("%d trees read from %d files." %
                  (tsum.total_trees_read, len(sampled_filepaths)))
    report.append("%d trees ignored in total." %
                  (tree_source.total_trees_ignored))
    report.append(
        "%d trees considered in total for split support assessment." %
        (tsum.total_trees_counted))
    report.append("%d unique taxa across all trees." %
                  len(split_distribution.taxa_block))
    num_splits, num_unique_splits, num_nt_splits, num_nt_unique_splits = split_distribution.splits_considered(
    )
    report.append("%d unique splits out of %d total splits counted." %
                  (num_unique_splits, num_splits))
    report.append(
        "%d unique non-trivial splits out of %d total non-trivial splits counted."
        % (num_nt_unique_splits, num_nt_splits))

    _LOG.debug("\n".join(report))

    # The consensus tree starts as a star tree; splits are inserted below.
    con_tree = treegen.star_tree(taxa)
    taxa_mask = taxa.all_taxa_bitmask()
    splits.encode_splits(con_tree)
    leaves = con_tree.leaf_nodes()

    # leaf clade mask -> leaf node
    to_leaf_dict = {}
    for leaf in leaves:
        to_leaf_dict[leaf.edge.clade_mask] = leaf
    unrooted = True
    n_read = float(tsum.total_trees_read)
    # (frequency, split to insert, split as stored in the distribution)
    sp_list = []
    for split, count in split_distribution.split_counts.iteritems():
        freq = count / n_read
        if not splits.is_trivial_split(split, taxa_mask):
            m = split & taxa_mask
            if (m != taxa_mask) and (
                (m - 1) & m
            ):  # if not root (i.e., all "1's") and not singleton (i.e., one "1")
                if unrooted:
                    c = (~m) & taxa_mask
                    if (c - 1) & c:  # not singleton (i.e., one "0")
                        # Use whichever side does not contain the lowest bit.
                        if 1 & m:
                            k = c
                        else:
                            k = m
                        sp_list.append((freq, k, m))
                else:
                    sp_list.append((freq, m, m))
    sp_list.sort(reverse=True)

    root = con_tree.seed_node
    root_edge = root.edge

    # 1.1 is above any possible frequency, so the first split always opens
    # a new frequency bucket.
    curr_freq = 1.1
    curr_all_splits_list = []
    curr_compat_splits_list = []
    all_splits_by_freq = []
    compat_splits_by_freq = []

    # Now when we add splits in order, we will do a greedy, extended majority-rule consensus tree
    for freq, split_to_add, split_in_dict in sp_list:
        if abs(curr_freq - freq) > 0.000001:
            # dropping down to the next lowest freq
            curr_l = [freq, []]
            curr_all_splits_list = curr_l[1]
            all_splits_by_freq.append(curr_l)
            curr_l = [freq, []]
            curr_compat_splits_list = curr_l[1]
            compat_splits_by_freq.append(curr_l)
            curr_freq = freq

        curr_all_splits_list.append(split_to_add)

        if (split_to_add & root_edge.clade_mask) != split_to_add:
            continue
        # Climb from one member leaf until the clade covers the split.
        lb = splits.lowest_bit_only(split_to_add)
        one_leaf = to_leaf_dict[lb]
        parent_node = one_leaf
        while (split_to_add & parent_node.edge.clade_mask) != split_to_add:
            parent_node = parent_node.parent_node
        if parent_node is None or parent_node.edge.clade_mask == split_to_add:
            continue  # split is not in tree, or already in tree.

        new_node = trees.Node()
        new_node_children = []
        new_edge = new_node.edge
        new_edge.clade_mask = 0
        for child in parent_node.child_nodes():
            # might need to modify the following if rooted splits
            # are used
            cecm = child.edge.clade_mask
            if (cecm & split_to_add):
                assert cecm != split_to_add
                new_edge.clade_mask |= cecm
                new_node_children.append(child)
        # Check to see if we have accumulated all of the bits that we
        #   needed, but none that we don't need.
        if new_edge.clade_mask == split_to_add:
            for child in new_node_children:
                parent_node.remove_child(child)
                new_node.add_child(child)
            parent_node.add_child(new_node)
            con_tree.split_edges[split_to_add] = new_edge
            curr_compat_splits_list.append(split_to_add)

    # Reference splits, normalized the same way as the sampled splits.
    ref_set = set()
    for s in ref_tree.split_edges.iterkeys():
        m = s & taxa_mask
        if 1 & m:
            k = (~m) & taxa_mask
        else:
            k = m
        if not splits.is_trivial_split(k, taxa_mask):
            ref_set.add(k)

    all_set = set()
    compat_set = set()

    _LOG.debug("%d edges is the reference tree" % (len(ref_set)))

    # Parenthesised single-argument print: same output on Python 2.
    print("freq\tcompatFP\tcompatFN\tcompatSD\tallFP\tallFN\tallSD")
    for all_el, compat_el in itertools.izip(all_splits_by_freq,
                                            compat_splits_by_freq):
        freq = all_el[0]
        all_sp = all_el[1]
        # The sets accumulate across rows, so each row covers every split at
        # this frequency or higher.
        all_set.update(all_sp)
        all_fn = len(ref_set - all_set)
        all_fp = len(all_set - ref_set)
        compat_sp = compat_el[1]
        compat_set.update(compat_sp)
        compat_fn = len(ref_set - compat_set)
        compat_fp = len(compat_set - ref_set)

        print("%f\t%d\t%d\t%d\t%d\t%d\t%d" % (freq, compat_fp, compat_fn,
                                              compat_fp + compat_fn, all_fp,
                                              all_fn, all_fp + all_fn))
예제 #28
0
dataset = Dataset(taxa_blocks=taxa_blocks)

#setting this > 1.0 means that more trees are retained to the neighborhood search stage
score_diff_multiplier = 1.0

commands = []
# first we collect all of the ParsedTree objects into all_parsed_trees and we
#   call encode_splits so that we can look up split info on each tree
# NOTE: all_tree_groups is extended in place with the neighborhood groups.
all_tree_groups.extend(nbhd_tree_groups)
all_parsed_trees = []
for g in all_tree_groups:
    for el in g:
        # Parse the element's newick string and attach the tree to it.
        newick_string = el.tree_string
        newick_stream = StringIO(newick_string)
        t = dataset.read_trees(newick_stream, format="newick")[0]
        encode_splits(t)
        el.tree = t
        all_parsed_trees.append(el)
# The first parsed tree's root clade mask must equal all_taxa_bitmask.
assert (all_taxa_bitmask == all_parsed_trees[0].tree.seed_node.edge.clade_mask)

########################################
# First, we make sure that there are not duplicate topologies
# Because we reverse sort, we'll be retaining the tree with the
#   best score
#####
# NOTE(review): relies on the ordering defined for the parsed-tree objects
#   (presumably by score) -- confirm against their class.
all_parsed_trees.sort(reverse=True)
set_of_split_sets = set()
unique_topos = []
for tm in all_parsed_trees:
    # add_nontriv_splits_attr sets `splits` and `split_set` on the tree;
    # both are copied onto the parsed-tree element here.
    add_nontriv_splits_attr(tm.tree, all_taxa_bitmask)
    tm.splits, tm.split_set = tm.tree.splits, tm.tree.split_set
예제 #29
0
def _scm_root_to_tip_paths(split_edges, leaf_intersection):
    """Groups the edges in `split_edges` by the split that each one induces on
    the leaves in `leaf_intersection`.

    `split_edges` is a mapping whose values are edges with a `clade_mask`
    attribute.  The keys are ignored; each edge's own `clade_mask` is used.
    Edges whose masked split is empty or equal to `leaf_intersection` are
    skipped: they do not divide the common leaf set.

    Returns a dict mapping each masked split to the list of edges that share
    it, sorted by decreasing `clade_mask`.  The caller is expected to have
    rooted the tree so that none of these paths cross the root; in that case
    the clade_masks for deeper edges are supersets of the clade_masks for
    shallower ones, so the reverse sort yields the edges in root->tip order.
    """
    paths = {}
    for _key, edge in split_edges.items():
        full_split = edge.clade_mask
        masked = full_split & leaf_intersection
        if masked and masked != leaf_intersection:
            paths.setdefault(masked, []).append((full_split, edge))
    for decorated in paths.values():
        # Sort on the mask only, so that two edges with equal masks are never
        #   compared with each other (which may not be supported).
        decorated.sort(key=lambda pair: pair[0], reverse=True)
        decorated[:] = [edge for _mask, edge in decorated]
    return paths


def add_to_scm(to_modify, to_consume, rooted=False, gordons_supertree=False):
    """Adds the tree `to_consume` to the tree `to_modify` in a strict consensus
    merge operation.  Both trees must have had encode_splits called on them.

    Both trees must share the same `taxa_block` object (checked with an
    assert).  `to_modify` is changed in place and nothing is returned.
    `to_consume` is altered as well (it is rerooted, may have edges collapsed
    and has subtrees attached to `to_modify`), so it should be treated as
    used up by the call.

    `rooted` must be false: the rooted form raises NotImplementedError.
    `gordons_supertree` selects the alternative handling of a collision, i.e.
    the case in which both trees have a path of more than one edge for the
    same split of the common leaves.

    Raises ValueError if the trees have fewer than 2 leaves in common.
    """
    assert (to_modify.taxa_block is to_consume.taxa_block)
    taxa_block = to_consume.taxa_block
    if rooted:
        raise NotImplementedError("rooted form of add_to_scm not implemented")
    to_mod_root = to_modify.seed_node
    to_mod_split = to_mod_root.edge.clade_mask

    to_consume_root = to_consume.seed_node
    to_consume_split = to_consume_root.edge.clade_mask

    # bitmask of the leaves that are present in both trees
    leaf_intersection = to_mod_split & to_consume_split
    if IS_DEBUG_LOGGING:
        _LOG.debug("add_to_scm:\n  %s\n  + %s\n%s" %
                   (str(to_modify), str(to_consume),
                    format_split(leaf_intersection, taxa=taxa_block)))

    n_common_leaves = count_bits(leaf_intersection)
    if n_common_leaves < 2:
        _LOG.error('trees must have at least 2 common leaves')
        raise ValueError('trees must have at least 2 common leaves')
    if n_common_leaves == 2:
        # SCM with 2 leaves in common results in a polytomy
        collapse_clade(to_mod_root)
        collapse_clade(to_consume_root)
        # only the children that are not shared with to_modify are moved over
        leaves_to_steal = [
            c for c in to_consume_root.child_nodes()
            if not (leaf_intersection & c.edge.clade_mask)
        ]
        for leaf in leaves_to_steal:
            to_mod_root.add_child(leaf)
            to_mod_root.edge.clade_mask |= leaf.edge.clade_mask
        # rebuild the split -> edge lookup by hand for the resulting star tree
        to_modify.split_edges = {to_mod_root.edge.clade_mask: to_mod_root.edge}
        for child in to_mod_root.child_nodes():
            to_modify.split_edges[child.edge.clade_mask] = child.edge
        return

    # at least 3 leaves in common
    tmse = to_modify.split_edges

    # Only the unrooted form reaches this point (`rooted` raised above), so
    #   both trees are always rerooted on the common leaf set here.
    if IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(splits=True, logger_obj=_LOG)

    reroot_on_lowest_common_index_path(to_modify, leaf_intersection)
    reroot_on_lowest_common_index_path(to_consume, leaf_intersection)

    if IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(splits=True, logger_obj=_LOG)

    # rerooting must not have changed the leaf set of either tree
    to_mod_root = to_modify.seed_node
    assert (to_mod_root.edge.clade_mask == to_mod_split)
    to_consume_root = to_consume.seed_node
    assert (to_consume_root.edge.clade_mask == to_consume_split)

    # For each tree: masked split -> edges with that masked split, root->tip
    to_mod_relevant_splits = _scm_root_to_tip_paths(tmse, leaf_intersection)
    to_consume_relevant_splits = _scm_root_to_tip_paths(
        to_consume.split_edges, leaf_intersection)
    if IS_DEBUG_LOGGING:
        to_modify.debug_check_tree(splits=True, logger_obj=_LOG)
        to_consume.debug_check_tree(splits=True, logger_obj=_LOG)

    # first we'll collapse all paths in the common leafset in to_modify that
    #   are not in to_consume
    _collapse_paths_not_found(to_mod_relevant_splits,
                              to_consume_relevant_splits, tmse)
    # Now we'll collapse all paths in the common leafset in to_consume that
    #   are not in to_modify
    _collapse_paths_not_found(to_consume_relevant_splits,
                              to_mod_relevant_splits)

    # first we'll deal with subtrees that are:
    #       - not in the leaf intersection set, and
    #       - attached to "relevant" nodes
    # We simply move these subtrees from the to_consume tree to the appropriate
    #   node in to_modify
    to_steal = [
        i for i in to_consume_root.child_nodes()
        if (i.edge.clade_mask & leaf_intersection) == 0
    ]
    for child in to_steal:
        to_mod_root.add_child(child)
        to_mod_root.edge.clade_mask |= child.edge.clade_mask

    for masked_split, to_consume_path in to_consume_relevant_splits.items():
        to_mod_path = to_mod_relevant_splits.get(masked_split)
        if IS_DEBUG_LOGGING and to_mod_path is None:
            # dump enough context to diagnose the assertion failure below
            _LOG.debug("%s = mask" %
                       format_split(leaf_intersection, taxa=taxa_block))
            _LOG.debug("%s = masked" %
                       format_split(masked_split, taxa=taxa_block))
            _LOG.debug(
                "%s = raw" %
                format_split(to_consume_path[-1].clade_mask, taxa=taxa_block))
            for k, v in to_mod_relevant_splits.items():
                _LOG.debug("%s in to_mod_relevant_splits" %
                           format_split(k, taxa=taxa_block))

        assert to_mod_path is not None
        # The tipmost edges of the two paths correspond to each other; move
        #   the subtrees hanging off the to_consume one over to to_modify.
        to_mod_head = to_mod_path[-1].head_node
        to_mod_head_edge = to_mod_head.edge
        to_consume_head = to_consume_path[-1].head_node
        for child in to_consume_head.child_nodes():
            if (child.edge.clade_mask & leaf_intersection) == 0:
                # child is the root of a subtree that has no children in the leaf_intersection
                to_mod_head.add_child(child)
                to_mod_head_edge.clade_mask |= child.edge.clade_mask
        if len(to_consume_path) > 1:
            if len(to_mod_path) > 1:
                # collision
                if gordons_supertree:
                    for edge in to_mod_path[2:]:
                        p = edge.tail_node
                        c = edge.head_node
                        sibs = p.child_nodes()
                        for sib in sibs:
                            # composing the newick string is only worth the
                            #   cost when it is going to be logged
                            if IS_DEBUG_LOGGING:
                                _LOG.debug("sib is %s" %
                                           (sib.compose_newick()))
                            if sib is not c:
                                if not sib.is_leaf():
                                    collapse_clade(sib)
                                    collapse_edge(sib.edge)
                        collapse_edge(p.edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            if IS_DEBUG_LOGGING:
                                _LOG.debug("child is %s" %
                                           (child.compose_newick()))
                            if child is not avoid:
                                mid_node.add_child(child)
                                collapse_clade(child)
                                if not child.is_leaf():
                                    collapse_edge(child.edge)
                                mid_node.edge.clade_mask |= child.edge.clade_mask
                else:
                    # collapse the internal edges of the to_modify path, then
                    #   hang every side branch of the to_consume path on the
                    #   head node of the first to_modify edge
                    for edge in to_mod_path[1:-1]:
                        collapse_edge(edge)
                    mid_node = to_mod_path[0].head_node
                    for edge in to_consume_path[1:]:
                        p = edge.tail_node
                        avoid = edge.head_node
                        for child in p.child_nodes():
                            if child is not avoid:
                                mid_node.add_child(child)
                                mid_node.edge.clade_mask |= child.edge.clade_mask
            else:
                # we have to move the subtrees from to_consume to to_modify
                to_mod_edge = to_mod_path[0]
                to_mod_tail, to_mod_head = to_mod_edge.tail_node, to_mod_edge.head_node
                deepest_edge_to_move = to_consume_path[0]
                deepest_node_to_move = deepest_edge_to_move.head_node
                tipmost_edge_to_move = to_consume_path[-1]
                tipmost_node_to_move = tipmost_edge_to_move.tail_node
                prev_head = tipmost_edge_to_move.head_node

                # splice the chain of to_consume nodes in where the single
                #   to_modify edge used to be
                to_mod_tail.add_child(deepest_node_to_move)
                to_mod_tail.remove_child(to_mod_head)
                tipmost_node_to_move.add_child(to_mod_head)
                tipmost_node_to_move.remove_child(prev_head)
    # the topology of to_modify has changed, so its split info is recomputed
    encode_splits(to_modify)