示例#1
0
def tree_iterator(fname, restrict_species=None, start_line=None, end_line=None):
    """Yield the trees stored in a ``treeid<TAB>newick`` text file.

    Lines are numbered from 0 and every physical line counts, including
    comments and blank lines. Lines starting with ``#`` and blank lines are
    skipped.

    Args:
        fname: path to the tab-delimited tree file.
        restrict_species: optional container of species names; when given,
            each tree is pruned to the leaves whose ``species`` is in it.
        start_line: optional index of the first line to read (inclusive).
        end_line: optional index at which reading stops (exclusive).

    Yields:
        ``(treeid, tree, n2content)`` tuples, where ``n2content`` is the
        result of ``tree.get_cached_content()``. Trees whose root content
        has fewer than two entries are skipped.
    """
    # The context manager guarantees the file is closed when the generator
    # is exhausted, closed or garbage collected (the handle used to leak).
    with open(fname) as handle:
        for ln, line in enumerate(handle):
            if start_line is not None and ln < start_line:
                continue
            if end_line is not None and ln >= end_line:
                break

            if line.startswith("#") or not line.strip():
                continue
            treeid, newick = line.split("\t")
            t = PhyloTree(newick, sp_naming_function=extract_species)
            if restrict_species:
                t.prune([n for n in t.iter_leaves() if n.species in restrict_species])

            n2content = t.get_cached_content()
            if len(n2content[t]) < 2:
                continue
            yield treeid, t, n2content
示例#2
0
def tree_iterator(fname,
                  restrict_species=None,
                  start_line=None,
                  end_line=None):
    """Iterate over the trees of a tab-delimited ``treeid<TAB>newick`` file.

    Line indexes are 0-based and count every physical line of the file
    (comment and blank lines included). Comment lines (starting with ``#``)
    and blank lines never produce a tree.

    Args:
        fname: path to the tree file.
        restrict_species: optional container of species names; when given,
            only leaves whose ``species`` is in it are kept (via ``prune``).
        start_line: optional first line index to process (inclusive).
        end_line: optional line index at which iteration stops (exclusive).

    Yields:
        ``(treeid, tree, n2content)`` where ``n2content`` comes from
        ``tree.get_cached_content()``. A tree is skipped when the cached
        content of its root holds fewer than two entries.
    """
    # Open through a context manager so the handle is released once the
    # generator finishes or is closed; previously it was never closed.
    with open(fname) as tree_file:
        for ln, line in enumerate(tree_file):
            if start_line is not None and ln < start_line:
                continue
            if end_line is not None and ln >= end_line:
                break

            if line.startswith("#") or not line.strip():
                continue
            treeid, newick = line.split("\t")
            t = PhyloTree(newick, sp_naming_function=extract_species)
            if restrict_species:
                t.prune(
                    [n for n in t.iter_leaves() if n.species in restrict_species])

            n2content = t.get_cached_content()
            if len(n2content[t]) < 2:
                continue
            yield treeid, t, n2content
示例#3
0
def main(argv):
    """Compare target trees against a reference tree and report distances.

    Parses ``argv``, loads the reference tree given with ``-r`` and writes
    one row per target tree with the columns listed in ``HEADER``. Rows go
    to the tab-delimited file given with ``-o`` or, when ``-o`` is omitted,
    to a fixed-width table on stdout.

    With ``--extract_species`` each target is loaded as a ``PhyloTree``,
    split with ``get_speciation_trees`` and the Robinson-Foulds results are
    aggregated over the resulting subtrees (only when fewer than 1000
    subtrees are produced). Otherwise the target is compared directly with
    the reference tree. Metrics that were not computed are reported as -1.

    Args:
        argv: list of command line arguments, handed to ``parse_args``.

    Exits with status 1 when both positional target trees and
    ``--targets_file`` are supplied.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees",
                        metavar='target_trees',
                        type=str,
                        nargs="*",
                        help='a list of target tree files')

    parser.add_argument(
        "--targets_file",
        dest="targets_file",
        type=str,
        help="""path to a file containing target trees, one per line""")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="""Path to the tab delimited report file""")

    parser.add_argument("-r",
                        dest="reftree",
                        type=str,
                        required=True,
                        help="""Reference tree""")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        nargs="+",
        help="""outgroup used to root reference and target trees before distance computation""")

    parser.add_argument("--expand_polytomies",
                        dest="polytomies",
                        action="store_true",
                        help="""expand politomies if necessary""")

    parser.add_argument("--unrooted",
                        dest="unrooted",
                        action="store_true",
                        help="""compare trees as unrooted""")

    parser.add_argument(
        "--min_support",
        dest="min_support",
        type=float,
        default=0.0,
        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    parser.add_argument(
        "--extract_species",
        dest="extract_species",
        action="store_true",
        help="""When used, reference tree is assumed to contain species names, while target trees as expected to be gene trees. Species name will be extracted from gene tree nodes and treeko will be used if duplication events are found.""")

    parser.add_argument("--spname_delimiter",
                        dest="spname_delimiter",
                        type=str,
                        default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument(
        "--spname_field",
        dest="spname_field",
        type=int,
        default=-1,
        help=("position of the species code extracted from node names. -1 = last field"))

    parser.add_argument("--collateral",
                        dest="collateral",
                        action='store_true',
                        help=(""))

    parser.add_argument("--ref_attr",
                        dest="ref_attr",
                        type=str,
                        help=("attribute in ref tree used as leaf name"))

    parser.add_argument("--target_attr",
                        dest="target_attr",
                        type=str,
                        help=("attribute in target tree used as leaf name"))

    args = parser.parse_args(argv)
    sys.stdout.write("%s\n" % __DESCRIPTION__)
    reftree = args.reftree
    if args.targets_file and args.target_trees:
        sys.stderr.write('The use of targets_file and targets at the same time is not supported.\n')
        sys.exit(1)

    if args.targets_file:
        target_trees = tree_iterator(args.targets_file)
    else:
        target_trees = args.target_trees

    def species_of(name):
        # Species code = the configured field of the delimited node name.
        return name.split(args.spname_delimiter)[args.spname_field]

    def root_on_outgroup(tree):
        # Root `tree` on the requested outgroup; no-op without --outgroup.
        if not args.outgroup:
            return
        if len(args.outgroup) > 1:
            out = tree.get_common_ancestor(args.outgroup)
        else:
            out = tree.search_nodes(name=args.outgroup[0])[0]
        tree.set_outgroup(out)

    t = Tree(reftree)

    if args.ref_attr:
        for lf in t.iter_leaves():
            lf._origname = lf.name
            if args.ref_attr not in lf.features:
                # Show the leaf lacking the attribute; the getattr below may
                # then fail with AttributeError.
                sys.stdout.write("%s\n" % str(lf))
            lf.name = getattr(lf, args.ref_attr)

    root_on_outgroup(t)

    reftree_len = len(t)
    reftree_edges = (reftree_len * 2) - 2

    HEADER = ("target tree", 'dups', 'subtrees', 'used trees', 'treeko', "RF",
              "maxRF", 'normRF', "%reftree", "%genetree", "avgSize", "minSize",
              "common tips", "refSize", "targetSize")
    COL_WIDTHS = [20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]

    OUT = open(args.output, "w") if args.output else None
    # try/finally guarantees the report file is closed even when a target
    # tree fails to load or compare.
    try:
        if OUT is not None:
            OUT.write('# ' + ctime() + '\n')
            OUT.write('# ' + ' '.join(sys.argv) + '\n')
            OUT.write('#' + '\t'.join(HEADER) + '\n')
        else:
            sys.stdout.write('# ' + ctime() + '\n')
            sys.stdout.write('# ' + ' '.join(sys.argv) + '\n')
            print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

        for counter, tfile in enumerate(target_trees):
            if args.targets_file:
                # NOTE(review): entries produced by tree_iterator are
                # unpacked as (seedid, tree) pairs -- confirm against the
                # iterator actually used by this script.
                seedid, tfile = tfile
            else:
                seedid = None

            if args.extract_species:
                tt = PhyloTree(tfile, sp_naming_function=species_of)
            else:
                tt = Tree(tfile)

            if args.target_attr:
                for lf in tt.iter_leaves():
                    lf._origname = lf.name
                    lf.name = getattr(lf, args.target_attr)

            root_on_outgroup(tt)

            if args.target_trees:
                fname = os.path.basename(tfile)
            else:
                fname = '%05d' % counter

            # -1 marks "not computed" in every report column.
            min_size, avg_size = -1, -1
            total_rf, max_rf, norm_rf = -1, -1, -1
            treeko_d = -1
            ref_branches_in_target, target_branches_in_ref = -1, -1
            target_tree_len = -1
            used_subtrees = -1
            # Must be reset per tree: it used to be undefined (NameError) or
            # stale from the previous tree when ntrees >= 1000.
            common_names = -1

            if args.extract_species:
                ntrees, ndups, sp_trees = tt.get_speciation_trees(
                    autodetect_duplications=True, newick_only=True)

                if ntrees < 1000:
                    all_rf = []
                    ref_found = []
                    target_found = []
                    tree_sizes = []
                    all_max_rf = []
                    common_names = 0

                    for subtree_nw in sp_trees:
                        # Without --collateral only subtrees mentioning the
                        # seed id are used.
                        if seedid and not args.collateral and (seedid not in subtree_nw):
                            continue
                        subtree = PhyloTree(subtree_nw,
                                            sp_naming_function=species_of)

                        # only necessary if rf function is going to filter by
                        # support value. It slows down the analysis.
                        if args.min_support:
                            subtree_content = subtree.get_cached_content(
                                store_attr='name')
                            for n in subtree.traverse():
                                if n.children:
                                    n.support = tt.get_common_ancestor(
                                        subtree_content[n]).support

                        rf, maxr, common, p1, p2, d1, d2 = t.robinson_foulds(
                            subtree,
                            expand_polytomies=args.polytomies,
                            unrooted_trees=args.unrooted,
                            attr_t2='species',
                            min_support_t2=args.min_support)
                        if maxr > 0 and p1 and p2:
                            all_rf.append(rf)
                            tree_sizes.append(len(common))
                            all_max_rf.append(maxr)
                            common_names = max(common_names, len(common))

                            ref_found.append(float(len(p2 & p1)) / reftree_edges)
                            valid_target = p2 - d2
                            # valid edges in target not leaves
                            if any(len(p[0]) > 1 and len(p[1]) > 1
                                   for p in valid_target):
                                incompatible_target_branches = float(
                                    len(valid_target - p1))
                                target_found.append(
                                    1 - (incompatible_target_branches /
                                         len(valid_target)))

                    if all_rf:
                        # Treeko speciation distance: normalised RF of every
                        # subtree weighted by its number of common tips.
                        alld = [dist / float(bound)
                                for dist, bound in zip(all_rf, all_max_rf)]
                        a = numpy.sum([d * size
                                       for d, size in zip(alld, tree_sizes)])
                        b = float(numpy.sum(tree_sizes))
                        treeko_d = a / b
                        total_rf = numpy.mean(all_rf)
                        norm_rf = numpy.mean(alld)
                        max_rf = numpy.max(all_max_rf)
                        ref_branches_in_target = numpy.mean(ref_found)
                        target_branches_in_ref = numpy.mean(
                            target_found) if target_found else -1
                        target_tree_len = numpy.mean(tree_sizes)
                        used_subtrees = len(all_rf)
            else:
                target_tree_len = len(tt)
                ndups, ntrees, used_subtrees = 0, 1, 1
                total_rf, max_rf, common, p1, p2, d1, d2 = tt.robinson_foulds(
                    t,
                    expand_polytomies=args.polytomies,
                    unrooted_trees=args.unrooted)
                common_names = len(common)
                if max_rf:
                    norm_rf = total_rf / float(max_rf)
                if p1 and p2:
                    sizes = [len(p) for p in p2 ^ p1]
                    if sizes:
                        avg_size = sum(sizes) / float(len(sizes))
                        min_size = min(sizes)
                    else:
                        min_size, avg_size = 0, 0

                    ref_branches_in_target = float(len(p2 & p1)) / reftree_edges
                else:
                    ref_branches_in_target = 0.0
                    target_branches_in_ref = 0.0
                    min_size, avg_size = -1, -1

            if OUT is not None:
                OUT.write('\t'.join(
                    map(str, (fname, ndups, ntrees, used_subtrees, treeko_d,
                              total_rf, max_rf, norm_rf, ref_branches_in_target,
                              target_branches_in_ref, avg_size, min_size,
                              common_names, reftree_len, target_tree_len))) + '\n')
            else:
                print_table([[
                    istr(value) for value in
                    (fname[-30:], ndups, ntrees, used_subtrees, treeko_d,
                     total_rf, max_rf, norm_rf,
                     '%0.4f' % ref_branches_in_target,
                     '%0.4f' % target_branches_in_ref, avg_size, min_size,
                     common_names, reftree_len, target_tree_len)
                ]],
                            fix_col_width=COL_WIDTHS,
                            wrap_style='cut')
    finally:
        if OUT is not None:
            OUT.close()
示例#4
0
def get_supported_branches(source_tree, reftree, refclades, seedid=None):
    """
        Given a reference species tree and a rooted gene tree in which
        duplication events are already mapped, this function does the following:

          - Split gene tree into all possible species trees (Treeko method)

          - Find matches between each subtree branch and all branches in the
            reference tree.

              - Each branch in each species subtree is compared to all branches
                in the reftree. If the left/right sides of the subtree branch
                coincide with the left/right sides of a reference tree branch,
                this is considered a gene tree support point. Coincidences must
                comply with the following conditions:

                   - All species in the left/right sides of the subtree branch
                     exist in the left/right sides of the reference branch.

                   - Species in the left/right sides of the reference branch are
                     never mixed in the subtree branch.

                   - Missing species are allowed in the subtree split, only if
                     such species are not present in any other part of the
                     species subtree being inspected.

        Arguments:

          - source_tree: gene tree providing ``get_speciation_trees``.

          - reftree: reference tree (not used directly; the splits to test
            come from ``refclades``).

          - refclades: iterable of ``(refnode, m1, m2)`` where ``m1`` and
            ``m2`` are the species sets at both sides of a reference branch.

          - seedid: optional string; species subtrees whose newick does not
            contain it are recorded as collateral support.

        Returns a tuple ``(branch2supports, branch2coll_supports)``, each one
        mapping reference nodes to a list of 1 (supporting split) and 0
        (conflicting split) values. Two empty dicts are returned when more
        than 100000 species trees would be generated.
    """

    # Run Treeko to get all possible species tree combinations. We assume dups are already mapped
    ntrees, ndups, sp_trees = source_tree.get_speciation_trees(
        autodetect_duplications=DETECT_DUPLICATIONS, newick_only=True)
    if ntrees > 100000:
        return {}, {}

    branch2supports = defaultdict(list)
    branch2coll_supports = defaultdict(list)
    for nw in sp_trees:
        # Use all treeko trees or only those subtrees containing the seed?
        if seedid and seedid not in nw:
            container = branch2coll_supports
        else:
            container = branch2supports

        subtree = PhyloTree(nw, sp_naming_function=extract_species)
        subtreenode2content = subtree.get_cached_content(store_attr="species")
        all_sp_in_subtree = subtreenode2content[subtree]

        # Visit all internal nodes in the tree
        for n in subtree.traverse("preorder"):
            if n.is_leaf():
                continue
            # Only the first two children are inspected (binary split).
            c1 = subtreenode2content[n.children[0]]
            c2 = subtreenode2content[n.children[1]]
            # Invariant for every reference clade, so computed once per node.
            all_seen_sp = c1 | c2

            for refnode, m1, m2 in refclades:
                all_expected_sp = m1 | m2

                # We add one supporting point to every observed split that
                # coincides with a reference tree branch. This is, seqs in one
                # side and seqs on the other side of the observed split match
                # a ref_tree branch without having extra seqs in any of the
                # sides. However, we allow for split matches where some seqs
                # are lost in the observed split.
                false_missing = (all_expected_sp -
                                 all_seen_sp) & all_sp_in_subtree
                outside_species = all_seen_sp - all_expected_sp

                # Compare expected (m1,m2) splits with observed splits (c1,c2).
                a_straight = m1 & c1
                b_straight = m2 & c2
                a_cross = m1 & c2
                b_cross = m2 & c1

                straight_match = bool(a_straight and b_straight)
                cross_match = bool(a_cross and b_cross)
                if not (straight_match or cross_match):
                    # The observed split says nothing about this ref branch.
                    continue

                # A match only counts as support when it contains all the
                # observed species, species from both sides are not mixed and
                # missing species are real. Both orientations can never be
                # clean at the same time, so at most one support point is
                # produced (the former interactive "two matches" check was
                # unreachable).
                clean = not outside_species and not false_missing
                supported = clean and (
                    (straight_match and not a_cross and not b_cross) or
                    (cross_match and not a_straight and not b_straight))
                container[refnode].append(1 if supported else 0)

    return branch2supports, branch2coll_supports
示例#5
0
文件: ete_dist.py 项目: daisieh/ete
def main(argv):
    """Compare target trees against a reference tree and report distances.

    Parses ``argv``, loads the reference tree given with ``-r`` and writes
    one row per target tree with the columns listed in ``HEADER``. Rows go
    to the tab-delimited file given with ``-o`` or, when ``-o`` is omitted,
    to a fixed-width table on stdout.

    With ``--extract_species`` each target is loaded as a ``PhyloTree``,
    split with ``get_speciation_trees`` and the Robinson-Foulds results are
    aggregated over the resulting subtrees (only when fewer than 1000
    subtrees are produced). Otherwise the target is compared directly with
    the reference tree. Metrics that were not computed are reported as -1.

    Args:
        argv: list of command line arguments, handed to ``parse_args``.

    Exits with status 1 when both positional target trees and
    ``--targets_file`` are supplied.
    """
    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                            formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees", metavar='target_trees', type=str, nargs="*",
                        help='a list of target tree files')

    parser.add_argument("--targets_file", dest="targets_file",
                        type=str,
                        help="""path to a file containing target trees, one per line""")

    parser.add_argument("-o", dest="output",
                        type=str,
                        help="""Path to the tab delimited report file""")

    parser.add_argument("-r", dest="reftree",
                        type=str, required=True,
                        help="""Reference tree""")

    parser.add_argument("--outgroup", dest="outgroup",
                        nargs="+",
                        help="""outgroup used to root reference and target trees before distance computation""")

    parser.add_argument("--expand_polytomies", dest="polytomies",
                        action="store_true",
                        help="""expand politomies if necessary""")

    parser.add_argument("--unrooted", dest="unrooted",
                        action="store_true",
                        help="""compare trees as unrooted""")

    parser.add_argument("--min_support", dest="min_support",
                        type=float, default=0.0,
                        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    parser.add_argument("--extract_species", dest="extract_species",
                        action="store_true",
                        help="""When used, reference tree is assumed to contain species names, while target trees as expected to be gene trees. Species name will be extracted from gene tree nodes and treeko will be used if duplication events are found.""")

    parser.add_argument("--spname_delimiter", dest="spname_delimiter",
                        type=str, default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument("--spname_field", dest="spname_field",
                        type=int, default=-1,
                        help=("position of the species code extracted from node names. -1 = last field"))

    parser.add_argument("--collateral", dest="collateral",
                        action='store_true',
                        help=(""))

    parser.add_argument("--ref_attr", dest="ref_attr",
                        type=str,
                        help=("attribute in ref tree used as leaf name"))

    parser.add_argument("--target_attr", dest="target_attr",
                        type=str,
                        help=("attribute in target tree used as leaf name"))

    args = parser.parse_args(argv)
    sys.stdout.write("%s\n" % __DESCRIPTION__)
    reftree = args.reftree
    if args.targets_file and args.target_trees:
        sys.stderr.write('The use of targets_file and targets at the same time is not supported.\n')
        sys.exit(1)

    if args.targets_file:
        target_trees = tree_iterator(args.targets_file)
    else:
        target_trees = args.target_trees

    def species_of(name):
        # Species code = the configured field of the delimited node name.
        return name.split(args.spname_delimiter)[args.spname_field]

    def root_on_outgroup(tree):
        # Root `tree` on the requested outgroup; no-op without --outgroup.
        if not args.outgroup:
            return
        if len(args.outgroup) > 1:
            out = tree.get_common_ancestor(args.outgroup)
        else:
            out = tree.search_nodes(name=args.outgroup[0])[0]
        tree.set_outgroup(out)

    t = Tree(reftree)

    if args.ref_attr:
        for lf in t.iter_leaves():
            lf._origname = lf.name
            if args.ref_attr not in lf.features:
                # Show the leaf lacking the attribute; the getattr below may
                # then fail with AttributeError.
                sys.stdout.write("%s\n" % str(lf))
            lf.name = getattr(lf, args.ref_attr)

    root_on_outgroup(t)

    reftree_len = len(t)
    reftree_edges = (reftree_len * 2) - 2

    HEADER = ("target tree", 'dups', 'subtrees', 'used trees', 'treeko', "RF", "maxRF", 'normRF', "%reftree", "%genetree", "avgSize", "minSize", "common tips", "refSize", "targetSize")
    COL_WIDTHS = [20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]

    OUT = open(args.output, "w") if args.output else None
    # try/finally guarantees the report file is closed even when a target
    # tree fails to load or compare.
    try:
        if OUT is not None:
            OUT.write('# ' + ctime() + '\n')
            OUT.write('# ' + ' '.join(sys.argv) + '\n')
            OUT.write('#' + '\t'.join(HEADER) + '\n')
        else:
            sys.stdout.write('# ' + ctime() + '\n')
            sys.stdout.write('# ' + ' '.join(sys.argv) + '\n')
            print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

        for counter, tfile in enumerate(target_trees):
            if args.targets_file:
                # NOTE(review): entries produced by tree_iterator are
                # unpacked as (seedid, tree) pairs -- confirm against the
                # iterator actually used by this script.
                seedid, tfile = tfile
            else:
                seedid = None

            if args.extract_species:
                tt = PhyloTree(tfile, sp_naming_function=species_of)
            else:
                tt = Tree(tfile)

            if args.target_attr:
                for lf in tt.iter_leaves():
                    lf._origname = lf.name
                    lf.name = getattr(lf, args.target_attr)

            root_on_outgroup(tt)

            if args.target_trees:
                fname = os.path.basename(tfile)
            else:
                fname = '%05d' % counter

            # -1 marks "not computed" in every report column.
            min_size, avg_size = -1, -1
            total_rf, max_rf, norm_rf = -1, -1, -1
            treeko_d = -1
            ref_branches_in_target, target_branches_in_ref = -1, -1
            target_tree_len = -1
            used_subtrees = -1
            # Must be reset per tree: it used to be undefined (NameError) or
            # stale from the previous tree when ntrees >= 1000.
            common_names = -1

            if args.extract_species:
                ntrees, ndups, sp_trees = tt.get_speciation_trees(autodetect_duplications=True, newick_only=True)

                if ntrees < 1000:
                    all_rf = []
                    ref_found = []
                    target_found = []
                    tree_sizes = []
                    all_max_rf = []
                    common_names = 0

                    for subtree_nw in sp_trees:
                        # Without --collateral only subtrees mentioning the
                        # seed id are used.
                        if seedid and not args.collateral and (seedid not in subtree_nw):
                            continue
                        subtree = PhyloTree(subtree_nw, sp_naming_function=species_of)

                        # only necessary if rf function is going to filter by
                        # support value. It slows down the analysis.
                        if args.min_support:
                            subtree_content = subtree.get_cached_content(store_attr='name')
                            for n in subtree.traverse():
                                if n.children:
                                    n.support = tt.get_common_ancestor(subtree_content[n]).support

                        rf, maxr, common, p1, p2, d1, d2 = t.robinson_foulds(subtree, expand_polytomies=args.polytomies, unrooted_trees=args.unrooted,
                                                                             attr_t2='species', min_support_t2=args.min_support)
                        if maxr > 0 and p1 and p2:
                            all_rf.append(rf)
                            tree_sizes.append(len(common))
                            all_max_rf.append(maxr)
                            common_names = max(common_names, len(common))

                            ref_found.append(float(len(p2 & p1)) / reftree_edges)
                            valid_target = p2 - d2
                            # valid edges in target not leaves
                            if any(len(p[0]) > 1 and len(p[1]) > 1 for p in valid_target):
                                incompatible_target_branches = float(len(valid_target - p1))
                                target_found.append(1 - (incompatible_target_branches / len(valid_target)))

                    if all_rf:
                        # Treeko speciation distance: normalised RF of every
                        # subtree weighted by its number of common tips.
                        alld = [dist / float(bound) for dist, bound in zip(all_rf, all_max_rf)]
                        a = numpy.sum([d * size for d, size in zip(alld, tree_sizes)])
                        b = float(numpy.sum(tree_sizes))
                        treeko_d = a / b
                        total_rf = numpy.mean(all_rf)
                        norm_rf = numpy.mean(alld)
                        max_rf = numpy.max(all_max_rf)
                        ref_branches_in_target = numpy.mean(ref_found)
                        target_branches_in_ref = numpy.mean(target_found) if target_found else -1
                        target_tree_len = numpy.mean(tree_sizes)
                        used_subtrees = len(all_rf)
            else:
                target_tree_len = len(tt)
                ndups, ntrees, used_subtrees = 0, 1, 1
                total_rf, max_rf, common, p1, p2, d1, d2 = tt.robinson_foulds(t, expand_polytomies=args.polytomies, unrooted_trees=args.unrooted)
                common_names = len(common)
                if max_rf:
                    norm_rf = total_rf / float(max_rf)
                if p1 and p2:
                    sizes = [len(p) for p in p2 ^ p1]
                    if sizes:
                        avg_size = sum(sizes) / float(len(sizes))
                        min_size = min(sizes)
                    else:
                        min_size, avg_size = 0, 0

                    ref_branches_in_target = float(len(p2 & p1)) / reftree_edges
                else:
                    ref_branches_in_target = 0.0
                    target_branches_in_ref = 0.0
                    min_size, avg_size = -1, -1

            if OUT is not None:
                OUT.write('\t'.join(map(str, (fname, ndups, ntrees, used_subtrees, treeko_d, total_rf, max_rf, norm_rf, ref_branches_in_target, target_branches_in_ref,
                                              avg_size, min_size, common_names, reftree_len, target_tree_len))) + '\n')
            else:
                print_table([[istr(value) for value in (fname[-30:], ndups, ntrees, used_subtrees, treeko_d, total_rf, max_rf, norm_rf, '%0.4f' % ref_branches_in_target, '%0.4f' % target_branches_in_ref,
                                                        avg_size, min_size, common_names, reftree_len, target_tree_len)]], fix_col_width=COL_WIDTHS, wrap_style='cut')
    finally:
        if OUT is not None:
            OUT.close()
示例#6
0
def get_supported_branches(source_tree, reftree, refclades, seedid=None):
    """
        Given a reference species tree and a rooted gene tree in which
        duplication events are already mapped, this function does the following:

          - Split gene tree into all possible species trees (Treeko method)

          - Find matches between each subtree branch and all branches in the
            reference tree.

              - Each branch in each species subtree is compared to all branches
                in the reftree. If the left/right sides of the subtree branch
                coincide with the left/right sides of a reference tree branch,
                this is considered a gene tree support point. Coincidences must
                comply with the following conditions:

                   - All species in the left/right sides of the subtree branch
                     exist in the left/right sides of the reference branch.

                   - Species in the left/right sides of the reference branch are
                     never mixed in the subtree branch.

                   - Missing species are allowed in the subtree split, only if
                     such species are not present in any other part of the
                     species subtree being inspected.

        Arguments:

          - source_tree: gene tree providing ``get_speciation_trees``.

          - reftree: reference tree (not used directly; the splits to test
            come from ``refclades``).

          - refclades: iterable of ``(refnode, m1, m2)`` where ``m1`` and
            ``m2`` are the species sets at both sides of a reference branch.

          - seedid: optional string; species subtrees whose newick does not
            contain it are recorded as collateral support.

        Returns a tuple ``(branch2supports, branch2coll_supports)``, each one
        mapping reference nodes to a list of 1 (supporting split) and 0
        (conflicting split) values. Two empty dicts are returned when more
        than 100000 species trees would be generated.
    """

    # Run Treeko to get all possible species tree combinations. We assume dups are already mapped
    ntrees, ndups, sp_trees = source_tree.get_speciation_trees(autodetect_duplications=DETECT_DUPLICATIONS, newick_only=True)
    if ntrees > 100000:
        return {}, {}

    branch2supports = defaultdict(list)
    branch2coll_supports = defaultdict(list)
    for nw in sp_trees:
        # Use all treeko trees or only those subtrees containing the seed?
        if seedid and seedid not in nw:
            container = branch2coll_supports
        else:
            container = branch2supports

        subtree = PhyloTree(nw, sp_naming_function=extract_species)
        subtreenode2content = subtree.get_cached_content(store_attr="species")
        all_sp_in_subtree = subtreenode2content[subtree]

        # Visit all internal nodes in the tree
        for n in subtree.traverse("preorder"):
            if n.is_leaf():
                continue
            # Only the first two children are inspected (binary split).
            c1 = subtreenode2content[n.children[0]]
            c2 = subtreenode2content[n.children[1]]
            # Invariant for every reference clade, so computed once per node.
            all_seen_sp = c1 | c2

            for refnode, m1, m2 in refclades:
                all_expected_sp = m1 | m2

                # We add one supporting point to every observed split that
                # coincides with a reference tree branch. This is, seqs in one
                # side and seqs on the other side of the observed split match
                # a ref_tree branch without having extra seqs in any of the
                # sides. However, we allow for split matches where some seqs
                # are lost in the observed split.
                false_missing = (all_expected_sp - all_seen_sp) & all_sp_in_subtree
                outside_species = all_seen_sp - all_expected_sp

                # Compare expected (m1,m2) splits with observed splits (c1,c2).
                a_straight = m1 & c1
                b_straight = m2 & c2
                a_cross = m1 & c2
                b_cross = m2 & c1

                straight_match = bool(a_straight and b_straight)
                cross_match = bool(a_cross and b_cross)
                if not (straight_match or cross_match):
                    # The observed split says nothing about this ref branch.
                    continue

                # A match only counts as support when it contains all the
                # observed species, species from both sides are not mixed and
                # missing species are real. Both orientations can never be
                # clean at the same time, so at most one support point is
                # produced (the former interactive "two matches" check was
                # unreachable).
                clean = not outside_species and not false_missing
                supported = clean and (
                    (straight_match and not a_cross and not b_cross) or
                    (cross_match and not a_straight and not b_straight))
                container[refnode].append(1 if supported else 0)

    return branch2supports, branch2coll_supports