示例#1
0
def main(argv):
    """Compare target trees against a reference tree and report distances.

    Parses ``argv`` with the parser from ``_build_parser``, loads the
    reference tree given with ``-r`` and writes one report row per target
    tree. Targets come either from the positional arguments or from
    ``--targets_file``; supplying both is rejected with exit status 1.

    Rows are written tab-delimited to the ``-o`` file when given, otherwise
    they are printed to stdout as a fixed-width table.

    :param argv: list of command-line arguments (without the program name).
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    print(__DESCRIPTION__)

    if args.targets_file and args.target_trees:
        sys.stderr.write('The use of targets_file and targets at the same time is not supported.\n')
        sys.exit(1)

    if args.targets_file:
        target_trees = tree_iterator(args.targets_file)
    else:
        target_trees = args.target_trees

    t = Tree(args.reftree)

    if args.ref_attr:
        for lf in t.iter_leaves():
            lf._origname = lf.name
            if args.ref_attr not in lf.features:
                # Show the offending leaf before getattr() below fails on it.
                print(lf)
            lf.name = getattr(lf, args.ref_attr)

    if args.outgroup:
        _root_on_outgroup(t, args.outgroup)

    OUT = open(args.output, "w") if args.output else None
    try:
        _write_report(args, t, target_trees, OUT)
    finally:
        # Close the report even when a target tree fails to load or compare.
        if OUT is not None:
            OUT.close()


def _build_parser():
    """Return the argparse parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees",
                        metavar='target_trees',
                        type=str,
                        nargs="*",
                        help='a list of target tree files')

    parser.add_argument(
        "--targets_file",
        dest="targets_file",
        type=str,
        help="""path to a file containing target trees, one per line""")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="""Path to the tab delimited report file""")

    parser.add_argument("-r",
                        dest="reftree",
                        type=str,
                        required=True,
                        help="""Reference tree""")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        nargs="+",
        help=
        """outgroup used to root reference and target trees before distance computation"""
    )

    parser.add_argument("--expand_polytomies",
                        dest="polytomies",
                        action="store_true",
                        help="""expand politomies if necessary""")

    parser.add_argument("--unrooted",
                        dest="unrooted",
                        action="store_true",
                        help="""compare trees as unrooted""")

    parser.add_argument(
        "--min_support",
        dest="min_support",
        type=float,
        default=0.0,
        help=
        ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"
         ))

    parser.add_argument(
        "--extract_species",
        dest="extract_species",
        action="store_true",
        help=
        """When used, reference tree is assumed to contain species names, while target trees as expected to be gene trees. Species name will be extracted from gene tree nodes and treeko will be used if duplication events are found."""
    )

    parser.add_argument("--spname_delimiter",
                        dest="spname_delimiter",
                        type=str,
                        default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument(
        "--spname_field",
        dest="spname_field",
        type=int,
        default=-1,
        help=
        ("position of the species code extracted from node names. -1 = last field"
         ))

    parser.add_argument("--collateral",
                        dest="collateral",
                        action='store_true',
                        help=(""))

    parser.add_argument("--ref_attr",
                        dest="ref_attr",
                        type=str,
                        help=("attribute in ref tree used as leaf name"))

    parser.add_argument("--target_attr",
                        dest="target_attr",
                        type=str,
                        help=("attribute in target tree used as leaf name"))
    return parser


def _root_on_outgroup(tree, outgroup):
    """Root ``tree`` on the node selected by the list of ``outgroup`` names.

    Several names select their common ancestor; a single name selects the
    first node returned by ``search_nodes`` for that name.
    """
    if len(outgroup) > 1:
        out = tree.get_common_ancestor(outgroup)
    else:
        out = tree.search_nodes(name=outgroup[0])[0]
    tree.set_outgroup(out)


def _write_report(args, t, target_trees, OUT):
    """Write the report header and one row per target tree.

    :param args: namespace returned by the parser of ``_build_parser``.
    :param t: reference tree, already renamed and rooted as requested.
    :param target_trees: iterable of targets; ``(seedid, tree)`` pairs when
        ``--targets_file`` is used, tree file names otherwise.
    :param OUT: open writable file for tab-delimited rows, or ``None`` to
        print a fixed-width table to stdout.
    """
    reftree_len = len(t)
    reftree_edges = (reftree_len * 2) - 2

    HEADER = ("target tree", 'dups', 'subtrees', 'used trees', 'treeko', "RF",
              "maxRF", 'normRF', "%reftree", "%genetree", "avgSize", "minSize",
              "common tips", "refSize", "targetSize")
    COL_WIDTHS = [20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
    if OUT is not None:
        OUT.write('# ' + ctime() + '\n')
        OUT.write('# ' + ' '.join(sys.argv) + '\n')
        OUT.write('#' + '\t'.join(HEADER) + '\n')
    else:
        print('# ' + ctime())
        print('# ' + ' '.join(sys.argv))
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    def species_of(name):
        # Species code = the configured field of the delimited node name.
        return name.split(args.spname_delimiter)[args.spname_field]

    for counter, tfile in enumerate(target_trees):
        if args.targets_file:
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:
            tt = PhyloTree(tfile, sp_naming_function=species_of)
        else:
            tt = Tree(tfile)

        if args.target_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.target_attr)

        if args.outgroup:
            _root_on_outgroup(tt, args.outgroup)

        if args.target_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' % counter

        # -1 marks a statistic that could not be computed for this target.
        min_size, avg_size = -1, -1
        total_rf, max_rf, norm_rf = -1, -1, -1
        treeko_d = -1
        ref_branches_in_target, target_branches_in_ref = -1, -1
        target_tree_len = -1
        used_subtrees = -1
        # Reset with the other sentinels so the row can always be written,
        # even when the speciation analysis below is not run for this target.
        common_names = -1

        if args.extract_species:
            ntrees, ndups, sp_trees = tt.get_speciation_trees(
                autodetect_duplications=True, newick_only=True)

            # Targets yielding 1000 or more speciation trees are not
            # analysed; their statistics keep the -1 sentinels.
            if ntrees < 1000:
                all_rf = []
                ref_found = []
                target_found = []
                tree_sizes = []
                all_max_rf = []
                common_names = 0

                for subtree_nw in sp_trees:
                    # With a seed id, only subtrees mentioning it are used
                    # unless --collateral was requested.
                    if seedid and not args.collateral and (seedid
                                                           not in subtree_nw):
                        continue
                    subtree = PhyloTree(subtree_nw,
                                        sp_naming_function=species_of)

                    # Only needed when robinson_foulds filters by support:
                    # copy each internal node's support from the matching
                    # common ancestor in the full target tree. This slows
                    # the analysis down.
                    if args.min_support:
                        subtree_content = subtree.get_cached_content(
                            store_attr='name')
                        for n in subtree.traverse():
                            if n.children:
                                n.support = tt.get_common_ancestor(
                                    subtree_content[n]).support

                    rf, maxr, common, p1, p2, d1, d2 = t.robinson_foulds(
                        subtree,
                        expand_polytomies=args.polytomies,
                        unrooted_trees=args.unrooted,
                        attr_t2='species',
                        min_support_t2=args.min_support)
                    if maxr > 0 and p1 and p2:
                        all_rf.append(rf)
                        tree_sizes.append(len(common))
                        all_max_rf.append(maxr)
                        common_names = max(common_names, len(common))

                        ref_found.append(float(len(p2 & p1)) / reftree_edges)
                        valid_target = p2 - d2
                        # Count target compatibility only when at least one
                        # valid target edge is not a leaf edge.
                        if any(
                                len(p[0]) > 1 and len(p[1]) > 1
                                for p in valid_target):
                            incompatible_target_branches = float(
                                len(valid_target - p1))
                            target_found.append(
                                1 - (incompatible_target_branches /
                                     len(valid_target)))

                if all_rf:
                    # Treeko speciation distance: normalised RF of every
                    # used subtree, weighted by its number of common tips.
                    alld = [
                        r / float(m) for r, m in zip(all_rf, all_max_rf)
                    ]
                    a = numpy.sum(
                        [d * size for d, size in zip(alld, tree_sizes)])
                    b = float(numpy.sum(tree_sizes))
                    treeko_d = a / b
                    total_rf = numpy.mean(all_rf)
                    norm_rf = numpy.mean(alld)
                    max_rf = numpy.max(all_max_rf)
                    ref_branches_in_target = numpy.mean(ref_found)
                    target_branches_in_ref = numpy.mean(
                        target_found) if target_found else -1
                    target_tree_len = numpy.mean(tree_sizes)
                    used_subtrees = len(all_rf)
        else:
            target_tree_len = len(tt)
            ndups, ntrees, used_subtrees = 0, 1, 1
            total_rf, max_rf, common, p1, p2, d1, d2 = tt.robinson_foulds(
                t,
                expand_polytomies=args.polytomies,
                unrooted_trees=args.unrooted)
            common_names = len(common)
            if max_rf:
                norm_rf = total_rf / float(max_rf)
            if p1 and p2:
                # Sizes of the partitions found in only one of the two trees.
                sizes = [len(p) for p in p2 ^ p1]
                if sizes:
                    avg_size = sum(sizes) / float(len(sizes))
                    min_size = min(sizes)
                else:
                    min_size, avg_size = 0, 0

                ref_branches_in_target = float(len(p2 & p1)) / reftree_edges
            else:
                ref_branches_in_target = 0.0
                target_branches_in_ref = 0.0

        if OUT is not None:
            OUT.write('\t'.join(
                map(str, (fname, ndups, ntrees, used_subtrees, treeko_d,
                          total_rf, max_rf, norm_rf, ref_branches_in_target,
                          target_branches_in_ref, avg_size, min_size,
                          common_names, reftree_len, target_tree_len))) + '\n')
        else:
            cells = (fname[-30:], ndups, ntrees, used_subtrees, treeko_d,
                     total_rf, max_rf, norm_rf,
                     '%0.4f' % ref_branches_in_target,
                     '%0.4f' % target_branches_in_ref, avg_size, min_size,
                     common_names, reftree_len, target_tree_len)
            print_table([[istr(cell) for cell in cells]],
                        fix_col_width=COL_WIDTHS,
                        wrap_style='cut')
示例#2
0
文件: ete_dist.py 项目: daisieh/ete
def main(argv):
    """Compare target trees against a reference tree and report distances.

    Parses ``argv`` with the parser from ``_build_parser``, loads the
    reference tree given with ``-r`` and writes one report row per target
    tree. Targets come either from the positional arguments or from
    ``--targets_file``; supplying both is rejected with exit status 1.

    Rows are written tab-delimited to the ``-o`` file when given, otherwise
    they are printed to stdout as a fixed-width table.

    :param argv: list of command-line arguments (without the program name).
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    print(__DESCRIPTION__)

    if args.targets_file and args.target_trees:
        sys.stderr.write('The use of targets_file and targets at the same time is not supported.\n')
        sys.exit(1)

    if args.targets_file:
        target_trees = tree_iterator(args.targets_file)
    else:
        target_trees = args.target_trees

    t = Tree(args.reftree)

    if args.ref_attr:
        for lf in t.iter_leaves():
            lf._origname = lf.name
            if args.ref_attr not in lf.features:
                # Show the offending leaf before getattr() below fails on it.
                print(lf)
            lf.name = getattr(lf, args.ref_attr)

    if args.outgroup:
        _root_on_outgroup(t, args.outgroup)

    OUT = open(args.output, "w") if args.output else None
    try:
        _write_report(args, t, target_trees, OUT)
    finally:
        # Close the report even when a target tree fails to load or compare.
        if OUT is not None:
            OUT.close()


def _build_parser():
    """Return the argparse parser describing the command-line interface."""
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees", metavar='target_trees', type=str,
                        nargs="*",
                        help='a list of target tree files')

    parser.add_argument("--targets_file", dest="targets_file",
                        type=str,
                        help="""path to a file containing target trees, one per line""")

    parser.add_argument("-o", dest="output",
                        type=str,
                        help="""Path to the tab delimited report file""")

    parser.add_argument("-r", dest="reftree",
                        type=str, required=True,
                        help="""Reference tree""")

    parser.add_argument("--outgroup", dest="outgroup",
                        nargs="+",
                        help="""outgroup used to root reference and target trees before distance computation""")

    parser.add_argument("--expand_polytomies", dest="polytomies",
                        action="store_true",
                        help="""expand politomies if necessary""")

    parser.add_argument("--unrooted", dest="unrooted",
                        action="store_true",
                        help="""compare trees as unrooted""")

    parser.add_argument("--min_support", dest="min_support",
                        type=float, default=0.0,
                        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    parser.add_argument("--extract_species", dest="extract_species",
                        action="store_true",
                        help="""When used, reference tree is assumed to contain species names, while target trees as expected to be gene trees. Species name will be extracted from gene tree nodes and treeko will be used if duplication events are found.""")

    parser.add_argument("--spname_delimiter", dest="spname_delimiter",
                        type=str, default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument("--spname_field", dest="spname_field",
                        type=int, default=-1,
                        help=("position of the species code extracted from node names. -1 = last field"))

    parser.add_argument("--collateral", dest="collateral",
                        action='store_true',
                        help=(""))

    parser.add_argument("--ref_attr", dest="ref_attr",
                        type=str,
                        help=("attribute in ref tree used as leaf name"))

    parser.add_argument("--target_attr", dest="target_attr",
                        type=str,
                        help=("attribute in target tree used as leaf name"))
    return parser


def _root_on_outgroup(tree, outgroup):
    """Root ``tree`` on the node selected by the list of ``outgroup`` names.

    Several names select their common ancestor; a single name selects the
    first node returned by ``search_nodes`` for that name.
    """
    if len(outgroup) > 1:
        out = tree.get_common_ancestor(outgroup)
    else:
        out = tree.search_nodes(name=outgroup[0])[0]
    tree.set_outgroup(out)


def _write_report(args, t, target_trees, OUT):
    """Write the report header and one row per target tree.

    :param args: namespace returned by the parser of ``_build_parser``.
    :param t: reference tree, already renamed and rooted as requested.
    :param target_trees: iterable of targets; ``(seedid, tree)`` pairs when
        ``--targets_file`` is used, tree file names otherwise.
    :param OUT: open writable file for tab-delimited rows, or ``None`` to
        print a fixed-width table to stdout.
    """
    reftree_len = len(t)
    reftree_edges = (reftree_len * 2) - 2

    HEADER = ("target tree", 'dups', 'subtrees', 'used trees', 'treeko', "RF", "maxRF", 'normRF', "%reftree", "%genetree", "avgSize", "minSize", "common tips", "refSize", "targetSize")
    COL_WIDTHS = [20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
    if OUT is not None:
        OUT.write('# ' + ctime() + '\n')
        OUT.write('# ' + ' '.join(sys.argv) + '\n')
        OUT.write('#' + '\t'.join(HEADER) + '\n')
    else:
        print('# ' + ctime())
        print('# ' + ' '.join(sys.argv))
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    def species_of(name):
        # Species code = the configured field of the delimited node name.
        return name.split(args.spname_delimiter)[args.spname_field]

    for counter, tfile in enumerate(target_trees):
        if args.targets_file:
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:
            tt = PhyloTree(tfile, sp_naming_function=species_of)
        else:
            tt = Tree(tfile)

        if args.target_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.target_attr)

        if args.outgroup:
            _root_on_outgroup(tt, args.outgroup)

        if args.target_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' % counter

        # -1 marks a statistic that could not be computed for this target.
        min_size, avg_size = -1, -1
        total_rf, max_rf, norm_rf = -1, -1, -1
        treeko_d = -1
        ref_branches_in_target, target_branches_in_ref = -1, -1
        target_tree_len = -1
        used_subtrees = -1
        # Reset with the other sentinels so the row can always be written,
        # even when the speciation analysis below is not run for this target.
        common_names = -1

        if args.extract_species:
            ntrees, ndups, sp_trees = tt.get_speciation_trees(autodetect_duplications=True, newick_only=True)

            # Targets yielding 1000 or more speciation trees are not
            # analysed; their statistics keep the -1 sentinels.
            if ntrees < 1000:
                all_rf = []
                ref_found = []
                target_found = []
                tree_sizes = []
                all_max_rf = []
                common_names = 0

                for subtree_nw in sp_trees:
                    # With a seed id, only subtrees mentioning it are used
                    # unless --collateral was requested.
                    if seedid and not args.collateral and (seedid not in subtree_nw):
                        continue
                    subtree = PhyloTree(subtree_nw, sp_naming_function=species_of)

                    # Only needed when robinson_foulds filters by support:
                    # copy each internal node's support from the matching
                    # common ancestor in the full target tree. This slows
                    # the analysis down.
                    if args.min_support:
                        subtree_content = subtree.get_cached_content(store_attr='name')
                        for n in subtree.traverse():
                            if n.children:
                                n.support = tt.get_common_ancestor(subtree_content[n]).support

                    rf, maxr, common, p1, p2, d1, d2 = t.robinson_foulds(subtree, expand_polytomies=args.polytomies, unrooted_trees=args.unrooted,
                                                                         attr_t2='species', min_support_t2=args.min_support)
                    if maxr > 0 and p1 and p2:
                        all_rf.append(rf)
                        tree_sizes.append(len(common))
                        all_max_rf.append(maxr)
                        common_names = max(common_names, len(common))

                        ref_found.append(float(len(p2 & p1)) / reftree_edges)
                        valid_target = p2 - d2
                        # Count target compatibility only when at least one
                        # valid target edge is not a leaf edge.
                        if any(len(p[0]) > 1 and len(p[1]) > 1 for p in valid_target):
                            incompatible_target_branches = float(len(valid_target - p1))
                            target_found.append(1 - (incompatible_target_branches / len(valid_target)))

                if all_rf:
                    # Treeko speciation distance: normalised RF of every
                    # used subtree, weighted by its number of common tips.
                    alld = [r / float(m) for r, m in zip(all_rf, all_max_rf)]
                    a = numpy.sum([d * size for d, size in zip(alld, tree_sizes)])
                    b = float(numpy.sum(tree_sizes))
                    treeko_d = a / b
                    total_rf = numpy.mean(all_rf)
                    norm_rf = numpy.mean(alld)
                    max_rf = numpy.max(all_max_rf)
                    ref_branches_in_target = numpy.mean(ref_found)
                    target_branches_in_ref = numpy.mean(target_found) if target_found else -1
                    target_tree_len = numpy.mean(tree_sizes)
                    used_subtrees = len(all_rf)
        else:
            target_tree_len = len(tt)
            ndups, ntrees, used_subtrees = 0, 1, 1
            total_rf, max_rf, common, p1, p2, d1, d2 = tt.robinson_foulds(t, expand_polytomies=args.polytomies, unrooted_trees=args.unrooted)
            common_names = len(common)
            if max_rf:
                norm_rf = total_rf / float(max_rf)
            if p1 and p2:
                # Sizes of the partitions found in only one of the two trees.
                sizes = [len(p) for p in p2 ^ p1]
                if sizes:
                    avg_size = sum(sizes) / float(len(sizes))
                    min_size = min(sizes)
                else:
                    min_size, avg_size = 0, 0

                ref_branches_in_target = float(len(p2 & p1)) / reftree_edges
            else:
                ref_branches_in_target = 0.0
                target_branches_in_ref = 0.0

        if OUT is not None:
            OUT.write('\t'.join(map(str, (fname, ndups, ntrees, used_subtrees, treeko_d, total_rf, max_rf, norm_rf, ref_branches_in_target, target_branches_in_ref,
                                          avg_size, min_size, common_names, reftree_len, target_tree_len))) + '\n')
        else:
            cells = (fname[-30:], ndups, ntrees, used_subtrees, treeko_d, total_rf, max_rf, norm_rf, '%0.4f' % ref_branches_in_target, '%0.4f' % target_branches_in_ref,
                     avg_size, min_size, common_names, reftree_len, target_tree_len)
            print_table([[istr(cell) for cell in cells]], fix_col_width=COL_WIDTHS, wrap_style='cut')
示例#3
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    parser.add_argument("--show",
                        dest="show_tree",
                        action="store_true",
                        help="""Display tree after the analysis.""")

    parser.add_argument("--render",
                        dest="render",
                        action="store_true",
                        help="""Render tree.""")

    parser.add_argument("--dump",
                        dest="dump",
                        action="store_true",
                        help="""Dump analysis""")

    parser.add_argument(
        "--explore",
        dest="explore",
        type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t",
                            "--tree",
                            dest="target_tree",
                            nargs="+",
                            type=str,
                            help="""Tree file in newick format""")

    input_args.add_argument("-tf",
                            dest="tree_list_file",
                            type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax",
                        dest="tax_info",
                        type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument(
        "--sp_delimiter",
        dest="sp_delimiter",
        type=str,
        help=
        "If taxid is part of the leaf name, delimiter used to split the string"
    )

    parser.add_argument(
        "--sp_field",
        dest="sp_field",
        type=int,
        default=0,
        help="field position for taxid after splitting leaf names")

    parser.add_argument("--ref",
                        dest="ref_tree",
                        type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only",
                        dest="rf_only",
                        action="store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        type=str,
        nargs="+",
        help="A list of node names defining the trees outgroup")

    parser.add_argument("--is_sptree",
                        dest="is_sptree",
                        action="store_true",
                        help="Assumes no duplication nodes in the tree")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")

    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")

    parser.add_argument("--dump_tax_info",
                        dest="dump_tax_info",
                        action="store_true",
                        help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >> sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        print >> sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >> sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Clade sizes", "RF (avg)", "RF (med)",
              "RF (std)", "RF (max)", "Shared tips")
    print >> OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >> sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees = t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(
                t, subtrees, show_tree=SHOW_TREE)

            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf

        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0  # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" % ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(
                        set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append(
                        (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes),
                                               numpy.median(broken_sizes),
                                               numpy.std(broken_sizes))
        iter_values = [
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std,
            rf_max, common_names
        ]
        print >> OUT, '|'.join(
            map(lambda x: str(x).strip().ljust(15), iter_values))
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems),
                                "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" % fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %
                  problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, fixed_string,
            problems_string
        ])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1

                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))

    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes",
              "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)

    if args.output:
        OUT.close()
示例#4
0
def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
    
    parser.add_argument("--show", dest="show_tree",
                        action="store_true", 
                        help="""Display tree after the analysis.""")
    
    parser.add_argument("--render", dest="render",
                        action="store_true", 
                        help="""Render tree.""")

    parser.add_argument("--dump", dest="dump",
                        action="store_true", 
                        help="""Dump analysis""")

    parser.add_argument("--explore", dest="explore",
                        type=str,
                        help="""Reads a previously analyzed tree and visualize it""")
    
    input_args = parser.add_mutually_exclusive_group()
    input_args.required=True
    input_args.add_argument("-t", "--tree", dest="target_tree",  nargs="+",
                        type=str,
                        help="""Tree file in newick format""")

    input_args.add_argument("-tf", dest="tree_list_file",
                        type=str, 
                        help="File with the list of tree files")
    
    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str,
                        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument("--sp_field", dest="sp_field", type=int, default=0,
                        help="field position for taxid after splitting leaf names")
    
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only", dest="rf_only",
                        action = "store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument("--outgroup", dest="outgroup",
                        type=str, nargs="+",
                        help="A list of node names defining the trees outgroup")
    
    parser.add_argument("--is_sptree", dest="is_sptree",
                        action = "store_true",
                        help="Assumes no duplication nodes in the tree")
    
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str,
                        help="")
    
    parser.add_argument("--tax2track", dest="tax2track", type=str,
                        help="")
    
    parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true",
                        help="")
    
    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None
    
    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >>sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout 
        ts.mode = "r"
        t.show(tree_style=ts)
        print >>sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()
    
    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >>sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    print >>OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >>sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

        
    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)
        
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
       
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
            
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

          
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE)
            
            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0
            
        ndups = 0
        nsubtrees = len(subtrees)
           
        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf
            
        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0 # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" %ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                       rf_std, rf_max, common_names] 
        print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) 
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems =  sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems), "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" %fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout 
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1
                    
                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))
            
    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER)
            
    if args.output:
        OUT.close()