Exemplo n.º 1
0
def dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch,
                 losses_per_dup_branch, refbranch_supports,
                 coll_dup_per_branch, coll_losses_per_branch,
                 coll_losses_per_dup_branch, coll_refbranch_supports):
    """Annotate the reference tree and write the analysis results to disk.

    All per-branch collections are forwarded unchanged to ``annotate_tree``.
    Output file names are derived from the module-level ``args.output``:

    * ``<output>.tree_analysis.png`` -- only when ``IMG_REPORT`` is truthy.
    * ``<output>.nwx`` -- the tree written with the summary features below.
    * ``<output>.log`` -- the command line (``sys.argv``) joined by spaces.

    Progress messages are written to ``sys.stderr``.
    """
    # Final annotation of the reference tree.
    annotate_tree(reftree, informed_branches, dup_per_branch,
                  losses_per_branch, losses_per_dup_branch, refbranch_supports,
                  coll_dup_per_branch, coll_losses_per_branch,
                  coll_losses_per_dup_branch, coll_refbranch_supports)

    # Optional image report of the annotated tree.
    if IMG_REPORT:
        sys.stderr.write("Generating tree analysis image\n")
        ts = TreeStyle()
        ts.layout_fn = info_layout
        reftree.render("%s.tree_analysis.png" % args.output, tree_style=ts)

    # Node features exported into the summary newick file.
    summary_features = [
        "ntrees", "nid", "ndups", "dup_rate", "losses", "losses_rate",
        "nduplosses", "duplosses_rate", "gt_support", "nsupport_trees",
        "coll_ndups", "coll_dup_rate", "coll_losses", "coll_losses_rate",
        "coll_nduplosses", "coll_duplosses_rate", "coll_gt_support",
        "coll_nsupport_trees",
    ]

    sys.stderr.write("Dumping annotated newick...\n")
    reftree.write(outfile="%s.nwx" % args.output, features=summary_features)

    # Use a context manager so the log file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open("%s.log" % args.output, "w") as log_file:
        log_file.write(' '.join(sys.argv))
Exemplo n.º 2
0
def dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
                 coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports):
    """Annotate ``reftree`` and dump the annotated tree plus a run log.

    The per-branch arguments are passed straight through to
    ``annotate_tree``. Files are named after the module-level
    ``args.output`` prefix:

    * ``<prefix>.tree_analysis.png`` is rendered only if ``IMG_REPORT`` is set.
    * ``<prefix>.nwx`` holds the tree with the summary features listed below.
    * ``<prefix>.log`` holds ``sys.argv`` joined by single spaces.

    Status messages go to ``sys.stderr``.
    """
    # Final annotation of the reference tree.
    annotate_tree(reftree, informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
                  coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports)

    output_prefix = args.output

    # Optional picture of the annotated tree.
    if IMG_REPORT:
        sys.stderr.write("Generating tree analysis image\n")
        ts = TreeStyle()
        ts.layout_fn = info_layout
        reftree.render("%s.tree_analysis.png" % output_prefix, tree_style=ts)

    # Features stored in the summary newick tree.
    summary_features = [
        "ntrees",
        "nid",
        "ndups",
        "dup_rate",
        "losses",
        "losses_rate",
        "nduplosses",
        "duplosses_rate",
        "gt_support",
        "nsupport_trees",
        "coll_ndups",
        "coll_dup_rate",
        "coll_losses",
        "coll_losses_rate",
        "coll_nduplosses",
        "coll_duplosses_rate",
        "coll_gt_support",
        "coll_nsupport_trees",
    ]

    sys.stderr.write("Dumping annotated newick...\n")
    reftree.write(outfile="%s.nwx" % output_prefix, features=summary_features)

    # The original left this handle open; close it explicitly so the command
    # line is guaranteed to reach the disk.
    with open("%s.log" % output_prefix, "w") as log_file:
        log_file.write(' '.join(sys.argv))
Exemplo n.º 3
0
def main(argv):
    """Load a newick tree, apply the requested edits and show or render it.

    Args:
        argv: list of command-line arguments, handed to
            ``parser.parse_args``.

    Raises:
        ValueError: if ``--sort-branches`` and ``--ladderize`` are combined.

    Side effects:
        * May mutate the module-level ``LEAF_ATTRIBUTES`` and
          ``INTERNAL_ATTRIBUTES`` mappings.
        * Rebinds the module-level ``SPCODE_REGEXP`` to the compiled
          ``--sp-discovery`` pattern.
        * Writes the files requested through ``--dump-subtrees``,
          ``--image`` and ``--newick``; opens an interactive viewer when
          ``--image`` is not given.
    """
    # The compiled pattern must be published at module level; a plain
    # assignment would only create an unused local.
    # NOTE(review): presumably read by user_species_naming_function -- confirm.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # argparse.add_argument quick reference:
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (zero or more), + (one or more) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")

    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')

    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="Process newick as raxml bootstrap values")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="(r)ectangular or (c)ircular visualization")

    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")

    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")

    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")

    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support", action="store_true",
        help="Shows branch bootstrap/support values")

    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="Show branch lengths.")

    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="Force branch length to have a minimum length in the image")

    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="Hide leaf names.")

    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="Show the name attribute of all internal nodes.")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="Sort branches according to node names.")

    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="Sort branches by partition size.")

    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="If the attribute rank is present in nodes ")

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="Multiple sequence alignment.")

    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.")

    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")

    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    # LOAD TREE

    if args.raxml:
        # Rewrite ":<length>[<support>]" as an NHX support annotation before
        # parsing. The file is read inside a context manager so the handle is
        # closed right away.
        with open(tfile) as tree_handle:
            raw_newick = tree_handle.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # EDIT TREE

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO

    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n" % (
                ndups, ntrees))
        # One newick string per line; the file is closed even if writing
        # a subtree fails.
        with open(args.subtrees_output_file, "w") as subtrees_out:
            for tree in treeiter:
                subtrees_out.write("%s\n" % tree.write())

    # VISUALIZATION

    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        # pop() tolerates the key being already absent (e.g. repeated calls).
        LEAF_ATTRIBUTES.pop("name", None)

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # A size of 0 means "not specified": hand None to the renderer instead.
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        # --resolution used to be parsed but silently ignored; forward it.
        t.render(args.image,
                 tree_style=ts,
                 w=args.width,
                 h=args.height,
                 units=args.size_units,
                 dpi=args.resolution)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        sys.stdout.write("Processed Newick dumped into %s\n" % args.newick)
Exemplo n.º 4
0
def main(argv):
    """Analyze target trees and report broken clades and RF distances.

    For every target tree (``-t`` and/or ``-tf``) the function optionally
    renames leaves, roots the tree, annotates it through
    ``annotate_tree_with_taxa``, runs ``analyze_subtrees`` (unless
    ``--rf-only``) and, when ``--ref`` is given, computes Robinson-Foulds
    figures against the reference tree. One '|'-separated row per tree is
    written to the ``-o`` file (or stdout), and a summary table is printed
    with ``print_table`` at the end.

    Args:
        argv: list of command-line arguments, handed to
            ``parser.parse_args``.

    Raises:
        ValueError: if a multi-name ``--outgroup`` is not monophyletic.
        SystemExit: after visualizing a tree in ``--explore`` mode.

    Side effects:
        Besides the report, may write ``ncbi_colors.pkl``, ``img.svg``,
        ``ncbi_analysis.pkl``, ``tax2track.pkl`` and ``tax2name.pkl`` in the
        current directory, depending on the options used.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # argparse.add_argument quick reference:
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (zero or more), + (one or more) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    parser.add_argument(
        "--show", dest="show_tree", action="store_true",
        help="Display tree after the analysis.")

    parser.add_argument(
        "--render", dest="render", action="store_true",
        help="Render tree.")

    parser.add_argument(
        "--dump", dest="dump", action="store_true",
        help="Dump analysis")

    parser.add_argument(
        "--explore", dest="explore", type=str,
        help="Reads a previously analyzed tree and visualize it")

    input_args = parser.add_mutually_exclusive_group(required=True)
    input_args.add_argument(
        "-t", "--tree", dest="target_tree", nargs="+", type=str,
        help="Tree file in newick format")

    input_args.add_argument(
        "-tf", dest="tree_list_file", type=str,
        help="File with the list of tree files")

    parser.add_argument(
        "--tax", dest="tax_info", type=str,
        help="If the taxid attribute is not set in the"
        " newick file for all leaf nodes, a tab file file"
        " with the translation of name and taxid can be"
        " provided with this option.")

    parser.add_argument(
        "--sp_delimiter", dest="sp_delimiter", type=str,
        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument(
        "--sp_field", dest="sp_field", type=int, default=0,
        help="field position for taxid after splitting leaf names")

    parser.add_argument(
        "--ref", dest="ref_tree", type=str,
        help="Uses ref tree to compute robinson foulds"
        " distances of the different subtrees")

    parser.add_argument(
        "--rf-only", dest="rf_only", action="store_true",
        help="Skip ncbi consensus analysis")

    parser.add_argument(
        "--outgroup", dest="outgroup", type=str, nargs="+",
        help="A list of node names defining the trees outgroup")

    parser.add_argument(
        "--is_sptree", dest="is_sptree", action="store_true",
        help="Assumes no duplication nodes in the tree")

    parser.add_argument(
        "-o", dest="output", type=str,
        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")

    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")

    parser.add_argument(
        "--dump_tax_info", dest="dump_tax_info", action="store_true",
        help="")

    args = parser.parse_args(argv)

    # Optional extractor turning a leaf name into its taxid field.
    if args.sp_delimiter:
        def GET_TAXID(leaf_name):
            return leaf_name.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    if args.explore:
        sys.stderr.write("Reading tree from file: %s\n" % args.explore)
        # SECURITY: unpickling can execute arbitrary code -- only open
        # pickle files produced by a trusted run of this tool.
        with open(args.explore) as pickled_tree:
            t = cPickle.load(pickled_tree)
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        sys.stderr.write("dumping color config\n")
        with open("ncbi_colors.pkl", "w") as colors_out:
            cPickle.dump(name2color, colors_out)
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    sys.stderr.write("Dumping results into %s\n" % OUT)

    target_trees = []
    if args.tree_list_file:
        # Skip blank lines so a trailing newline in the list file does not
        # turn into an empty tree path.
        with open(args.tree_list_file) as tree_list:
            target_trees = [line.strip() for line in tree_list
                            if line.strip()]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None

    # SECURITY: same caveat as above for the pickled taxonomy caches.
    if args.tax2name:
        with open(args.tax2name) as tax2name_in:
            tax2name = cPickle.load(tax2name_in)
    else:
        tax2name = {}

    if args.tax2track:
        with open(args.tax2track) as tax2track_in:
            tax2track = cPickle.load(tax2track_in)
    else:
        tax2track = {}
    sys.stdout.write("%s %s\n" % (len(tax2track), len(tax2name)))

    # One title per value of ``iter_values`` below. "Broken branches" was
    # missing, which shifted every later title one column to the left.
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes", "RF (avg)",
              "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    OUT.write('|'.join([h.ljust(15) for h in header]) + "\n")

    if args.ref_tree:
        sys.stderr.write("Reading ref tree from %s\n" % args.ref_tree)
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = bool(args.show_tree or args.render)

    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                # Any leaf under the ancestor that was not requested (or
                # vice versa) means the outgroup is not a clean clade.
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                with open("tax2track.pkl", "w") as tax2track_out:
                    cPickle.dump(tax2track, tax2track_out)
                with open("tax2name.pkl", "w") as tax2name_out:
                    cPickle.dump(tax2name, tax2name_out)
                sys.stdout.write("Tax info written into pickle files\n")
        else:
            # Without a translation file the leaf name itself is used as
            # both taxid and species code.
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        if not args.rf_only:
            if not args.is_sptree:
                subtrees = t.split_by_dups()
            else:
                subtrees = [t]

            (valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches,
             total_rf, broken_clades, broken_sizes) = analyze_subtrees(
                 t, subtrees, show_tree=SHOW_TREE)
        else:
            # The original unpacked six zeros into seven names (ValueError).
            # ``broken_clades`` must also support set difference and
            # ``broken_sizes`` must be a sequence for the code below.
            subtrees = []
            valid_subtrees = broken_subtrees = ncbi_mistakes = 0
            broken_branches = total_rf = 0
            broken_clades = set()
            broken_sizes = []

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf

        elif reft:
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            avg_rf = []
            rf_max = 0.0
            sum_size = 0.0
            sys.stdout.write("%s subtrees %s duplications\n" % (
                nsubtrees, ndups))

            for ii, subt in enumerate(subtrees):
                # In-place progress counter.
                sys.stdout.write("\r%d" % ii)
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    # Deliberate best effort: subtrees that cannot be
                    # compared with the reference are left out of the average.
                    pass
                else:
                    sptree_size = len(
                        set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    # Normalized RF weighted by the subtree size.
                    avg_rf.append(
                        (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
            rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes),
                                               numpy.median(broken_sizes),
                                               numpy.std(broken_sizes))
        iter_values = [
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std,
            rf_max, common_names
        ]
        OUT.write('|'.join(
            [str(value).strip().ljust(15) for value in iter_values]) + "\n")

        # Clades broken in the previous tree but not in this one, and the
        # other way round.
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems),
                                "red") if new_problems else ""
        if fixed:
            OUT.write("    Fixed clades: %s\n" % fixed_string)
        if new_problems:
            OUT.write("    New broken:   %s\n" % problems_string)
        prev_broken = broken_clades
        ENTRIES.append([
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, fixed_string,
            problems_string
        ])
        OUT.flush()

        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                sys.stdout.write("Showing tree...\n")
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            sys.stdout.write("dumping color config\n")
            with open("ncbi_colors.pkl", "w") as colors_out:
                cPickle.dump(name2color, colors_out)

        if args.dump:
            with open("ncbi_analysis.pkl", "w") as analysis_out:
                cPickle.dump(t, analysis_out)

    sys.stdout.write("\n\n")
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes",
              "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)

    if args.output:
        OUT.close()
Exemplo n.º 5
0
def main(argv):
    """Load a newick tree, apply the requested edits and show or render it.

    Args:
        argv: list of command-line arguments, handed to
            ``parser.parse_args``.

    Raises:
        ValueError: if ``--sort-branches`` and ``--ladderize`` are combined.

    Side effects:
        * May mutate the module-level ``LEAF_ATTRIBUTES`` and
          ``INTERNAL_ATTRIBUTES`` mappings.
        * Rebinds the module-level ``SPCODE_REGEXP`` to the compiled
          ``--sp-discovery`` pattern.
        * Writes the files requested through ``--dump-subtrees``,
          ``--image`` and ``--newick``; opens an interactive viewer when
          ``--image`` is not given.
    """
    # Publish the compiled pattern at module level; without this statement
    # the assignment below only creates an unused local.
    # NOTE(review): presumably read by user_species_naming_function -- confirm.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # argparse.add_argument quick reference:
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (zero or more), + (one or more) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    input_gr = parser.add_argument_group("TREE INPUT OPTIONS\n=================")

    input_gr.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                          help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml", dest="raxml",
                          action="store_true",
                          help="Process newick as raxml bootstrap values")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m", "--mode", dest="mode",
                        choices=["c", "r"], default="r",
                        help="(r)ectangular or (c)ircular visualization")

    img_gr.add_argument("-i", "--image", dest="image",
                        type=str,
                        help="Render tree image instead of showing it. A filename "
                        " should be provided. PDF, SVG and PNG file extensions are"
                        " supported (i.e. -i tree.svg)")

    img_gr.add_argument("--Iw", "--width", dest="width",
                        type=int, default=0,
                        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument("--Ih", "--height", dest="height",
                        type=int, default=0,
                        help="height of the rendered image in pixels (see --size-units).")

    img_gr.add_argument("--Ir", "--resolution", dest="resolution",
                        type=int, default=300,
                        help="Resolution if the tree image (DPI)")

    img_gr.add_argument("--Iu", "--size-units", dest="size_units",
                        choices=["px", "mm", "in"], default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument("-mbs", "--min-branch-separation", dest="branch_separation",
                        type=int, default=3,
                        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument("--ss", "--show-support", dest="show_support",
                        action="store_true",
                        help="Shows branch bootstrap/support values")

    img_gr.add_argument("--sbl", "--branch-length", dest="show_branch_length",
                        action="store_true",
                        help="Show branch lengths.")

    img_gr.add_argument("--ft", "--force-topology", dest="force_topology",
                        action="store_true",
                        help="Force branch length to have a minimum length in the image")

    img_gr.add_argument("--hln", "--hide-leaf-names", dest="hide_leaf_names",
                        action="store_true",
                        help="Hide leaf names.")

    img_gr.add_argument("--sin", "--show-internal-names", dest="show_internal_names",
                        action="store_true",
                        help="Show the name attribute of all internal nodes.")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument("-r", "--root", dest="root",
                         type=str, nargs="*",
                         help="Roots the tree to the node grouping the list"
                         " of node names provided (space separated). In example:"
                         "'--root human rat mouse'")

    edit_gr.add_argument("-s", "--sort-branches", dest="sort",
                         action="store_true",
                         help="Sort branches according to node names.")

    edit_gr.add_argument("-l", "--ladderize", dest="ladderize",
                         action="store_true",
                         help="Sort branches by partition size.")

    edit_gr.add_argument("--color_by_rank", dest="color_by_rank",
                         type=str, nargs="+",
                         help="If the attribute rank is present in nodes ")

    phylo_gr = parser.add_argument_group("PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument("--alg", dest="alg",
                          type=str,
                          help="Multiple sequence alignment.")

    phylo_gr.add_argument("--alg-format", dest="alg_format",
                          type=str, default="fasta",
                          help="fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.")

    phylo_gr.add_argument("--sp-discovery", dest="species_discovery_regexp",
                          type=str, default="^[^_]+_(.+)",
                          help="Perl regular expression to capture species"
                          " code from node names. By default, node names"
                          " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    phylo_gr.add_argument("--dump-subtrees", dest="subtrees_output_file",
                          type=str,
                          help="Returns a file containing all possible species subtrees"
                               " contained in a given gene tree ")

    phylo_gr.add_argument("--newick", dest="newick",
                          type=str,
                          help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError("--sort-branches and --ladderize options are mutually exclusive")

    # LOAD TREE

    if args.raxml:
        # Turn ":<length>[<support>]" into an NHX support annotation before
        # parsing; the context manager closes the input file right away.
        with open(tfile) as tree_handle:
            raw_newick = tree_handle.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]", raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # EDIT TREE

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO

    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write("Found %d duplication nodes. Dumping %d sutrees...\n" % (ndups, ntrees))
        # One newick string per line; the file is closed even on errors.
        with open(args.subtrees_output_file, "w") as subtrees_out:
            for tree in treeiter:
                subtrees_out.write("%s\n" % tree.write())

    # VISUALIZATION

    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        # pop() tolerates the key being already absent (e.g. repeated calls).
        LEAF_ATTRIBUTES.pop("name", None)

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # A size of 0 means "not specified": hand None to the renderer instead.
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        # --resolution used to be parsed but silently ignored; forward it.
        t.render(args.image, tree_style=ts, w=args.width, h=args.height,
                 units=args.size_units, dpi=args.resolution)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        sys.stdout.write("Processed Newick dumped into %s\n" % args.newick)
Exemplo n.º 6
0
def process_trees(iter_data, reftree, total_trees, thread_name=""):
    # cache some common data
    reftree_content = reftree.get_cached_content(store_attr="name")
    sorted_ref_branches = [(n, reftree_content[n])
                           for n in reftree.traverse("preorder")]
    refclades = [(n, reftree_content[n.children[0]],
                  reftree_content[n.children[1]])
                 for n in reftree.traverse("preorder") if not n.is_leaf()]

    informed_branches = defaultdict(int)  # How many trees were used to
    # inform about each refTree branch

    losses_per_branch = defaultdict(
        int)  # Number of losses in each refTree branch
    coll_losses_per_branch = defaultdict(int)

    losses_per_dup_branch = defaultdict(
        list)  # Number of losses for duplication
    # in each refTreeBranch
    coll_losses_per_dup_branch = defaultdict(list)

    dup_per_branch = defaultdict(list)  # dUplication events sorted by
    # refTree branch
    coll_dup_per_branch = defaultdict(list)

    refbranch_supports = defaultdict(list)  # gene tree support values for
    # each refTree branch
    coll_refbranch_supports = defaultdict(list)

    skipped_trees = 0

    time0 = time.time()
    tracked_times = []
    for tree_counter, (treeid, t, tree_content) in enumerate(iter_data):
        if DEBUG:
            print treeid, t
            ts = TreeStyle()
            ts.title.add_face(faces.TextFace("Seedid = %s" % treeid), 1)
            t.render("%s.png" % treeid, tree_style=ts)

        if tree_counter % 100 == 0:
            etime = time.time() - time0
            tracked_times.append(etime)
            total_etime = ((total_trees - tree_counter) /
                           100.0) * numpy.mean(tracked_times)
            percent = (tree_counter / float(total_trees)) * 100
            print >> sys.stderr, "\r%s% 10d (%0.1f%%) skipped trees:% 5d. Remaining time ~= %d min" % (
                thread_name, tree_counter, percent, skipped_trees,
                total_etime / 60.)
            time0 = time.time()
            sys.stderr.flush()
            gc.collect()

        if tree_counter and MONITOR_STEP and tree_counter % MONITOR_STEP == 0:
            annotate_tree(reftree, informed_branches, dup_per_branch,
                          losses_per_branch, losses_per_dup_branch,
                          refbranch_supports, coll_dup_per_branch,
                          coll_losses_per_branch, coll_losses_per_dup_branch,
                          coll_refbranch_supports)

            ts = TreeStyle()
            ts.layout_fn = info_layout
            reftree.render("temp_tree_analysis.png", tree_style=ts)

        # Compute support of this tree over the whole refTree
        seedid = None if USE_COLLATERAL else treeid
        seedsp = None if USE_COLLATERAL else extract_species(treeid)
        branch2supports, branch2coll_supports = get_supported_branches(
            t, tree_content, refclades=refclades, seedid=seedid)
        if branch2supports == {} and branch2coll_supports == {}:
            skipped_trees += 1

        # We combine the information of all treeko trees, by averaging the
        # number of subtrees that supported or not a given refTree branch.
        for refbranch, supports in branch2supports.iteritems():
            if IS_VALID_TREEID is None or IS_VALID_TREEID(
                    treeid, extract_species(reftree_content[refbranch])):
                refbranch_supports[refbranch.nid].append(numpy.mean(supports))
        for refbranch, coll_supports in branch2coll_supports.iteritems():
            if IS_VALID_TREEID is None or IS_VALID_TREEID(
                    treeid, extract_species(reftree_content[refbranch])):
                coll_refbranch_supports[refbranch.nid].append(
                    numpy.mean(coll_supports))

        all_observed_sp = extract_species([n.name for n in tree_content[t]])

        if REPORT_PER_TREE_SUPPORTS:
            if branch2supports:
                mean_seed_support = numpy.mean([
                    numpy.mean(branch2supports[_b]) for _b in branch2supports
                ])
            else:
                mean_seed_support = 0.0
            if branch2coll_supports:
                mean_coll_support = numpy.mean([
                    numpy.mean(branch2coll_supports[_b])
                    for _b in branch2coll_supports
                ])
            else:
                mean_coll_support = 0.0
            species_coverage = float(
                len(all_observed_sp)) / len(REFTREE_SPECIES)
            print >> REPORT_SUPPORT_FILE, '\t'.join(
                map(str, [
                    treeid, species_coverage, mean_seed_support,
                    mean_coll_support,
                    len(branch2supports),
                    len(branch2coll_supports)
                ]))

        # Here I keep a counter on how many trees were potentially able to
        # inform about specific reftree branches. For instance, if outgroup
        # species X does not appear in a genetree, I dont want to count this
        # tree as a source for duplication in the X branch.
        if len(all_observed_sp) == 1:
            max_ref_branch = reftree.search_nodes(
                name=list(all_observed_sp)[0])[0]
        else:
            max_ref_branch = reftree.get_common_ancestor(all_observed_sp)

        for refbranch in max_ref_branch.traverse():
            if IS_VALID_TREEID is None or IS_VALID_TREEID(
                    treeid, extract_species(reftree_content[refbranch])):
                informed_branches[refbranch.nid] += 1

        # Start analyzing internal nodes
        for node in t.traverse("preorder"):
            if node.is_leaf():
                continue

            if len(node.children) != 2:
                print node
                raise ValueError("Binary trees are required")

            # Extract the species set at both sides of the node
            ch_left = node.children[0]
            ch_right = node.children[1]
            seqs_left = set([n.name for n in tree_content[ch_left]])
            seqs_right = set([n.name for n in tree_content[ch_right]])
            species_left = extract_species(seqs_left)
            species_right = extract_species(seqs_right)

            # Decide whether this node is a duplication or not
            if DETECT_DUPLICATIONS:
                if SP_OVERLAP == 0:
                    isdup = True if species_left & species_right else False
                else:
                    #overlap = len(species_left & species_right) / float(max(len(species_left), len(species_right)))
                    overlap = len(species_left & species_right) / float(
                        len(species_left | species_right))

                    isdup = True if overlap >= SP_OVERLAP else False
                    if DEBUG and overlap:
                        print species_left, species_right
                        print len(species_left & species_right), float(
                            len(species_left | species_right))
                        print overlap, isdup

            else:
                isdup = True if n.evoltype == "D" else False

            # if this is a dup or the root of tree, map the to node to its
            # corresponding refTree branch and infer the expected list of
            # species
            if isdup or node is t:
                observed_sp = species_left | species_right
                if len(observed_sp) == 1:
                    ref_branch = reftree.search_nodes(
                        name=list(observed_sp)[0])[0]
                else:
                    ref_branch = reftree.get_common_ancestor(observed_sp)
                expected_sp = reftree_content[ref_branch]

            if isdup:
                if IS_VALID_TREEID is None or IS_VALID_TREEID(
                        treeid, extract_species(reftree_content[ref_branch])):
                    # updates duplications per branch in ref tree (dup rate analysis)
                    if USE_COLLATERAL or seedsp in observed_sp:
                        dup_per_branch[ref_branch.nid].append(
                            [seqs_left, seqs_right])
                        __seed = True
                    elif not USE_COLLATERAL:
                        coll_dup_per_branch[ref_branch.nid].append(
                            [seqs_left, seqs_right])
                        __seed = False

            # Count losses observed after a duplication or at the root of the tree.
            if isdup or node is t:
                # get a list of losses at both sides of the dupli
                if not isdup and node is t:
                    losses_left = get_lost_branches(observed_sp, expected_sp,
                                                    ref_branch,
                                                    sorted_ref_branches)
                    losses_right = []
                else:
                    losses_left = get_lost_branches(species_left, expected_sp,
                                                    ref_branch,
                                                    sorted_ref_branches)
                    losses_right = get_lost_branches(species_right,
                                                     expected_sp, ref_branch,
                                                     sorted_ref_branches)

                if IS_VALID_TREEID is not None:
                    losses_left = [
                        branch for branch in losses_left if IS_VALID_TREEID(
                            treeid, extract_species(reftree_content[branch]))
                    ]
                    losses_right = [
                        branch for branch in losses_right if IS_VALID_TREEID(
                            treeid, extract_species(reftree_content[branch]))
                    ]

                if USE_COLLATERAL:
                    losses = losses_left + losses_right
                    coll_losses = []
                else:
                    if treeid in seqs_left:
                        # if the seed species is not found at the other side of
                        # the dup, we can assume that its losses will never be
                        # counted, so we combine data from both sides.
                        if seedsp not in species_right:
                            losses = losses_left + losses_right
                        # otherwise, we wait for info for a different seed tree
                        else:
                            losses = losses_left
                        # No collateral information as data come from a duplication including the seed
                        coll_losses = []
                    elif treeid in seqs_right:
                        # if the seed species is not found at the other side of
                        # the dup, we can assume that its losses will never be
                        # counted, so we combine data from both sides.
                        if seedsp not in species_left:
                            losses = losses_left + losses_right
                        # otherwise, we wait for info for a different seed tree
                        else:
                            losses = losses_right
                        # No collateral information as data come from a duplication including the seed
                        coll_losses = []
                    else:
                        # If this is a collateral duplication, process losses as such
                        losses = []
                        coll_losses = losses_left + losses_right

                if len(reftree_content[ref_branch]
                       ) == 1 and losses + coll_losses:
                    raw_input("This should never happen")

                # update gene loss counters
                for lost_branch in losses:
                    losses_per_branch[lost_branch.nid] += 1
                    if isdup:  # if losses come from a dup event
                        losses_per_dup_branch[ref_branch.nid].append(
                            lost_branch)
                for lost_branch in coll_losses:
                    coll_losses_per_branch[lost_branch.nid] += 1
                    if isdup:  # if losses come from a dup event
                        coll_losses_per_dup_branch[ref_branch.nid].append(
                            lost_branch)

    return (informed_branches, dup_per_branch, losses_per_branch,
            losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
            coll_losses_per_branch, coll_losses_per_dup_branch,
            coll_refbranch_supports)
Exemplo n.º 7
0
def main(argv):
    global args

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-r",
                        dest="reftree",
                        type=str,
                        required=True,
                        help="""Reference tree""")

    parser.add_argument(
        "--source_trees",
        dest="source_trees",
        type=str,
        required=True,
        help=
        ("A list of *rooted* genetrees, one per line, in the format: TreeID/SeedID [TAB] newick "
         ))

    parser.add_argument("--plot_newick",
                        dest="plot_newick",
                        type=str,
                        help=(""))

    parser.add_argument("--spname_delimiter",
                        dest="spname_delimiter",
                        type=str,
                        default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument(
        "--spname_field",
        dest="spname_field",
        type=int,
        default=-1,
        help=
        ("position of the species code extracted from node names. -1 = last field"
         ))

    parser.add_argument(
        "--collateral",
        dest="use_collateral",
        action="store_true",
        help=("If enabled, collateral information will be used as"
              " equally qualified data. Otherwise, such data will"
              " be reported separatedly. Use this if your set of"
              " trees are not overlaping. "))

    parser.add_argument(
        "--skip_dup_detection",
        dest="skip_dup_detection",
        action="store_true",
        help=('If used, duplications will be expected to be annotated'
              ' in the source gene trees with the evoltype="D" tag.'
              ' Otherwise they will be inferred on the fly using'
              ' the species overlap algorithm.'))

    parser.add_argument(
        "--spoverlap",
        dest="species_overlap",
        type=float,
        default=0.0,
        help=("Species overlap cutoff. A number between 0 and 1 "
              "representing the percentage of species that should be "
              "shared between two sister partitions to be considered a"
              " duplication. 0 = any overlap represents a duplication. "))

    parser.add_argument(
        "--debug",
        dest="debug",
        action="store_true",
        help=
        ("generate an image of every input gene tree tree, so the result can be inspected"
         ))

    parser.add_argument(
        "--snapshot_step",
        dest="snapshot_step",
        type=int,
        default=1000,
        help=("How many trees should be processed between snapshots dumps?"))

    parser.add_argument(
        "--reftree_constraint",
        dest="reftree_constraint",
        type=str,
        help=("A python module from from which a function called "
              "*is_valid_treeid(treeid, refbranch)* should be importable. "
              "The function will be used to decide if the info of a given "
              "source tree is informative or not for each reftree branch. "))

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        required=True,
                        help=("output tag name (extensions will be added)"))

    parser.add_argument("--cpu",
                        dest="cpu",
                        type=int,
                        default=1,
                        help=("enable parallel computation"))

    parser.add_argument(
        "--img_report",
        dest="img_report",
        action="store_true",
        help=
        ("If true, it generates a summary image results with all the computed data"
         ))

    parser.add_argument(
        "--report_supports",
        dest="report_supports",
        action="store_true",
        help=
        ("If used, supported ref tree branches are individually reported for each gene tree "
         ))

    args = parser.parse_args(argv)
    if args.plot_newick:
        t = Tree(args.plot_newick)
        ts = TreeStyle()
        ts.layout_fn = info_layout
        t.render("tree_analysis.png", tree_style=ts)
        sys.exit(0)

    SPNAME_FIELD, SPNAME_DELIMITER = args.spname_field, args.spname_delimiter
    USE_COLLATERAL = args.use_collateral
    DETECT_DUPLICATIONS = True if not args.skip_dup_detection else False
    REPORT_PER_TREE_SUPPORTS = True if args.report_supports else False
    SP_OVERLAP = args.species_overlap
    DEBUG = args.debug
    IMG_REPORT = args.img_report
    reftree = PhyloTree(args.reftree, sp_naming_function=None)
    for nid, n in enumerate(reftree.traverse()):
        n.add_features(nid=nid)
    REFTREE_SPECIES = set(reftree.get_leaf_names())
    print __DESCRIPTION__

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE = open("%s.gentree_supports" % args.output, "w")
        print >> REPORT_SUPPORT_FILE, '#' + '\t'.join(
            map(str, [
                "treeId", "spCoverage", "mean_support", "mean_coll_support",
                "tested_branches", 'tested_coll_branches'
            ]))

    TOTAL_TREES = int(
        commands.getoutput("wc -l %s" % args.source_trees).split()[0]) + 1
    print >> sys.stderr, "Processing %d source trees" % TOTAL_TREES
    if args.reftree_constraint:
        import imp
        constraint = imp.load_source('constraint', args.reftree_constraint)
        IS_VALID_TREEID = constraint.is_valid_treeid
    else:
        IS_VALID_TREEID = None

    if args.cpu > 1:
        MONITOR_STEP = 0
        #return (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
        #       coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports)
        # The output of the process_trees function are 9 dictionaries in which keys are refbranches
        target_dicts = [{} for x in range(9)]

        def merge_dict_results(target, source):
            def merge_dict(target, source):
                for k, v in source.iteritems():
                    if k not in target:
                        target[k] = v
                    elif isinstance(v, list):
                        target[k].extend(v)
                    elif isinstance(v, set):
                        target[k].update(v)
                    elif isinstance(v, int):
                        target[k] += v
                    else:
                        raise ValueError("Impossible to merge str results")

            for index in xrange(len(target)):
                merge_dict(target[index], out[index])

        from multiprocessing import Process, Queue
        from Queue import Empty as QueueEmpty
        outputs_queue = Queue()
        if TOTAL_TREES > args.cpu:
            trees_per_cpu = TOTAL_TREES / args.cpu
            trees_per_cpu += 1 if TOTAL_TREES % args.cpu else 0
        else:
            trees_per_cpu = 1
            args.cpu = TOTAL_TREES

        all_workers = set()
        for cpu_num in xrange(args.cpu):
            sline = (cpu_num * trees_per_cpu)
            eline = (cpu_num * trees_per_cpu) + trees_per_cpu
            data_iter = tree_iterator(args.source_trees,
                                      restrict_species=REFTREE_SPECIES,
                                      start_line=sline,
                                      end_line=eline)
            print >> sys.stderr, "Launching worker %d from %d to %d" % (
                cpu_num, sline, eline)
            worker = Process(target=run_parallel,
                             args=(cpu_num, outputs_queue, process_trees,
                                   data_iter, reftree, trees_per_cpu))
            worker.name = "Worker_%d" % cpu_num
            all_workers.add(worker)
            worker.start()

        while all_workers:
            # clear done threads
            for w in list(all_workers):
                if not w.is_alive():
                    print >> sys.stderr, "%s thread is done!" % w.name
                    all_workers.discard(w)
            # get and merge results
            while 1:
                try:
                    out = outputs_queue.get(False)
                except QueueEmpty:
                    break
                else:
                    # This merge depends on process_trees return output!!!!!
                    merge_dict_results(target_dicts, out)
                    # Dump a snapshot
                    dump_results(reftree, *target_dicts)
                time.sleep(0.1)
            if all_workers:
                time.sleep(1)
        # collected data
        (informed_branches, dup_per_branch, losses_per_branch,
         losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
         coll_losses_per_branch, coll_losses_per_dup_branch,
         coll_refbranch_supports) = target_dicts
    else:
        MONITOR_STEP = args.snapshot_step
        data_iter = tree_iterator(args.source_trees,
                                  restrict_species=REFTREE_SPECIES)

        (informed_branches, dup_per_branch, losses_per_branch,
         losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
         coll_losses_per_branch, coll_losses_per_dup_branch,
         coll_refbranch_supports) = process_trees(data_iter, reftree,
                                                  TOTAL_TREES)

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE.close()

    dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch,
                 losses_per_dup_branch, refbranch_supports,
                 coll_dup_per_branch, coll_losses_per_branch,
                 coll_losses_per_dup_branch, coll_refbranch_supports)

    print >> sys.stderr, "Dumping full analysis..."
    # Full dump, including duplication details
    cPickle.dump(reftree, open("%s.pkl" % args.output, "w"))
Exemplo n.º 8
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")

    input_gr.add_argument(
        'tree',
        metavar='tree_file',
        type=str,
        nargs=1,
        help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml",
                          dest="raxml",
                          action="store_true",
                          help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m",
                        "--mode",
                        dest="mode",
                        choices=["c", "r"],
                        default="r",
                        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument(
        "-i",
        "--image",
        dest="image",
        type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")

    img_gr.add_argument("--text",
                        dest="text_mode",
                        action="store_true",
                        help="Shows the tree using ASCII characters")

    img_gr.add_argument(
        "--attr",
        "--show_attributes",
        dest="show_attributes",
        nargs="+",
        help="Display the value of the specified attributes, if available")

    img_gr.add_argument(
        "--Iw",
        "--width",
        dest="width",
        type=int,
        default=0,
        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument(
        "--Ih",
        "--height",
        dest="height",
        type=int,
        default=0,
        help="height of the rendered image in pixels (see --size-units).")

    img_gr.add_argument("--Ir",
                        "--resolution",
                        dest="resolution",
                        type=int,
                        default=300,
                        help="Resolution if the tree image (DPI)")

    img_gr.add_argument("--Iu",
                        "--size_units",
                        dest="size_units",
                        choices=["px", "mm", "in"],
                        default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument(
        "-mbs",
        "--min_branch_separation",
        dest="branch_separation",
        type=int,
        default=3,
        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument("--ss",
                        "--show_support",
                        dest="show_support",
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl",
                        "--show_branch_length",
                        dest="show_branch_length",
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument(
        "--ft",
        "--force_topology",
        dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln",
                        "--hide_leaf_names",
                        dest="hide_leaf_names",
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument(
        "--sin",
        "--show_internal_names",
        dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument(
        "-r",
        "--root",
        dest="root",
        type=str,
        nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    edit_gr.add_argument("-s",
                         "--sort_branches",
                         dest="sort",
                         action="store_true",
                         help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l",
                         "--ladderize",
                         dest="ladderize",
                         action="store_true",
                         help="""Sort branches by partition size.""")

    edit_gr.add_argument("--color_by_rank",
                         dest="color_by_rank",
                         type=str,
                         nargs="+",
                         help="""If the attribute rank is present in nodes """)

    edit_gr.add_argument(
        "--ncbi",
        dest="ncbi",
        action="store_true",
        help=""" Annotate tree using the NCBI taxonomy database""")

    edit_gr.add_argument(
        "--taxid_attr",
        dest="taxid_attr",
        type=str,
        default="name",
        help="node attribute encoding for valid taxid numbers.")

    edit_gr.add_argument(
        "--taxid_attr_regexp",
        dest="taxid_attr_regexp",
        type=str,
        help=
        "If taxid number is encoded as part of another text string, i.e. gene name, use this argument to define a Perl regular expression to extract taxid numbers."
    )

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument(
        "--alg",
        dest="alg",
        type=str,
        help="""Link tree to a multiple sequence alignment.""")

    phylo_gr.add_argument(
        "--alg_format",
        dest="alg_format",
        type=str,
        default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument(
        "--sp_discovery",
        dest="species_discovery_regexp",
        type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression used to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        nw = re.sub(":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    open(tfile).read())
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.ncbi:
        if args.taxid_attr_regexp:
            TAXIDMATCHER = re.compile(args.taxid_attr_regexp)

        for lf in t:
            if args.taxid_attr_regexp:
                lf.taxid = re.search(TAXIDMATCHER,
                                     getattr(lf, args.taxid_attr)).groups()[0]
            else:
                lf.taxid = getattr(lf, args.taxid_attr)
        t.annotate_ncbi_taxa(taxid_attr="taxid")

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # VISUALIZATION

    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        del LEAF_ATTRIBUTES["name"]

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    if args.text_mode:
        print t.get_ascii(show_internal=args.show_internal_names,
                          attributes=args.show_attributes)
    else:
        ts.layout_fn = master_layout
        if args.image:
            t.render(args.image,
                     tree_style=ts,
                     w=args.width,
                     h=args.height,
                     units=args.size_units)
        else:
            t.show(None, tree_style=ts)
Exemplo n.º 9
0
def main(argv):
    """Parse the viewer command line, load and edit a tree, then display it.

    The positional tree argument is loaded as a ``PhyloTree`` (with
    ``--raxml`` the file is read first and ``:length[support]`` annotations
    are rewritten into NHX ``support`` tags). The tree is then optionally
    NCBI-annotated, linked to an alignment, ladderized/sorted and rooted, and
    finally printed as ASCII, rendered to an image file, or shown with
    ``t.show()``.

    Args:
        argv: list of command-line tokens handed to ``parser.parse_args``.

    Raises:
        ValueError: if ``--sort_branches`` and ``--ladderize`` are combined,
            or if ``--taxid_attr_regexp`` captures nothing for some leaf.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_gr = parser.add_argument_group("TREE INPUT OPTIONS\n=================")

    input_gr.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                          help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml", dest="raxml",
                          action="store_true",
                          help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m", "--mode", dest="mode",
                        choices=["c", "r"], default="r",
                        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument("-i", "--image", dest="image",
                        type=str,
                        help="Render tree image instead of showing it. A filename "
                        " should be provided. PDF, SVG and PNG file extensions are"
                        " supported (i.e. -i tree.svg)"
                        )

    img_gr.add_argument("--text", dest="text_mode",
                        action="store_true",
                        help="Shows the tree using ASCII characters")

    img_gr.add_argument("--attr", "--show_attributes", dest="show_attributes",
                        nargs="+",
                        help="Display the value of the specified attributes, if available")

    # The help texts below used to point at "--size-units", which is not a
    # registered option; the real flag is "--size_units".
    img_gr.add_argument("--Iw", "--width", dest="width",
                        type=int, default=0,
                        help="width of the rendered image in pixels (see --size_units)."
                        )

    img_gr.add_argument("--Ih", "--height", dest="height",
                        type=int, default=0,
                        help="height of the rendered image in pixels (see --size_units)."
                        )

    img_gr.add_argument("--Ir", "--resolution", dest="resolution",
                        type=int, default=300,
                        help="Resolution if the tree image (DPI)"
                        )

    img_gr.add_argument("--Iu", "--size_units", dest="size_units",
                        choices=["px", "mm", "in"], default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). "
                        )

    img_gr.add_argument("-mbs", "--min_branch_separation", dest="branch_separation",
                        type=int, default=3,
                        help="Min number of pixels to separate branches vertically."
                        )

    img_gr.add_argument("--ss", "--show_support", dest="show_support",
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl", "--show_branch_length", dest="show_branch_length",
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument("--ft", "--force_topology", dest="force_topology",
                        action="store_true",
                        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln", "--hide_leaf_names", dest="hide_leaf_names",
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument("--sin", "--show_internal_names", dest="show_internal_names",
                        action="store_true",
                        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument("-r", "--root", dest="root",
                         type=str, nargs="*",
                         help="Roots the tree to the node grouping the list"
                         " of node names provided (space separated). In example:"
                         "'--root human rat mouse'")

    edit_gr.add_argument("-s", "--sort_branches", dest="sort",
                         action="store_true",
                         help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l", "--ladderize", dest="ladderize",
                         action="store_true",
                         help="""Sort branches by partition size.""")

    edit_gr.add_argument("--color_by_rank", dest="color_by_rank",
                         type=str, nargs="+",
                         help="""If the attribute rank is present in nodes """)

    edit_gr.add_argument("--ncbi", dest="ncbi",
                         action="store_true",
                         help=""" Annotate tree using the NCBI taxonomy database""")

    edit_gr.add_argument("--taxid_attr", dest="taxid_attr",
                         type=str, default="name",
                         help="node attribute encoding for valid taxid numbers.")

    edit_gr.add_argument("--taxid_attr_regexp", dest="taxid_attr_regexp",
                         type=str,
                         help="If taxid number is encoded as part of another text string, i.e. gene name, use this argument to define a Perl regular expression to extract taxid numbers.")

    phylo_gr = parser.add_argument_group("PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument("--alg", dest="alg",
                          type=str,
                          help="""Link tree to a multiple sequence alignment.""")

    phylo_gr.add_argument("--alg_format", dest="alg_format",
                          type=str, default="fasta",
                          help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument("--sp_discovery", dest="species_discovery_regexp",
                          type=str, default="^[^_]+_(.+)",
                          help="Perl regular expression used to capture species"
                          " code from node names. By default, node names"
                          " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError("--sort_branches and --ladderize options are mutually exclusive")

    # LOADING
    if args.raxml:
        # Read through a context manager so the handle is always released.
        with open(tfile) as raxml_file:
            raw_newick = raxml_file.read()
        # ":0.123[95]" -> ":0.123[&&NHX:support=95]"
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]", raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.ncbi:
        taxid_matcher = None
        if args.taxid_attr_regexp:
            taxid_matcher = re.compile(args.taxid_attr_regexp)

        for lf in t:
            raw_value = getattr(lf, args.taxid_attr)
            if taxid_matcher is None:
                lf.taxid = raw_value
                continue
            match = taxid_matcher.search(raw_value)
            if match is None or not match.groups():
                # Fail with the offending value instead of an opaque
                # AttributeError/IndexError on the match object.
                raise ValueError("--taxid_attr_regexp %r did not capture a taxid from %r"
                                 % (args.taxid_attr_regexp, raw_value))
            lf.taxid = match.groups()[0]
        t.annotate_ncbi_taxa(taxid_attr="taxid")

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        # NOTE(review): this binds a function-local name only. If
        # user_species_naming_function reads a module-level SPCODE_REGEXP,
        # a ``global SPCODE_REGEXP`` declaration is needed here -- confirm
        # against the rest of the module.
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # EDITION
    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    # Guarded delete: the key may already be absent (e.g. main() run twice).
    if args.hide_leaf_names and "name" in LEAF_ATTRIBUTES:
        del LEAF_ATTRIBUTES["name"]

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: 0 (the CLI default) means "not set"
    img_width = args.width or None
    img_height = args.height or None

    if args.text_mode:
        print(t.get_ascii(show_internal=args.show_internal_names,
                          attributes=args.show_attributes))
    else:
        ts.layout_fn = master_layout
        if args.image:
            # --resolution used to be parsed and then silently dropped.
            t.render(args.image,
                     tree_style=ts,
                     w=img_width,
                     h=img_height,
                     units=args.size_units,
                     dpi=args.resolution)
        else:
            t.show(None, tree_style=ts)
Exemplo n.º 10
0
def process_trees(iter_data, reftree, total_trees, thread_name=""):
    """Scan gene trees and accumulate dup/loss/support data per reftree branch.

    Every internal node of every gene tree is classified as duplication or
    not; duplications (and the tree root) are mapped onto the reference tree
    branch grouping their observed species, and gene losses are inferred
    from the species expected under that branch. All counters are keyed by
    the ``nid`` attribute of reference tree nodes.

    Reads the module-level settings DEBUG, MONITOR_STEP, USE_COLLATERAL,
    DETECT_DUPLICATIONS, SP_OVERLAP, IS_VALID_TREEID,
    REPORT_PER_TREE_SUPPORTS, REPORT_SUPPORT_FILE and REFTREE_SPECIES.

    Args:
        iter_data: iterable yielding ``(treeid, tree, tree_content)`` tuples,
            where ``tree_content[node]`` is the collection of leaf nodes
            under ``node``.
        reftree: reference tree whose nodes carry a ``nid`` attribute.
        total_trees: expected number of trees; only used for the progress
            and remaining-time report.
        thread_name: prefix written in front of each progress line.

    Returns:
        A 9-tuple of dictionaries: ``(informed_branches, dup_per_branch,
        losses_per_branch, losses_per_dup_branch, refbranch_supports,
        coll_dup_per_branch, coll_losses_per_branch,
        coll_losses_per_dup_branch, coll_refbranch_supports)``.

    Raises:
        ValueError: if a gene tree contains a non-binary internal node.
    """
    def emit(*values):
        # Same output as the Python 2 statement ``print a, b``.
        sys.stdout.write(" ".join(map(str, values)) + "\n")

    def is_informative(tree_id, ref_node):
        # True when no constraint is loaded or the constraint accepts this
        # tree for the species found under the given reftree branch.
        return (IS_VALID_TREEID is None or
                IS_VALID_TREEID(tree_id, extract_species(reftree_content[ref_node])))

    def to_ref_branch(species):
        # Reference tree node grouping all the given species.
        if len(species) == 1:
            return reftree.search_nodes(name=list(species)[0])[0]
        return reftree.get_common_ancestor(species)

    # cache some common data
    reftree_content = reftree.get_cached_content(store_attr="name")
    sorted_ref_branches = [(ref_node, reftree_content[ref_node])
                           for ref_node in reftree.traverse("preorder")]
    refclades = [(ref_node,
                  reftree_content[ref_node.children[0]],
                  reftree_content[ref_node.children[1]])
                 for ref_node in reftree.traverse("preorder")
                 if not ref_node.is_leaf()]

    # How many trees were used to inform about each refTree branch
    informed_branches = defaultdict(int)

    # Number of losses in each refTree branch
    losses_per_branch = defaultdict(int)
    coll_losses_per_branch = defaultdict(int)

    # Lost branches observed after duplications in each refTree branch
    losses_per_dup_branch = defaultdict(list)
    coll_losses_per_dup_branch = defaultdict(list)

    # Duplication events sorted by refTree branch
    dup_per_branch = defaultdict(list)
    coll_dup_per_branch = defaultdict(list)

    # Gene tree support values for each refTree branch
    refbranch_supports = defaultdict(list)
    coll_refbranch_supports = defaultdict(list)

    skipped_trees = 0

    time0 = time.time()
    tracked_times = []
    for tree_counter, (treeid, t, tree_content) in enumerate(iter_data):
        if DEBUG:
            emit(treeid, t)
            ts = TreeStyle()
            ts.title.add_face(faces.TextFace("Seedid = %s" % treeid), 1)
            t.render("%s.png" % treeid, tree_style=ts)

        # Progress report every 100 trees; the remaining time is estimated
        # from the mean duration of the 100-tree batches seen so far.
        if tree_counter % 100 == 0:
            etime = time.time() - time0
            tracked_times.append(etime)
            total_etime = ((total_trees - tree_counter) / 100.0) * numpy.mean(tracked_times)
            percent = (tree_counter / float(total_trees)) * 100
            sys.stderr.write(
                "\r%s% 10d (%0.1f%%) skipped trees:% 5d. Remaining time ~= %d min\n"
                % (thread_name, tree_counter, percent, skipped_trees, total_etime / 60.))
            time0 = time.time()
            sys.stderr.flush()
            gc.collect()

        # Periodic snapshot of the partial results
        if tree_counter and MONITOR_STEP and tree_counter % MONITOR_STEP == 0:
            annotate_tree(reftree, informed_branches, dup_per_branch, losses_per_branch,
                          losses_per_dup_branch, refbranch_supports,
                          coll_dup_per_branch, coll_losses_per_branch,
                          coll_losses_per_dup_branch, coll_refbranch_supports)

            ts = TreeStyle()
            ts.layout_fn = info_layout
            reftree.render("temp_tree_analysis.png", tree_style=ts)

        # Compute support of this tree over the whole refTree
        seedid = None if USE_COLLATERAL else treeid
        seedsp = None if USE_COLLATERAL else extract_species(treeid)
        branch2supports, branch2coll_supports = get_supported_branches(
            t, tree_content, refclades=refclades, seedid=seedid)
        if not branch2supports and not branch2coll_supports:
            # Only counted for the progress report; the tree is still
            # analysed below.
            skipped_trees += 1

        # We combine the information of all treeko trees, by averaging the
        # number of subtrees that supported or not a given refTree branch.
        for refbranch, supports in branch2supports.iteritems():
            if is_informative(treeid, refbranch):
                refbranch_supports[refbranch.nid].append(numpy.mean(supports))
        for refbranch, coll_supports in branch2coll_supports.iteritems():
            if is_informative(treeid, refbranch):
                coll_refbranch_supports[refbranch.nid].append(numpy.mean(coll_supports))

        all_observed_sp = extract_species([leaf.name for leaf in tree_content[t]])

        if REPORT_PER_TREE_SUPPORTS:
            if branch2supports:
                mean_seed_support = numpy.mean(
                    [numpy.mean(branch2supports[_b]) for _b in branch2supports])
            else:
                mean_seed_support = 0.0
            if branch2coll_supports:
                mean_coll_support = numpy.mean(
                    [numpy.mean(branch2coll_supports[_b]) for _b in branch2coll_supports])
            else:
                mean_coll_support = 0.0
            species_coverage = float(len(all_observed_sp)) / len(REFTREE_SPECIES)
            report_fields = [treeid, species_coverage, mean_seed_support,
                             mean_coll_support, len(branch2supports),
                             len(branch2coll_supports)]
            REPORT_SUPPORT_FILE.write('\t'.join(map(str, report_fields)) + "\n")

        # Here I keep a counter on how many trees were potentially able to
        # inform about specific reftree branches. For instance, if outgroup
        # species X does not appear in a genetree, I dont want to count this
        # tree as a source for duplication in the X branch.
        max_ref_branch = to_ref_branch(all_observed_sp)
        for refbranch in max_ref_branch.traverse():
            if is_informative(treeid, refbranch):
                informed_branches[refbranch.nid] += 1

        # Start analyzing internal nodes
        for node in t.traverse("preorder"):
            if node.is_leaf():
                continue

            if len(node.children) != 2:
                emit(node)
                raise ValueError("Binary trees are required")

            # Extract the species set at both sides of the node
            ch_left = node.children[0]
            ch_right = node.children[1]
            seqs_left = set([leaf.name for leaf in tree_content[ch_left]])
            seqs_right = set([leaf.name for leaf in tree_content[ch_right]])
            species_left = extract_species(seqs_left)
            species_right = extract_species(seqs_right)

            # Decide whether this node is a duplication or not
            if DETECT_DUPLICATIONS:
                if SP_OVERLAP == 0:
                    isdup = bool(species_left & species_right)
                else:
                    overlap = (len(species_left & species_right) /
                               float(len(species_left | species_right)))
                    isdup = overlap >= SP_OVERLAP
                    if DEBUG and overlap:
                        emit(species_left, species_right)
                        emit(len(species_left & species_right),
                             float(len(species_left | species_right)))
                        emit(overlap, isdup)
            else:
                # Pre-annotated trees: the tag must be read from the node
                # being analysed. The previous code read it from ``n``, a
                # leaked list-comprehension variable pointing at a leaf.
                isdup = node.evoltype == "D"

            # Only duplications and the root of the tree carry information
            if not (isdup or node is t):
                continue

            # Map the node to its corresponding refTree branch and infer
            # the expected list of species
            observed_sp = species_left | species_right
            ref_branch = to_ref_branch(observed_sp)
            expected_sp = reftree_content[ref_branch]

            # updates duplications per branch in ref tree (dup rate analysis)
            if isdup and is_informative(treeid, ref_branch):
                if USE_COLLATERAL or seedsp in observed_sp:
                    dup_per_branch[ref_branch.nid].append([seqs_left, seqs_right])
                else:
                    coll_dup_per_branch[ref_branch.nid].append([seqs_left, seqs_right])

            # Count losses observed after a duplication or at the root of
            # the tree: get a list of losses at both sides of the node.
            if not isdup:
                # Non-duplication root: both sides are evaluated together
                losses_left = get_lost_branches(observed_sp, expected_sp,
                                                ref_branch, sorted_ref_branches)
                losses_right = []
            else:
                losses_left = get_lost_branches(species_left, expected_sp,
                                                ref_branch, sorted_ref_branches)
                losses_right = get_lost_branches(species_right, expected_sp,
                                                 ref_branch, sorted_ref_branches)

            if IS_VALID_TREEID is not None:
                losses_left = [branch for branch in losses_left
                               if is_informative(treeid, branch)]
                losses_right = [branch for branch in losses_right
                                if is_informative(treeid, branch)]

            if USE_COLLATERAL:
                losses = losses_left + losses_right
                coll_losses = []
            elif treeid in seqs_left:
                # if the seed species is not found at the other side of
                # the dup, we can assume that its losses will never be
                # counted, so we combine data from both sides.
                if seedsp not in species_right:
                    losses = losses_left + losses_right
                # otherwise, we wait for info for a different seed tree
                else:
                    losses = losses_left
                # No collateral information as data come from a
                # duplication including the seed
                coll_losses = []
            elif treeid in seqs_right:
                # Mirror case: the seed sits at the right side of the node
                if seedsp not in species_left:
                    losses = losses_left + losses_right
                else:
                    losses = losses_right
                coll_losses = []
            else:
                # If this is a collateral duplication, process losses as such
                losses = []
                coll_losses = losses_left + losses_right

            if len(reftree_content[ref_branch]) == 1 and losses + coll_losses:
                # Invariant violation. Reported instead of pausing on
                # raw_input(), which blocks (or dies with EOFError) when
                # the analysis runs unattended or in a worker process.
                sys.stderr.write(
                    "WARNING: losses inferred under terminal reftree branch %s "
                    "while processing tree %s. This should never happen\n"
                    % (ref_branch.nid, treeid))

            # update gene loss counters
            for lost_branch in losses:
                losses_per_branch[lost_branch.nid] += 1
                if isdup:  # if losses come from a dup event
                    losses_per_dup_branch[ref_branch.nid].append(lost_branch)
            for lost_branch in coll_losses:
                coll_losses_per_branch[lost_branch.nid] += 1
                if isdup:  # if losses come from a dup event
                    coll_losses_per_dup_branch[ref_branch.nid].append(lost_branch)

    return (informed_branches, dup_per_branch, losses_per_branch,
            losses_per_dup_branch, refbranch_supports,
            coll_dup_per_branch, coll_losses_per_branch,
            coll_losses_per_dup_branch, coll_refbranch_supports)
Exemplo n.º 11
0
def main(argv):
    """Run the duplication/loss analysis of gene trees over a reference tree.

    Parses the command line, publishes the analysis settings as module-level
    names, processes the source trees (serially, or split across ``--cpu``
    worker processes whose partial results are merged), and writes the
    annotated reference tree through ``dump_results`` plus a pickle of the
    tree to ``<output>.pkl``.

    With ``--plot_newick`` the given newick is rendered to
    ``tree_analysis.png`` and the program exits immediately.

    Args:
        argv: list of command-line tokens handed to ``parser.parse_args``.
    """
    # process_trees() and dump_results() read these settings as module
    # globals. Without this declaration the assignments below only created
    # locals of main(), so the command-line options never reached them.
    # NOTE(review): SPNAME_FIELD/SPNAME_DELIMITER are assumed to be read by
    # extract_species(), which is not visible here -- confirm.
    global args
    global SPNAME_FIELD, SPNAME_DELIMITER, USE_COLLATERAL, DETECT_DUPLICATIONS
    global REPORT_PER_TREE_SUPPORTS, SP_OVERLAP, DEBUG, IMG_REPORT
    global REFTREE_SPECIES, REPORT_SUPPORT_FILE, IS_VALID_TREEID, MONITOR_STEP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-r", dest="reftree",
                        type=str, required=True,
                        help="""Reference tree""")

    parser.add_argument("--source_trees", dest="source_trees",
                        type=str, required=True,
                        help=("A list of *rooted* genetrees, one per line, in the format: TreeID/SeedID [TAB] newick "))

    parser.add_argument("--plot_newick", dest="plot_newick",
                        type=str,
                        help=(""))

    parser.add_argument("--spname_delimiter", dest="spname_delimiter",
                        type=str, default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument("--spname_field", dest="spname_field",
                        type=int, default=-1,
                        help=("position of the species code extracted from node names. -1 = last field"))

    parser.add_argument("--collateral", dest="use_collateral",
                        action="store_true",
                        help=("If enabled, collateral information will be used as"
                              " equally qualified data. Otherwise, such data will"
                              " be reported separatedly. Use this if your set of"
                              " trees are not overlaping. "))

    parser.add_argument("--skip_dup_detection", dest="skip_dup_detection",
                        action="store_true",
                        help=('If used, duplications will be expected to be annotated'
                              ' in the source gene trees with the evoltype="D" tag.'
                              ' Otherwise they will be inferred on the fly using'
                              ' the species overlap algorithm.'))

    parser.add_argument("--spoverlap", dest="species_overlap",
                        type=float, default=0.0,
                        help=("Species overlap cutoff. A number between 0 and 1 "
                        "representing the percentage of species that should be "
                        "shared between two sister partitions to be considered a"
                        " duplication. 0 = any overlap represents a duplication. "))

    parser.add_argument("--debug", dest="debug",
                        action="store_true",
                        help=("generate an image of every input gene tree tree, so the result can be inspected"))

    parser.add_argument("--snapshot_step", dest="snapshot_step",
                        type=int, default=1000,
                        help=("How many trees should be processed between snapshots dumps?"))

    parser.add_argument("--reftree_constraint", dest="reftree_constraint",
                        type=str,
                        help=("A python module from from which a function called "
                              "*is_valid_treeid(treeid, refbranch)* should be importable. "
                              "The function will be used to decide if the info of a given "
                              "source tree is informative or not for each reftree branch. "))

    parser.add_argument("-o", dest="output",
                        type=str, required=True,
                        help=("output tag name (extensions will be added)"))

    parser.add_argument("--cpu", dest="cpu",
                        type=int, default=1,
                        help=("enable parallel computation"))

    parser.add_argument("--img_report", dest="img_report",
                        action="store_true",
                        help=("If true, it generates a summary image results with all the computed data"))

    parser.add_argument("--report_supports", dest="report_supports",
                        action="store_true",
                        help=("If used, supported ref tree branches are individually reported for each gene tree "))

    args = parser.parse_args(argv)
    if args.plot_newick:
        t = Tree(args.plot_newick)
        ts = TreeStyle()
        ts.layout_fn = info_layout
        t.render("tree_analysis.png", tree_style=ts)
        sys.exit(0)

    SPNAME_FIELD, SPNAME_DELIMITER = args.spname_field, args.spname_delimiter
    USE_COLLATERAL = args.use_collateral
    DETECT_DUPLICATIONS = not args.skip_dup_detection
    REPORT_PER_TREE_SUPPORTS = bool(args.report_supports)
    SP_OVERLAP = args.species_overlap
    DEBUG = args.debug
    IMG_REPORT = args.img_report
    reftree = PhyloTree(args.reftree, sp_naming_function=None)
    # Give every reference tree node a stable numeric id; all result
    # dictionaries are keyed by it.
    for nid, ref_node in enumerate(reftree.traverse()):
        ref_node.add_features(nid=nid)
    REFTREE_SPECIES = set(reftree.get_leaf_names())
    print(__DESCRIPTION__)

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE = open("%s.gentree_supports" % args.output, "w")
        header_fields = ["treeId", "spCoverage", "mean_support", "mean_coll_support",
                         "tested_branches", 'tested_coll_branches']
        REPORT_SUPPORT_FILE.write('#' + '\t'.join(map(str, header_fields)) + "\n")

    # Same figure "wc -l" reports (number of newline characters) plus one,
    # computed in-process so that file names with spaces or shell
    # metacharacters are handled safely.
    with open(args.source_trees, "rb") as trees_file:
        newline_count = sum(chunk.count(b"\n")
                            for chunk in iter(lambda: trees_file.read(1 << 20), b""))
    TOTAL_TREES = newline_count + 1
    sys.stderr.write("Processing %d source trees\n" % TOTAL_TREES)

    if args.reftree_constraint:
        import imp
        constraint = imp.load_source('constraint', args.reftree_constraint)
        IS_VALID_TREEID = constraint.is_valid_treeid
    else:
        IS_VALID_TREEID = None

    if args.cpu > 1:
        MONITOR_STEP = 0
        # The output of the process_trees function are 9 dictionaries in
        # which keys are refbranches
        target_dicts = [{} for x in range(9)]

        def merge_dict_results(target, source):
            """Merge the 9 dictionaries of ``source`` into those of ``target``."""
            def merge_dict(target, source):
                for k, v in source.iteritems():
                    if k not in target:
                        target[k] = v
                    elif isinstance(v, list):
                        target[k].extend(v)
                    elif isinstance(v, set):
                        target[k].update(v)
                    elif isinstance(v, int):
                        target[k] += v
                    else:
                        raise ValueError("Impossible to merge str results")
            for index in xrange(len(target)):
                # Use the ``source`` argument; the previous code ignored it
                # and read the enclosing ``out`` variable instead.
                merge_dict(target[index], source[index])

        from multiprocessing import Process, Queue
        from Queue import Empty as QueueEmpty
        outputs_queue = Queue()
        if TOTAL_TREES > args.cpu:
            # Ceiling division, so every tree falls in some worker's range
            trees_per_cpu = TOTAL_TREES // args.cpu
            if TOTAL_TREES % args.cpu:
                trees_per_cpu += 1
        else:
            trees_per_cpu = 1
            args.cpu = TOTAL_TREES

        all_workers = set()
        for cpu_num in xrange(args.cpu):
            sline = cpu_num * trees_per_cpu
            eline = sline + trees_per_cpu
            data_iter = tree_iterator(args.source_trees,
                                      restrict_species=REFTREE_SPECIES,
                                      start_line=sline,
                                      end_line=eline)
            sys.stderr.write("Launching worker %d from %d to %d\n" % (cpu_num, sline, eline))
            worker = Process(target=run_parallel,
                             args=(cpu_num, outputs_queue, process_trees,
                                   data_iter, reftree, trees_per_cpu))
            worker.name = "Worker_%d" % cpu_num
            all_workers.add(worker)
            worker.start()

        while all_workers:
            # clear done threads
            for w in list(all_workers):
                if not w.is_alive():
                    sys.stderr.write("%s thread is done!\n" % w.name)
                    all_workers.discard(w)
            # get and merge results; draining after the liveness check
            # guarantees results of just-finished workers are collected
            while True:
                try:
                    out = outputs_queue.get(False)
                except QueueEmpty:
                    break
                # This merge depends on process_trees return output!!!!!
                merge_dict_results(target_dicts, out)
                # Dump a snapshot
                dump_results(reftree, *target_dicts)
                time.sleep(0.1)
            if all_workers:
                time.sleep(1)
        # collected data
        (informed_branches, dup_per_branch, losses_per_branch,
         losses_per_dup_branch, refbranch_supports,
         coll_dup_per_branch, coll_losses_per_branch,
         coll_losses_per_dup_branch, coll_refbranch_supports) = target_dicts
    else:
        MONITOR_STEP = args.snapshot_step
        data_iter = tree_iterator(args.source_trees, restrict_species=REFTREE_SPECIES)

        (informed_branches, dup_per_branch, losses_per_branch,
         losses_per_dup_branch, refbranch_supports,
         coll_dup_per_branch, coll_losses_per_branch,
         coll_losses_per_dup_branch,
         coll_refbranch_supports) = process_trees(data_iter, reftree, TOTAL_TREES)

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE.close()

    dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch,
                 losses_per_dup_branch, refbranch_supports,
                 coll_dup_per_branch, coll_losses_per_branch,
                 coll_losses_per_dup_branch, coll_refbranch_supports)

    sys.stderr.write("Dumping full analysis...\n")
    # Full dump, including duplication details; the context manager makes
    # sure the pickle is flushed and closed.
    with open("%s.pkl" % args.output, "w") as pickle_file:
        cPickle.dump(reftree, pickle_file)
Exemplo n.º 12
0
def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
    
    parser.add_argument("--show", dest="show_tree",
                        action="store_true", 
                        help="""Display tree after the analysis.""")
    
    parser.add_argument("--render", dest="render",
                        action="store_true", 
                        help="""Render tree.""")

    parser.add_argument("--dump", dest="dump",
                        action="store_true", 
                        help="""Dump analysis""")

    parser.add_argument("--explore", dest="explore",
                        type=str,
                        help="""Reads a previously analyzed tree and visualize it""")
    
    input_args = parser.add_mutually_exclusive_group()
    input_args.required=True
    input_args.add_argument("-t", "--tree", dest="target_tree",  nargs="+",
                        type=str,
                        help="""Tree file in newick format""")

    input_args.add_argument("-tf", dest="tree_list_file",
                        type=str, 
                        help="File with the list of tree files")
    
    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str,
                        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument("--sp_field", dest="sp_field", type=int, default=0,
                        help="field position for taxid after splitting leaf names")
    
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only", dest="rf_only",
                        action = "store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument("--outgroup", dest="outgroup",
                        type=str, nargs="+",
                        help="A list of node names defining the trees outgroup")
    
    parser.add_argument("--is_sptree", dest="is_sptree",
                        action = "store_true",
                        help="Assumes no duplication nodes in the tree")
    
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str,
                        help="")
    
    parser.add_argument("--tax2track", dest="tax2track", type=str,
                        help="")
    
    parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true",
                        help="")
    
    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None
    
    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >>sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout 
        ts.mode = "r"
        t.show(tree_style=ts)
        print >>sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()
    
    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >>sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    print >>OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >>sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

        
    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)
        
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
       
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
            
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

          
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE)
            
            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0
            
        ndups = 0
        nsubtrees = len(subtrees)
           
        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf
            
        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0 # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" %ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                       rf_std, rf_max, common_names] 
        print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) 
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems =  sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems), "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" %fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout 
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1
                    
                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))
            
    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER)
            
    if args.output:
        OUT.close()