示例#1
0
def tree_iterator(fname, restrict_species=None, start_line=None, end_line=None):
    """Yield parsed trees from a tab-separated ``treeid<TAB>newick`` file.

    Line numbers are 0-based and count every physical line of the file,
    including comment and blank lines.

    Args:
        fname: path of the tree list file.
        restrict_species: optional container of species codes. When truthy,
            every tree is pruned down to the leaves whose ``species``
            attribute is in this container.
        start_line: if not None, lines with an index lower than this value
            are skipped.
        end_line: if not None, reading stops at the first line whose index
            is greater than or equal to this value.

    Yields:
        ``(treeid, tree, n2content)`` tuples, where ``n2content`` is the
        value returned by ``tree.get_cached_content()``. Trees whose root
        entry in that mapping has fewer than two items are skipped.

    Raises:
        ValueError: if a data line does not split into exactly two
            tab-separated fields.
    """
    # The context manager guarantees the handle is released when the
    # generator is exhausted, stops at ``end_line`` or is closed early; the
    # previous bare ``open()`` left that to the garbage collector.
    with open(fname) as tree_file:
        for ln, line in enumerate(tree_file):
            if start_line is not None and ln < start_line:
                continue
            elif end_line is not None and ln >= end_line:
                break

            # Comments and blank lines carry no tree.
            if line.startswith("#") or not line.strip():
                continue

            treeid, newick = line.split("\t")
            t = PhyloTree(newick, sp_naming_function=extract_species)
            if restrict_species:
                t.prune([n for n in t.iter_leaves()
                         if n.species in restrict_species])

            n2content = t.get_cached_content()
            if len(n2content[t]) < 2:
                continue
            yield treeid, t, n2content
示例#2
0
def tree_iterator(fname,
                  restrict_species=None,
                  start_line=None,
                  end_line=None):
    """Iterate over the trees stored in a ``treeid<TAB>newick`` list file.

    Every physical line of the file (comments and blank lines included)
    counts towards the 0-based line index used by ``start_line`` and
    ``end_line``.

    Args:
        fname: path of the file to read.
        restrict_species: optional container of species codes; when truthy,
            each tree is pruned to the leaves whose ``species`` attribute
            belongs to it.
        start_line: when not None, lines whose index is below this value
            are ignored.
        end_line: when not None, iteration ends at the first line whose
            index reaches this value.

    Yields:
        ``(treeid, tree, n2content)`` for each tree whose root has at least
        two items in the mapping returned by ``tree.get_cached_content()``.

    Raises:
        ValueError: when a data line does not contain exactly one tab.
    """
    # Use a context manager so the file is closed deterministically, even
    # when the consumer abandons the generator or ``end_line`` is reached.
    with open(fname) as handle:
        for ln, line in enumerate(handle):
            if start_line is not None and ln < start_line:
                continue
            elif end_line is not None and ln >= end_line:
                break

            # Skip comment lines and lines holding only whitespace.
            if line.startswith("#") or not line.strip():
                continue

            treeid, newick = line.split("\t")
            t = PhyloTree(newick, sp_naming_function=extract_species)
            if restrict_species:
                t.prune([
                    n for n in t.iter_leaves()
                    if n.species in restrict_species
                ])

            n2content = t.get_cached_content()
            if len(n2content[t]) < 2:
                continue
            yield treeid, t, n2content
示例#3
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    parser.add_argument("--show",
                        dest="show_tree",
                        action="store_true",
                        help="""Display tree after the analysis.""")

    parser.add_argument("--render",
                        dest="render",
                        action="store_true",
                        help="""Render tree.""")

    parser.add_argument("--dump",
                        dest="dump",
                        action="store_true",
                        help="""Dump analysis""")

    parser.add_argument(
        "--explore",
        dest="explore",
        type=str,
        help="""Reads a previously analyzed tree and visualize it""")

    input_args = parser.add_mutually_exclusive_group()
    input_args.required = True
    input_args.add_argument("-t",
                            "--tree",
                            dest="target_tree",
                            nargs="+",
                            type=str,
                            help="""Tree file in newick format""")

    input_args.add_argument("-tf",
                            dest="tree_list_file",
                            type=str,
                            help="File with the list of tree files")

    parser.add_argument("--tax",
                        dest="tax_info",
                        type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument(
        "--sp_delimiter",
        dest="sp_delimiter",
        type=str,
        help=
        "If taxid is part of the leaf name, delimiter used to split the string"
    )

    parser.add_argument(
        "--sp_field",
        dest="sp_field",
        type=int,
        default=0,
        help="field position for taxid after splitting leaf names")

    parser.add_argument("--ref",
                        dest="ref_tree",
                        type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only",
                        dest="rf_only",
                        action="store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        type=str,
        nargs="+",
        help="A list of node names defining the trees outgroup")

    parser.add_argument("--is_sptree",
                        dest="is_sptree",
                        action="store_true",
                        help="Assumes no duplication nodes in the tree")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str, help="")

    parser.add_argument("--tax2track", dest="tax2track", type=str, help="")

    parser.add_argument("--dump_tax_info",
                        dest="dump_tax_info",
                        action="store_true",
                        help="")

    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None

    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >> sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout
        ts.mode = "r"
        t.show(tree_style=ts)
        print >> sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()

    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >> sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Clade sizes", "RF (avg)", "RF (med)",
              "RF (std)", "RF (max)", "Shared tips")
    print >> OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >> sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)

        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")

            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t

        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(
                t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(
                t, None, tax2name, tax2track)

        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees = t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(
                t, subtrees, show_tree=SHOW_TREE)

            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0

        ndups = 0
        nsubtrees = len(subtrees)

        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf

        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(
                map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0  # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" % ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(
                        set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append(
                        (partial_rf[0] / float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size)  # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" % (numpy.mean(broken_sizes),
                                               numpy.median(broken_sizes),
                                               numpy.std(broken_sizes))
        iter_values = [
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, rf, rf_med, rf_std,
            rf_max, common_names
        ]
        print >> OUT, '|'.join(
            map(lambda x: str(x).strip().ljust(15), iter_values))
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems = sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems),
                                "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" % fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %
                  problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([
            os.path.basename(tfile), nsubtrees, ndups, broken_subtrees,
            ncbi_mistakes, broken_branches, sizes_info, fixed_string,
            problems_string
        ])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1

                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))

    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees",
              "Broken clades", "Broken branches", "Clade sizes",
              "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width=50, row_line=True, header=HEADER)

    if args.output:
        OUT.close()
示例#4
0
文件: ete_view.py 项目: daisieh/ete
def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    input_gr = parser.add_argument_group("TREE INPUT OPTIONS\n=================")
    
    input_gr.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                      help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml", dest="raxml", 
                        action="store_true",
                        help="""Process newick as raxml bootstrap values""")
    
    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")
        
    img_gr.add_argument("-m", "--mode", dest="mode", 
                        choices=["c", "r"], default="r",
                        help="""(r)ectangular or (c)ircular visualization""")
  

    img_gr.add_argument("-i", "--image", dest="image", 
                        type=str, 
                        help="Render tree image instead of showing it. A filename "
                        " should be provided. PDF, SVG and PNG file extensions are"
                        " supported (i.e. -i tree.svg)"
                        )

    img_gr.add_argument("--Iw", "--width", dest="width", 
                        type=int, default=0, 
                        help="width of the rendered image in pixels (see --size-units)."
                        )

    img_gr.add_argument("--Ih", "--height", dest="height", 
                        type=int, default=0,
                        help="height of the rendered image in pixels (see --size-units)."
                        )

    img_gr.add_argument("--Ir", "--resolution", dest="resolution", 
                        type=int, default=300,
                        help="Resolution if the tree image (DPI)"
                        )

    img_gr.add_argument("--Iu", "--size-units", dest="size_units", 
                        choices=["px", "mm", "in"], default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). "
                        )

    img_gr.add_argument("-mbs", "--min-branch-separation", dest="branch_separation", 
                        type=int, default = 3, 
                        help="Min number of pixels to separate branches vertically."
                        )

    img_gr.add_argument("--ss", "--show-support", dest="show_support", 
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl", "--branch-length", dest="show_branch_length", 
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument("--ft", "--force-topology", dest="force_topology", 
                        action="store_true",
                        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln", "--hide-leaf-names", dest="hide_leaf_names", 
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument("--sin", "--show-internal-names", dest="show_internal_names", 
                        action="store_true",
                        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")
    
    edit_gr.add_argument("-r", "--root", dest="root", 
                         type=str, nargs="*",
                         help="Roots the tree to the node grouping the list"
                         " of node names provided (space separated). In example:"
                         "'--root human rat mouse'")
    
    edit_gr.add_argument("-s", "--sort-branches", dest="sort", 
                        action="store_true",
                        help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l", "--ladderize", dest="ladderize", 
                        action="store_true",
                        help="""Sort branches by partition size.""")
    
    edit_gr.add_argument("--color_by_rank", dest="color_by_rank", 
                         type=str, nargs="+",
                         help="""If the attribute rank is present in nodes """)
    
    phylo_gr = parser.add_argument_group("PHYLOGENETIC OPTIONS\n=================")
    
    phylo_gr.add_argument("--alg", dest="alg", 
                        type=str, 
                        help="""Multiple sequence alignment.""")

    phylo_gr.add_argument("--alg-format", dest="alg_format", 
                        type=str, default="fasta",
                        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")
    
    phylo_gr.add_argument("--sp-discovery", dest="species_discovery_regexp", 
                          type=str, default="^[^_]+_(.+)",
                          help="Perl regular expression to capture species"
                          " code from node names. By default, node names"
                          " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")
        
    phylo_gr.add_argument("--dump-subtrees", dest="subtrees_output_file", 
                          type=str, 
                          help="Returns a file containing all possible species subtrees"
                               " contained in a given gene tree ")

    phylo_gr.add_argument("--newick", dest="newick", 
                          type=str,
                          help="dumps newick file after applying editing options")

    
    args = parser.parse_args(argv)

    tfile = args.tree[0]


    if args.ladderize and args.sort:
        raise ValueError("--sort-branches and --ladderize options are mutually exclusive")
    
    if args.raxml:
        nw = re.sub(":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]", open(tfile).read())
        t = PhyloTree(nw)
        #for n in t.traverse():
            #n.support = getattr(n, "bootstrap", -1)
            #
    else:
        t = PhyloTree(tfile)
        
    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1
        
    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)
        
    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO

    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        print >>sys.stderr, "Found %d duplication nodes. Dumping %d sutrees..." %(ndups, ntrees)
        OUT = open(args.subtrees_output_file, "w")
        for tree in treeiter:
            print >>OUT, tree.write()
        OUT.close()

    # VISUALIZATION
        
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True
        
    if args.hide_leaf_names:
        del LEAF_ATTRIBUTES["name"]

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree
    if not args.height: 
        args.height = None
    if not args.width: 
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        t.render(args.image, tree_style=ts, w=args.width, h=args.height, units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        print "Processed Newick dumped into", args.newick
示例#5
0
def main(argv):
    global args

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-r",
                        dest="reftree",
                        type=str,
                        required=True,
                        help="""Reference tree""")

    parser.add_argument(
        "--source_trees",
        dest="source_trees",
        type=str,
        required=True,
        help=
        ("A list of *rooted* genetrees, one per line, in the format: TreeID/SeedID [TAB] newick "
         ))

    parser.add_argument("--plot_newick",
                        dest="plot_newick",
                        type=str,
                        help=(""))

    parser.add_argument("--spname_delimiter",
                        dest="spname_delimiter",
                        type=str,
                        default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument(
        "--spname_field",
        dest="spname_field",
        type=int,
        default=-1,
        help=
        ("position of the species code extracted from node names. -1 = last field"
         ))

    parser.add_argument(
        "--collateral",
        dest="use_collateral",
        action="store_true",
        help=("If enabled, collateral information will be used as"
              " equally qualified data. Otherwise, such data will"
              " be reported separatedly. Use this if your set of"
              " trees are not overlaping. "))

    parser.add_argument(
        "--skip_dup_detection",
        dest="skip_dup_detection",
        action="store_true",
        help=('If used, duplications will be expected to be annotated'
              ' in the source gene trees with the evoltype="D" tag.'
              ' Otherwise they will be inferred on the fly using'
              ' the species overlap algorithm.'))

    parser.add_argument(
        "--spoverlap",
        dest="species_overlap",
        type=float,
        default=0.0,
        help=("Species overlap cutoff. A number between 0 and 1 "
              "representing the percentage of species that should be "
              "shared between two sister partitions to be considered a"
              " duplication. 0 = any overlap represents a duplication. "))

    parser.add_argument(
        "--debug",
        dest="debug",
        action="store_true",
        help=
        ("generate an image of every input gene tree tree, so the result can be inspected"
         ))

    parser.add_argument(
        "--snapshot_step",
        dest="snapshot_step",
        type=int,
        default=1000,
        help=("How many trees should be processed between snapshots dumps?"))

    parser.add_argument(
        "--reftree_constraint",
        dest="reftree_constraint",
        type=str,
        help=("A python module from from which a function called "
              "*is_valid_treeid(treeid, refbranch)* should be importable. "
              "The function will be used to decide if the info of a given "
              "source tree is informative or not for each reftree branch. "))

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        required=True,
                        help=("output tag name (extensions will be added)"))

    parser.add_argument("--cpu",
                        dest="cpu",
                        type=int,
                        default=1,
                        help=("enable parallel computation"))

    parser.add_argument(
        "--img_report",
        dest="img_report",
        action="store_true",
        help=
        ("If true, it generates a summary image results with all the computed data"
         ))

    parser.add_argument(
        "--report_supports",
        dest="report_supports",
        action="store_true",
        help=
        ("If used, supported ref tree branches are individually reported for each gene tree "
         ))

    args = parser.parse_args(argv)
    if args.plot_newick:
        t = Tree(args.plot_newick)
        ts = TreeStyle()
        ts.layout_fn = info_layout
        t.render("tree_analysis.png", tree_style=ts)
        sys.exit(0)

    SPNAME_FIELD, SPNAME_DELIMITER = args.spname_field, args.spname_delimiter
    USE_COLLATERAL = args.use_collateral
    DETECT_DUPLICATIONS = True if not args.skip_dup_detection else False
    REPORT_PER_TREE_SUPPORTS = True if args.report_supports else False
    SP_OVERLAP = args.species_overlap
    DEBUG = args.debug
    IMG_REPORT = args.img_report
    reftree = PhyloTree(args.reftree, sp_naming_function=None)
    for nid, n in enumerate(reftree.traverse()):
        n.add_features(nid=nid)
    REFTREE_SPECIES = set(reftree.get_leaf_names())
    print __DESCRIPTION__

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE = open("%s.gentree_supports" % args.output, "w")
        print >> REPORT_SUPPORT_FILE, '#' + '\t'.join(
            map(str, [
                "treeId", "spCoverage", "mean_support", "mean_coll_support",
                "tested_branches", 'tested_coll_branches'
            ]))

    TOTAL_TREES = int(
        commands.getoutput("wc -l %s" % args.source_trees).split()[0]) + 1
    print >> sys.stderr, "Processing %d source trees" % TOTAL_TREES
    if args.reftree_constraint:
        import imp
        constraint = imp.load_source('constraint', args.reftree_constraint)
        IS_VALID_TREEID = constraint.is_valid_treeid
    else:
        IS_VALID_TREEID = None

    if args.cpu > 1:
        MONITOR_STEP = 0
        #return (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
        #       coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports)
        # The output of the process_trees function are 9 dictionaries in which keys are refbranches
        target_dicts = [{} for x in range(9)]

        def merge_dict_results(target, source):
            def merge_dict(target, source):
                for k, v in source.iteritems():
                    if k not in target:
                        target[k] = v
                    elif isinstance(v, list):
                        target[k].extend(v)
                    elif isinstance(v, set):
                        target[k].update(v)
                    elif isinstance(v, int):
                        target[k] += v
                    else:
                        raise ValueError("Impossible to merge str results")

            for index in xrange(len(target)):
                merge_dict(target[index], out[index])

        from multiprocessing import Process, Queue
        from Queue import Empty as QueueEmpty
        outputs_queue = Queue()
        if TOTAL_TREES > args.cpu:
            trees_per_cpu = TOTAL_TREES / args.cpu
            trees_per_cpu += 1 if TOTAL_TREES % args.cpu else 0
        else:
            trees_per_cpu = 1
            args.cpu = TOTAL_TREES

        all_workers = set()
        for cpu_num in xrange(args.cpu):
            sline = (cpu_num * trees_per_cpu)
            eline = (cpu_num * trees_per_cpu) + trees_per_cpu
            data_iter = tree_iterator(args.source_trees,
                                      restrict_species=REFTREE_SPECIES,
                                      start_line=sline,
                                      end_line=eline)
            print >> sys.stderr, "Launching worker %d from %d to %d" % (
                cpu_num, sline, eline)
            worker = Process(target=run_parallel,
                             args=(cpu_num, outputs_queue, process_trees,
                                   data_iter, reftree, trees_per_cpu))
            worker.name = "Worker_%d" % cpu_num
            all_workers.add(worker)
            worker.start()

        while all_workers:
            # clear done threads
            for w in list(all_workers):
                if not w.is_alive():
                    print >> sys.stderr, "%s thread is done!" % w.name
                    all_workers.discard(w)
            # get and merge results
            while 1:
                try:
                    out = outputs_queue.get(False)
                except QueueEmpty:
                    break
                else:
                    # This merge depends on process_trees return output!!!!!
                    merge_dict_results(target_dicts, out)
                    # Dump a snapshot
                    dump_results(reftree, *target_dicts)
                time.sleep(0.1)
            if all_workers:
                time.sleep(1)
        # collected data
        (informed_branches, dup_per_branch, losses_per_branch,
         losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
         coll_losses_per_branch, coll_losses_per_dup_branch,
         coll_refbranch_supports) = target_dicts
    else:
        MONITOR_STEP = args.snapshot_step
        data_iter = tree_iterator(args.source_trees,
                                  restrict_species=REFTREE_SPECIES)

        (informed_branches, dup_per_branch, losses_per_branch,
         losses_per_dup_branch, refbranch_supports, coll_dup_per_branch,
         coll_losses_per_branch, coll_losses_per_dup_branch,
         coll_refbranch_supports) = process_trees(data_iter, reftree,
                                                  TOTAL_TREES)

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE.close()

    dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch,
                 losses_per_dup_branch, refbranch_supports,
                 coll_dup_per_branch, coll_losses_per_branch,
                 coll_losses_per_dup_branch, coll_refbranch_supports)

    print >> sys.stderr, "Dumping full analysis..."
    # Full dump, including duplication details
    cPickle.dump(reftree, open("%s.pkl" % args.output, "w"))
示例#6
0
def get_supported_branches(source_tree, reftree, refclades, seedid=None):
    """
        Given a reference species tree and a rooted gene tree in which
        duplication events are already mapped, this function does the following:

          - Split gene tree into all possible species trees (Treeko method)

          - Find matches between each subtree branch and all branches in the
            reference tree.

              - Each branch in each species subtree is compared to all branches
                in the reftree. If the left/right sides of the subtree branch
                coincide with the left/right sides of a reference tree branch,
                this is considered a gene tree support point. Coincidences must
                comply with the following conditions:

                   - All species in the left/right sides of the subtree branch
                     exist in the left/right sides of the reference branch.

                   - Species in the left/right sides of the reference branch are
                     never mixed in the subtree branch.

                   - Missing species are allowed in the subtree split, only if
                     such species are not present in any other part of the
                     species subtree being evaluated.

        :param source_tree: gene tree object providing
            ``get_speciation_trees()``.
        :param reftree: reference species tree. It is not read by this
            function; the branches to test come from ``refclades``.
        :param refclades: collection of ``(refnode, m1, m2)`` tuples, where
            ``m1`` and ``m2`` are the species sets found at each side of the
            reference branch ``refnode``. It is iterated once per internal
            node of every species subtree, so it must be re-iterable (a
            list, not a generator).
        :param seedid: optional string. When given, species subtrees whose
            newick text does not contain it are scored in the second
            (collateral) result instead of the first one.

        :returns: ``(branch2supports, branch2coll_supports)``. Both map a
            reference node to a list holding one value per evaluated split:
            1 when the split supports the reference branch, 0 when it
            conflicts with it. Splits sharing nothing with a reference
            branch add no value. ``({}, {})`` is returned when
            ``get_speciation_trees`` reports more than 100000 trees.
    """

    # Run Treeko to get all possible species tree combinations. We assume
    # dups are already mapped
    ntrees, ndups, sp_trees = source_tree.get_speciation_trees(
        autodetect_duplications=DETECT_DUPLICATIONS, newick_only=True)
    if ntrees > 100000:
        # Too many combinations to evaluate: give up on this gene tree
        return {}, {}

    branch2supports = defaultdict(list)
    branch2coll_supports = defaultdict(list)
    for nw in sp_trees:
        # Use all treeko trees or only those subtrees containing the seed?
        if seedid and seedid not in nw:
            container = branch2coll_supports
        else:
            container = branch2supports

        subtree = PhyloTree(nw, sp_naming_function=extract_species)
        subtreenode2content = subtree.get_cached_content(store_attr="species")
        all_sp_in_subtree = subtreenode2content[subtree]

        # Visit all internal nodes in the tree
        for n in subtree.traverse("preorder"):
            if n.is_leaf():
                continue

            # Only the first two children are considered, so splits are
            # evaluated as if the subtree were binary
            c1 = subtreenode2content[n.children[0]]
            c2 = subtreenode2content[n.children[1]]
            # Species observed under this split; it does not depend on the
            # reference clade, so it is computed once per node
            all_seen_sp = c1 | c2

            for refnode, m1, m2 in refclades:
                all_expected_sp = m1 | m2

                # We add one supporting point to every observed split that
                # coincides with a reference tree branch. This is, seqs in one
                # side and seqs on the other side of the observed split match
                # a ref_tree branch without having extra seqs in any of the
                # sides. However, we allow for split matches where some seqs
                # are lost in the observed split.
                notfound, found = 0, 0

                # Expected species that are absent from the split but do
                # exist somewhere else in this species subtree
                false_missing = (all_expected_sp -
                                 all_seen_sp) & all_sp_in_subtree
                # Observed species that do not belong to the reference branch
                outside_species = (all_seen_sp - all_expected_sp)

                # Compare expected (m1,m2) splits with observed splits (c1,c2).
                a_straight = m1 & c1
                b_straight = m2 & c2
                a_cross = m1 & c2
                b_cross = m2 & c1

                # if matches are found for the first possible comparison
                if (a_straight and b_straight):
                    # and the match contains all the observed species, species
                    # from both sides are not mixed and missing species are real
                    if not outside_species and not a_cross and not b_cross and not false_missing:
                        found += 1
                    else:
                        notfound += 1

                # if matches are found for the second possible comparison (This
                # would never occur if found variable was increased in the
                # previous if)
                if (a_cross and b_cross):
                    # and the match contains all the observed species, species
                    # from both sides are not mixed and missing species are real
                    if not outside_species and not a_straight and not b_straight and not false_missing:
                        found += 1
                    else:
                        notfound += 1

                # A single conflict is enough to count the split against
                # the reference branch
                if notfound > 0:
                    container[refnode].append(0)
                elif found > 0:
                    container[refnode].append(1)

                if found == 2:
                    # NOTE(review): interactive debugging stop; it blocks
                    # the process waiting for stdin if it is ever reached.
                    raw_input(
                        "Two possible matches? This should never occur!!")

    return branch2supports, branch2coll_supports
示例#7
0
def main(argv):
    
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",  dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")
    
    parser.add_argument("-t", "--taxid", dest="taxid", nargs="+",  
                        type=int, 
                        help="""taxids (space separated)""")

    parser.add_argument("-tf", "--taxid_file", dest="taxid_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-r", "--reftree", dest="reftree",   
                        type=str, 
                        help="""tree file containing taxids as node names.""")
    
    parser.add_argument("--reftree_attr", dest="reftree_attr",   
                        type=str, default="name",
                        help="""Where taxid should be read from""")
    
    parser.add_argument("-n", "--name", dest="names", nargs="+",  
                        type=str, 
                        help="""species or taxa names (comma separated)""")

    parser.add_argument("-nf", "--names_file", dest="names_file",   
                        type=str, 
                        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-x", "--taxonomy", dest="taxonomy",   
                        action="store_true",
                        help=("returns a pruned version of the NCBI taxonomy"
                              " tree containing target species"))

    parser.add_argument("--show_tree", dest="show_tree",   
                        action="store_true",
                        help="""shows the NCBI taxonomy tree of the provided species""")
    
    parser.add_argument("--collapse_subspecies", dest="collapse_subspecies",   
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    parser.add_argument("--rank_limit", dest="rank_limit",   
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))
    
    parser.add_argument("--full_lineage", dest="full_lineage",   
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))
        
    parser.add_argument("-i", "--info", dest="info",   
                        action="store_true",
                        help="""shows NCBI information about the species""")

    parser.add_argument("--fuzzy", dest="fuzzy", type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))
   
    
    args = parser.parse_args(argv)
    if not args.taxonomy and not args.info and not args.reftree:
        parser.print_usage()
        sys.exit(0)
    
    if args.fuzzy:
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)
    

        
    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(map(strip, open(args.names_file, "rU").read().split("\n")))
    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))
    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                c.enable_load_extension(True)
                c.execute("select load_extension('%s')" % os.path.join(module_path,
                                            "SQLite-Levenshtein/levenshtein.sqlext"))
                tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" %sim
                    
        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(map(str, [score, name, realname.capitalize(), taxid]))
            
    if args.taxid_file:
        all_taxids.extend(map(strip, open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)
        
    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(list(set([getattr(n, args.reftree_attr) for n in reftree.iter_leaves()])))

       
    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" %len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print "\t".join(map(str, [merge_conversion.get(int(taxid), taxid), name, named_lineage, lineage ]))
            
        for notfound in set(map(str, all_taxids)) - set(str(k) for k in translator.iterkeys()):
            print >>sys.stderr, notfound, "NOT FOUND"
            
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" %len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
            
        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse() if n.rank == "species"
                             if int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" %n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")
                
        if args.show_tree:
            t.show()
            
        print "\n\n  ===== Newick files saved as 'your_taxa_query.*' ===== "
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9, features=["taxid", "name", "rank", "bgcolor", "sci_name", "collapse_subspecies", "named_lineage"],
                outfile="your_ncbi_query.extended.nw")
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name = translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)
            
        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
示例#8
0
def get_supported_branches(source_tree, reftree, refclades, seedid=None):
    """
        Given a reference species tree and a rooted gene tree in which
        duplication events are already mapped, this function does the following:

          - Split gene tree into all possible species trees (Treeko method)

          - Find matches between each subtree branch and all branches in the
            reference tree.

              - Each branch in each species subtree is compared to all branches
                in the reftree. If the left/right sides of the subtree branch
                coincide with the left/right sides of a reference tree branch,
                this is considered a gene tree support point. Coincidences must
                comply with the following conditions:

                   - All species in the left/right sides of the subtree branch
                     exist in the left/right sides of the reference branch.

                   - Species in the left/right sides of the reference branch are
                     never mixed in the subtree branch.

                   - Missing species are allowed in the subtree split, only if
                     such species are not present in any other part of the
                     species subtree being evaluated.

        Arguments:

          - source_tree: gene tree object providing get_speciation_trees().

          - reftree: reference species tree. Not read here; reference
            branches are taken from refclades.

          - refclades: collection of (refnode, m1, m2) tuples, m1 and m2
            being the species sets at both sides of the refnode branch. It
            is walked once per internal node of each species subtree, so a
            one-shot generator is not valid.

          - seedid: optional string. Species subtrees whose newick text
            lacks it are scored in the collateral result.

        Returns a (branch2supports, branch2coll_supports) tuple. Both map
        reference nodes to lists of 1 (supporting split) and 0 (conflicting
        split) values. ({}, {}) is returned when get_speciation_trees
        reports more than 100000 trees.
    """

    # Run Treeko to get all possible species tree combinations. We assume dups are already mapped
    ntrees, ndups, sp_trees = source_tree.get_speciation_trees(autodetect_duplications=DETECT_DUPLICATIONS, newick_only=True)
    if ntrees > 100000:
        # Combinatorial explosion: skip this gene tree
        return {}, {}

    branch2supports = defaultdict(list)
    branch2coll_supports = defaultdict(list)
    for nw in sp_trees:
        # Use all treeko trees or only those subtrees containing the seed?
        if seedid and seedid not in nw:
            container = branch2coll_supports
        else:
            container = branch2supports

        subtree = PhyloTree(nw, sp_naming_function = extract_species)
        subtreenode2content = subtree.get_cached_content(store_attr="species")
        all_sp_in_subtree = subtreenode2content[subtree]

        # Visit all nodes in the tree; leaves define no split
        for n in subtree.traverse("preorder"):
            if n.is_leaf():
                continue

            # Only the first two children are read: splits are treated as
            # binary
            c1 = subtreenode2content[n.children[0]]
            c2 = subtreenode2content[n.children[1]]
            # Invariant for every reference clade, so computed only once
            all_seen_sp = c1|c2

            for refnode, m1, m2 in refclades:
                all_expected_sp = m1 | m2

                # We add one supporting point to every observed split that coincides
                # with a reference tree branch. This is, seqs in one side and seqs
                # on the other side of the observed split matches a ref_tree branch
                # without having extra seqs in any of the sides. However, we allow
                # for split matches where some seqs are lost in the observed split.
                notfound, found = 0, 0

                # Expected species missing from the split although they are
                # present elsewhere in this species subtree
                false_missing = (all_expected_sp - all_seen_sp) & all_sp_in_subtree
                # Species in the split that are foreign to the reference branch
                outside_species = (all_seen_sp - all_expected_sp)

                # Compare expected (m1,m2) splits with observed splits (c1,c2).
                a_straight = m1 & c1
                b_straight = m2 & c2
                a_cross = m1 & c2
                b_cross = m2 & c1

                # if matches are found for the first possible comparison
                if (a_straight and b_straight):
                    # and the match contains all the observed species, species
                    # from both sides are not mixed and missing species are real
                    if not outside_species and not a_cross and not b_cross and not false_missing:
                        found += 1
                    else:
                        notfound += 1

                # if matches are found for the second possible comparison (This
                # would never occur if found variable was increased in the
                # previous if)
                if (a_cross and b_cross):
                    # and the match contains all the observed species, species
                    # from both sides are not mixed and missing species are real
                    if not outside_species and not a_straight and not b_straight and not false_missing:
                        found += 1
                    else:
                        notfound += 1

                # Any conflict outweighs a match for this split
                if notfound > 0:
                    container[refnode].append(0)
                elif found > 0:
                    container[refnode].append(1)

                if found == 2:
                    # NOTE(review): debugging leftover; waits for stdin if
                    # this point is ever reached.
                    raw_input("Two possible matches? This should never occur!!")

    return branch2supports, branch2coll_supports
示例#9
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")

    input_gr.add_argument(
        'tree',
        metavar='tree_file',
        type=str,
        nargs=1,
        help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml",
                          dest="raxml",
                          action="store_true",
                          help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m",
                        "--mode",
                        dest="mode",
                        choices=["c", "r"],
                        default="r",
                        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument(
        "-i",
        "--image",
        dest="image",
        type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")

    img_gr.add_argument("--text",
                        dest="text_mode",
                        action="store_true",
                        help="Shows the tree using ASCII characters")

    img_gr.add_argument(
        "--attr",
        "--show_attributes",
        dest="show_attributes",
        nargs="+",
        help="Display the value of the specified attributes, if available")

    img_gr.add_argument(
        "--Iw",
        "--width",
        dest="width",
        type=int,
        default=0,
        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument(
        "--Ih",
        "--height",
        dest="height",
        type=int,
        default=0,
        help="height of the rendered image in pixels (see --size-units).")

    img_gr.add_argument("--Ir",
                        "--resolution",
                        dest="resolution",
                        type=int,
                        default=300,
                        help="Resolution if the tree image (DPI)")

    img_gr.add_argument("--Iu",
                        "--size_units",
                        dest="size_units",
                        choices=["px", "mm", "in"],
                        default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument(
        "-mbs",
        "--min_branch_separation",
        dest="branch_separation",
        type=int,
        default=3,
        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument("--ss",
                        "--show_support",
                        dest="show_support",
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl",
                        "--show_branch_length",
                        dest="show_branch_length",
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument(
        "--ft",
        "--force_topology",
        dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln",
                        "--hide_leaf_names",
                        dest="hide_leaf_names",
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument(
        "--sin",
        "--show_internal_names",
        dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument(
        "-r",
        "--root",
        dest="root",
        type=str,
        nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    edit_gr.add_argument("-s",
                         "--sort_branches",
                         dest="sort",
                         action="store_true",
                         help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l",
                         "--ladderize",
                         dest="ladderize",
                         action="store_true",
                         help="""Sort branches by partition size.""")

    edit_gr.add_argument("--color_by_rank",
                         dest="color_by_rank",
                         type=str,
                         nargs="+",
                         help="""If the attribute rank is present in nodes """)

    edit_gr.add_argument(
        "--ncbi",
        dest="ncbi",
        action="store_true",
        help=""" Annotate tree using the NCBI taxonomy database""")

    edit_gr.add_argument(
        "--taxid_attr",
        dest="taxid_attr",
        type=str,
        default="name",
        help="node attribute encoding for valid taxid numbers.")

    edit_gr.add_argument(
        "--taxid_attr_regexp",
        dest="taxid_attr_regexp",
        type=str,
        help=
        "If taxid number is encoded as part of another text string, i.e. gene name, use this argument to define a Perl regular expression to extract taxid numbers."
    )

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument(
        "--alg",
        dest="alg",
        type=str,
        help="""Link tree to a multiple sequence alignment.""")

    phylo_gr.add_argument(
        "--alg_format",
        dest="alg_format",
        type=str,
        default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument(
        "--sp_discovery",
        dest="species_discovery_regexp",
        type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression used to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        nw = re.sub(":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    open(tfile).read())
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.ncbi:
        if args.taxid_attr_regexp:
            TAXIDMATCHER = re.compile(args.taxid_attr_regexp)

        for lf in t:
            if args.taxid_attr_regexp:
                lf.taxid = re.search(TAXIDMATCHER,
                                     getattr(lf, args.taxid_attr)).groups()[0]
            else:
                lf.taxid = getattr(lf, args.taxid_attr)
        t.annotate_ncbi_taxa(taxid_attr="taxid")

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # VISUALIZATION

    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        del LEAF_ATTRIBUTES["name"]

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    if args.text_mode:
        print t.get_ascii(show_internal=args.show_internal_names,
                          attributes=args.show_attributes)
    else:
        ts.layout_fn = master_layout
        if args.image:
            t.render(args.image,
                     tree_style=ts,
                     w=args.width,
                     h=args.height,
                     units=args.size_units)
        else:
            t.show(None, tree_style=ts)
示例#10
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('tree',
                        metavar='tree_file',
                        type=str,
                        nargs=1,
                        help='A tree file (or text string) in newick format.')

    parser.add_argument(
        "--sp_delimiter",
        dest="species_delimiter",
        type=str,
        default="_",
        help=("When species names are guessed from node names,"
              " this argument specifies how to split node name to guess"
              " the species code"))

    parser.add_argument(
        "--sp_field",
        dest="species_field",
        type=int,
        default=1,
        help=("When species names are guessed from node names,"
              " this argument specifies the position of the species"
              " name code relative to the name splitting delimiter"))

    parser.add_argument(
        "--root",
        dest="root",
        type=str,
        nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    parser.add_argument(
        "--skip_ortholog_detection",
        dest="skip_ortholog_detection",
        action="store_true",
        help=
        ("Skip automatic detection of"
         " speciation and duplication events, thus relying in the"
         " correct annotation of the provided tree using"
         " the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"
         ))

    parser.add_argument(
        "--evoltype_attr",
        dest="evoltype_attr",
        type=str,
        default="evoltype",
        help=(
            "When orthology detection is disabled,"
            " the attribute name provided here will be expected to exist"
            " in all internal nodes and read from the extended newick format"))

    parser.add_argument("--database",
                        dest="database",
                        type=str,
                        default="",
                        help=("Database name"))

    parser.add_argument(
        "--show",
        dest="show",
        action="store_true",
        default="",
        help=(
            "Show the tree and its evolutionary events before orthoXML export"
        ))

    parser.add_argument(
        "--ascii",
        dest="ascii",
        action="store_true",
        default="",
        help=(
            "Show the tree using ASCII representation and all its evolutionary"
            " events before orthoXML export"))

    parser.add_argument(
        "--newick",
        dest="newick",
        action="store_true",
        default="",
        help=("print the extended newick format for provided tree using"
              " ASCII representation and all its evolutionary events"
              " before orthoXML export"))

    args = parser.parse_args()
    newick = args.tree[0]

    SPECIES_NAME_POS = args.species_field
    SPECIES_NAME_DELIMITER = args.species_delimiter

    # load a phylomeDB Tree provided as a newick file in the command line
    t = PhyloTree(newick, sp_naming_function=extract_spname)

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    if not args.skip_ortholog_detection:
        # detect speciation and duplication events using the species overlap
        # algorithm used in phylomeDB
        t.get_descendant_evol_events()

    if args.ascii:
        print t.get_ascii(attributes=[args.evoltype_attr, "name"],
                          show_internal=True)

    if args.newick:
        print t.write(features=[args.evoltype_attr], format_root_node=True)

    if args.show:
        t.show()

    export_as_orthoXML(t, args.database, args.evoltype_attr)
示例#11
0
def main(argv):
    """Load a newick tree from the command line and hand it to export_as_orthoXML.

    The positional ``tree`` argument is loaded as a PhyloTree, optionally
    re-rooted (--root), annotated through get_descendant_evol_events()
    unless --skip_ortholog_detection is given, optionally displayed
    (--ascii, --newick, --show) and finally passed to export_as_orthoXML()
    together with the --database and --evoltype_attr values.

    argv is accepted but not consulted: argparse reads sys.argv[1:] itself.
    """
    # The two species-name settings are only assigned in this function and
    # never read here, so as plain locals the --sp_field / --sp_delimiter
    # options could not influence anything. They are published as module
    # globals instead.
    # NOTE(review): presumably extract_spname (defined elsewhere in this
    # module) reads these two names -- confirm against its definition.
    global SPECIES_NAME_POS, SPECIES_NAME_DELIMITER

    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                        help='A tree file (or text string) in newick format.')

    parser.add_argument("--sp_delimiter", dest="species_delimiter",
                        type=str, default="_",
                        help=("When species names are guessed from node names,"
                              " this argument specifies how to split node name to guess"
                              " the species code"))

    parser.add_argument("--sp_field", dest="species_field",
                        type=int, default=1,
                        help=("When species names are guessed from node names,"
                              " this argument specifies the position of the species"
                              " name code relative to the name splitting delimiter"))

    parser.add_argument("--root", dest="root",
                        type=str, nargs="*",
                        help="Roots the tree to the node grouping the list"
                        " of node names provided (space separated). In example:"
                        "'--root human rat mouse'")

    parser.add_argument("--skip_ortholog_detection", dest="skip_ortholog_detection",
                        action="store_true",
                        help=("Skip automatic detection of"
                              " speciation and duplication events, thus relying in the"
                              " correct annotation of the provided tree using"
                              " the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"))

    parser.add_argument("--evoltype_attr", dest="evoltype_attr",
                        type=str, default="evoltype",
                        help=("When orthology detection is disabled,"
                              " the attribute name provided here will be expected to exist"
                              " in all internal nodes and read from the extended newick format"))

    parser.add_argument("--database", dest="database",
                        type=str, default="",
                        help=("Database name"))

    # Boolean switches: argparse already defaults store_true options to
    # False, so no explicit (string) default is supplied.
    parser.add_argument("--show", dest="show",
                        action="store_true",
                        help=("Show the tree and its evolutionary events before orthoXML export"))

    parser.add_argument("--ascii", dest="ascii",
                        action="store_true",
                        help=("Show the tree using ASCII representation and all its evolutionary"
                              " events before orthoXML export"))

    parser.add_argument("--newick", dest="newick",
                        action="store_true",
                        help=("print the extended newick format for provided tree using"
                              " ASCII representation and all its evolutionary events"
                              " before orthoXML export"))

    # NOTE(review): argv is not forwarded, so the options always come from
    # sys.argv[1:]. Left unchanged because it is not visible here whether
    # callers pass sys.argv or sys.argv[1:].
    args = parser.parse_args()
    newick = args.tree[0]  # nargs=1 yields a one-element list

    SPECIES_NAME_POS = args.species_field
    SPECIES_NAME_DELIMITER = args.species_delimiter

    # load a phylomeDB Tree provided as a newick file in the command line
    t = PhyloTree(newick, sp_naming_function=extract_spname)

    if args.root:
        if len(args.root) > 1:
            # several names: root on the node grouping all of them
            outgroup = t.get_common_ancestor(args.root)
        else:
            # single name: use the node selected by "tree & name"
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    if not args.skip_ortholog_detection:
        # detect speciation and duplication events using the species overlap
        # algorithm used in phylomeDB
        t.get_descendant_evol_events()

    if args.ascii:
        print(t.get_ascii(attributes=[args.evoltype_attr, "name"], show_internal=True))

    if args.newick:
        print(t.write(features=[args.evoltype_attr], format_root_node=True))

    if args.show:
        t.show()

    export_as_orthoXML(t, args.database, args.evoltype_attr)
示例#12
0
文件: ete_dist.py 项目: daisieh/ete
def main(argv):
    """Compare target trees against a reference tree and print one report row per target.

    argv is the list of command-line arguments handed to argparse. Target
    trees come either from the positional arguments or from --targets_file
    (giving both terminates the program with exit status 1). For every
    target a row with the columns listed in HEADER is written, either
    tab-delimited to the -o file or as a fixed-width table on stdout.
    Columns that could not be computed for a target keep the value -1.
    """
    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                            formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees", metavar='target_trees', type=str, nargs="*",
                   help='a list of target tree files')

    parser.add_argument("--targets_file", dest="targets_file",
                        type=str,
                        help="""path to a file containing target trees, one per line""")

    parser.add_argument("-o", dest="output",
                        type=str,
                        help="""Path to the tab delimited report file""")

    parser.add_argument("-r", dest="reftree",
                        type=str, required=True,
                        help="""Reference tree""")

    parser.add_argument("--outgroup", dest="outgroup",
                        nargs = "+",
                        help="""outgroup used to root reference and target trees before distance computation""")

    parser.add_argument("--expand_polytomies", dest="polytomies",
                        action = "store_true",
                        help="""expand politomies if necessary""")

    parser.add_argument("--unrooted", dest="unrooted",
                        action = "store_true",
                        help="""compare trees as unrooted""")

    parser.add_argument("--min_support", dest="min_support",
                        type=float, default=0.0,
                        help=("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"))

    parser.add_argument("--extract_species", dest="extract_species",
                        action = "store_true",
                        help="""When used, reference tree is assumed to contain species names, while target trees as expected to be gene trees. Species name will be extracted from gene tree nodes and treeko will be used if duplication events are found.""")

    parser.add_argument("--spname_delimiter", dest="spname_delimiter",
                        type=str, default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument("--spname_field", dest="spname_field",
                        type=int, default=-1,
                        help=("position of the species code extracted from node names. -1 = last field"))

    parser.add_argument("--collateral", dest="collateral",
                        action='store_true',
                        help=(""))

    parser.add_argument("--ref_attr", dest="ref_attr",
                        type=str,
                        help=("attribute in ref tree used as leaf name"))

    parser.add_argument("--target_attr", dest="target_attr",
                        type=str,
                        help=("attribute in target tree used as leaf name"))

    args = parser.parse_args(argv)
    print(__DESCRIPTION__)
    reftree = args.reftree
    if args.targets_file and args.target_trees:
        sys.stderr.write('The use of targets_file and targets at the same time is not supported.\n')
        sys.exit(1)

    if args.targets_file:
        target_trees = tree_iterator(args.targets_file)
    else:
        target_trees = args.target_trees

    t = Tree(reftree)

    if args.ref_attr:
        # Rename reference leaves after the requested attribute, keeping the
        # original name in _origname.
        for lf in t.iter_leaves():
            lf._origname = lf.name
            if args.ref_attr not in lf.features:
                # show the offending leaf before getattr() fails below
                print(lf)
            lf.name = getattr(lf, args.ref_attr)

    if args.outgroup:
        if len(args.outgroup) > 1:
            out = t.get_common_ancestor(args.outgroup)
        else:
            out = t.search_nodes(name=args.outgroup[0])[0]
        t.set_outgroup(out)

    reftree_len = len(t)
    reftree_edges = (reftree_len*2)-2

    HEADER = ("target tree", 'dups', 'subtrees', 'used trees', 'treeko', "RF", "maxRF", 'normRF', "%reftree", "%genetree", "avgSize", "minSize", "common tips", "refSize", "targetSize")
    if args.output:
        OUT = open(args.output, "w")
        OUT.write('# ' + ctime() + '\n')
        OUT.write('# ' + ' '.join(sys.argv) + '\n')
        OUT.write('#' + '\t'.join(HEADER) + '\n')
    else:
        print('# ' + ctime())
        print('# ' + ' '.join(sys.argv))
        COL_WIDTHS = [20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    # Species code of a gene name, as configured on the command line.
    get_species = lambda name: name.split(args.spname_delimiter)[args.spname_field]

    for counter, tfile in enumerate(target_trees):
        if args.targets_file:
            # items produced by tree_iterator are unpacked as (seed id, tree)
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:
            tt = PhyloTree(tfile, sp_naming_function=get_species)
        else:
            tt = Tree(tfile)

        if args.target_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.target_attr)

        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)

        if args.target_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' %counter

        # Per-target defaults: -1 marks a column that was not computed.
        max_size, min_size, avg_size, common = -1, -1, -1, -1
        total_rf, max_rf, norm_rf = -1, -1, -1
        treeko_d = -1
        ref_branches_in_target, target_branches_in_ref = -1, -1
        target_tree_len = -1
        used_subtrees = -1
        # Must be reset for every target: when the subtree analysis below is
        # skipped (ntrees >= 1000) nothing else assigns it, which used to
        # raise NameError on the first target or silently reuse the value of
        # the previous one.
        common_names = -1
        if args.extract_species:
            ntrees, ndups, sp_trees = tt.get_speciation_trees(autodetect_duplications=True, newick_only=True)

            # targets yielding 1000 or more subtrees are reported with
            # sentinel values only
            if ntrees < 1000:
                all_rf = []
                ref_found = []
                target_found = []
                tree_sizes = []
                all_max_rf = []
                common_names = 0

                for subtree_nw in sp_trees:
                    # unless --collateral is given, only subtrees whose
                    # newick text contains the seed id are used
                    if seedid and not args.collateral and (seedid not in subtree_nw):
                        continue
                    subtree = PhyloTree(subtree_nw, sp_naming_function=get_species)

                    # only necessary if rf function is going to filter by support value. It slows downs the analysis, obviously
                    if args.min_support:
                        subtree_content = subtree.get_cached_content(store_attr='name')
                        for n in subtree.traverse():
                            if n.children:
                                n.support = tt.get_common_ancestor(subtree_content[n]).support

                    rf, maxr, common, p1, p2, d1, d2 = t.robinson_foulds(subtree, expand_polytomies=args.polytomies, unrooted_trees=args.unrooted,
                                                                         attr_t2='species', min_support_t2=args.min_support)
                    if maxr > 0 and p1 and p2:
                        all_rf.append(rf)
                        tree_sizes.append(len(common))
                        all_max_rf.append(maxr)
                        common_names = max(common_names, len(common))

                        ref_found.append(float(len(p2 & p1)) / reftree_edges)
                        p2bis = set([p for p in (p2-d2) if len(p[0])>1 and len(p[1])>1]) # valid edges in target not leaves
                        if p2bis:
                            incompatible_target_branches = float(len((p2-d2) - p1))
                            target_found.append(1 - (incompatible_target_branches / (len(p2-d2))))

                if all_rf:
                    # Treeko speciation distance: normalized RF of every
                    # subtree, weighted by the subtree size
                    alld = [(all_rf[i]/float(all_max_rf[i])) for i in xrange(len(all_rf))]
                    a = numpy.sum([alld[i] * tree_sizes[i] for i in xrange(len(all_rf))])
                    b = float(numpy.sum(tree_sizes))
                    treeko_d  = a/b
                    total_rf = numpy.mean(all_rf)
                    norm_rf = numpy.mean(alld)
                    max_rf = numpy.max(all_max_rf)
                    ref_branches_in_target = numpy.mean(ref_found)
                    target_branches_in_ref = numpy.mean(target_found) if target_found else -1
                    target_tree_len = numpy.mean(tree_sizes)
                    used_subtrees = len(all_rf)
        else:
            # plain tree-vs-tree comparison, no species extraction
            target_tree_len = len(tt)
            ndups, ntrees, used_subtrees = 0, 1, 1
            treeko_d = -1
            total_rf, max_rf, common, p1, p2, d1, d2 = tt.robinson_foulds(t, expand_polytomies=args.polytomies, unrooted_trees=args.unrooted)
            common_names = len(common)
            if max_rf:
                norm_rf = total_rf / float(max_rf)
            if p1 and p2:
                # sizes of the entries found in only one of the two sets
                sizes = [len(p) for p in p2 ^ p1]
                if sizes:
                    avg_size = sum(sizes) / float(len(sizes))
                    max_size, min_size = max(sizes), min(sizes)
                else:
                    max_size, min_size, avg_size = 0, 0, 0

                ref_branches_in_target = float(len(p2 & p1)) / reftree_edges
            else:
                ref_branches_in_target = 0.0
                target_branches_in_ref = 0.0
                max_size, min_size, avg_size = -1, -1, -1

        if args.output:
            OUT.write('\t'.join(map(str, (fname, ndups, ntrees, used_subtrees, treeko_d, total_rf, max_rf, norm_rf, ref_branches_in_target, target_branches_in_ref,
                                          avg_size, min_size, common_names, reftree_len, target_tree_len))) + '\n')
        else:
            print_table([map(istr, (fname[-30:], ndups, ntrees, used_subtrees, treeko_d, total_rf, max_rf, norm_rf, '%0.4f' %ref_branches_in_target, '%0.4f' %target_branches_in_ref,
                 avg_size, min_size, common_names, reftree_len, target_tree_len))], fix_col_width = COL_WIDTHS, wrap_style='cut')

    if args.output:
        OUT.close()
示例#13
0
def main(argv):
    """Load a newick tree and show it, render it to an image or print it as text.

    argv is the list of command-line arguments handed to argparse. After
    the optional edits (NCBI annotation, alignment linking, sorting,
    rooting) the tree is printed as ASCII (--text), rendered to the file
    given with -i/--image, or opened with t.show().

    Raises ValueError when --ladderize and --sort_branches are combined, or
    when --taxid_attr_regexp does not match the attribute value of a leaf.
    """
    # The compiled pattern is only assigned in this function and never read
    # here; as a plain local it could not be seen by anything else.
    # NOTE(review): presumably user_species_naming_function (defined
    # elsewhere in this module) reads this name -- confirm.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    input_gr = parser.add_argument_group("TREE INPUT OPTIONS\n=================")

    input_gr.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                      help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml", dest="raxml",
                        action="store_true",
                        help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m", "--mode", dest="mode",
                        choices=["c", "r"], default="r",
                        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument("-i", "--image", dest="image",
                        type=str,
                        help="Render tree image instead of showing it. A filename "
                        " should be provided. PDF, SVG and PNG file extensions are"
                        " supported (i.e. -i tree.svg)"
                        )

    img_gr.add_argument("--text", dest="text_mode",
                        action="store_true",
                        help="Shows the tree using ASCII characters")

    img_gr.add_argument("--attr", "--show_attributes", dest="show_attributes",
                        nargs="+",
                        help="Display the value of the specified attributes, if available")

    img_gr.add_argument("--Iw", "--width", dest="width",
                        type=int, default=0,
                        help="width of the rendered image in pixels (see --size_units)."
                        )

    img_gr.add_argument("--Ih", "--height", dest="height",
                        type=int, default=0,
                        help="height of the rendered image in pixels (see --size_units)."
                        )

    img_gr.add_argument("--Ir", "--resolution", dest="resolution",
                        type=int, default=300,
                        help="Resolution of the tree image (DPI)"
                        )

    img_gr.add_argument("--Iu", "--size_units", dest="size_units",
                        choices=["px", "mm", "in"], default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). "
                        )

    img_gr.add_argument("-mbs", "--min_branch_separation", dest="branch_separation",
                        type=int, default = 3,
                        help="Min number of pixels to separate branches vertically."
                        )

    img_gr.add_argument("--ss", "--show_support", dest="show_support",
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl", "--show_branch_length", dest="show_branch_length",
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument("--ft", "--force_topology", dest="force_topology",
                        action="store_true",
                        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln", "--hide_leaf_names", dest="hide_leaf_names",
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument("--sin", "--show_internal_names", dest="show_internal_names",
                        action="store_true",
                        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument("-r", "--root", dest="root",
                         type=str, nargs="*",
                         help="Roots the tree to the node grouping the list"
                         " of node names provided (space separated). In example:"
                         "'--root human rat mouse'")

    edit_gr.add_argument("-s", "--sort_branches", dest="sort",
                        action="store_true",
                        help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l", "--ladderize", dest="ladderize",
                        action="store_true",
                        help="""Sort branches by partition size.""")

    edit_gr.add_argument("--color_by_rank", dest="color_by_rank",
                         type=str, nargs="+",
                         help="""If the attribute rank is present in nodes """)

    edit_gr.add_argument("--ncbi", dest="ncbi",
                         action="store_true",
                         help=""" Annotate tree using the NCBI taxonomy database""")

    edit_gr.add_argument("--taxid_attr", dest="taxid_attr",
                         type=str, default="name",
                         help="node attribute encoding for valid taxid numbers.")

    edit_gr.add_argument("--taxid_attr_regexp", dest="taxid_attr_regexp",
                         type=str,
                         help="If taxid number is encoded as part of another text string, i.e. gene name, use this argument to define a Perl regular expression to extract taxid numbers.")

    phylo_gr = parser.add_argument_group("PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument("--alg", dest="alg",
                        type=str,
                        help="""Link tree to a multiple sequence alignment.""")

    phylo_gr.add_argument("--alg_format", dest="alg_format",
                        type=str, default="fasta",
                        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument("--sp_discovery", dest="species_discovery_regexp",
                          type=str, default="^[^_]+_(.+)",
                          help="Perl regular expression used to capture species"
                          " code from node names. By default, node names"
                          " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    args = parser.parse_args(argv)

    tfile = args.tree[0]  # nargs=1 yields a one-element list

    if args.ladderize and args.sort:
        raise ValueError("--sort_branches and --ladderize options are mutually exclusive")

    if args.raxml:
        # Rewrite every ":<length>[<number>]" as ":<length>[&&NHX:support=<number>]"
        # before parsing. The file is read inside a with-block so the handle
        # is released immediately.
        with open(tfile) as tree_fh:
            raw_newick = tree_fh.read()
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]", raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.ncbi:
        if args.taxid_attr_regexp:
            TAXIDMATCHER = re.compile(args.taxid_attr_regexp)

        # Store a taxid on every node yielded by iterating the tree, then
        # let annotate_ncbi_taxa() use that attribute.
        for lf in t:
            if args.taxid_attr_regexp:
                attr_value = getattr(lf, args.taxid_attr)
                taxid_match = TAXIDMATCHER.search(attr_value)
                if taxid_match is None:
                    # fail with an explicit message instead of an
                    # AttributeError on None
                    raise ValueError("--taxid_attr_regexp %r does not match %r"
                                     % (args.taxid_attr_regexp, attr_value))
                lf.taxid = taxid_match.groups()[0]
            else:
                lf.taxid = getattr(lf, args.taxid_attr)
        t.annotate_ncbi_taxa(taxid_attr="taxid")

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # VISUALIZATION

    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names and "name" in LEAF_ATTRIBUTES:
        # guarded so that a second call of main() in the same process does
        # not fail with KeyError once the entry is gone
        del LEAF_ATTRIBUTES["name"]

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree: a size of 0 (the option default) is passed on as None
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    if args.text_mode:
        print(t.get_ascii(show_internal=args.show_internal_names, attributes = args.show_attributes))
    else:
        ts.layout_fn = master_layout
        if args.image:
            t.render(args.image, tree_style=ts, w=args.width, h=args.height, units=args.size_units)
        else:
            t.show(None, tree_style=ts)
示例#14
0
def main(argv):
    """Map a collection of source gene trees onto a reference tree and dump the results.

    argv is the list of command-line arguments handed to argparse. With
    --plot_newick only an image of that tree is rendered and the program
    exits. Otherwise the trees listed in --source_trees are processed by
    process_trees(), either in this process or split by line ranges over
    --cpu worker processes whose partial results are merged here. The
    collected dictionaries are passed to dump_results() and the reference
    tree is pickled to "<output>.pkl".
    """
    global args
    # Most of the upper-case settings below are only assigned in this
    # function and never read here, and REPORT_SUPPORT_FILE is opened and
    # closed here without any local writer. As plain locals they could not
    # be seen by anything else, so they are published as module globals.
    # NOTE(review): presumably process_trees / dump_results / extract_species
    # (defined elsewhere in this module) read these names -- confirm.
    global SPNAME_FIELD, SPNAME_DELIMITER, USE_COLLATERAL, DETECT_DUPLICATIONS
    global REPORT_PER_TREE_SUPPORTS, REPORT_SUPPORT_FILE, SP_OVERLAP, DEBUG
    global IMG_REPORT, REFTREE_SPECIES, TOTAL_TREES, IS_VALID_TREEID, MONITOR_STEP

    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                            formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-r", dest="reftree",
                        type=str, required=True,
                        help="""Reference tree""")

    parser.add_argument("--source_trees", dest="source_trees",
                        type=str, required = True,
                        help=("A list of *rooted* genetrees, one per line, in the format: TreeID/SeedID [TAB] newick "))

    parser.add_argument("--plot_newick", dest="plot_newick",
                        type=str,
                        help=(""))

    parser.add_argument("--spname_delimiter", dest="spname_delimiter",
                        type=str, default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument("--spname_field", dest="spname_field",
                        type=int, default=-1,
                        help=("position of the species code extracted from node names. -1 = last field"))

    parser.add_argument("--collateral", dest="use_collateral",
                        action="store_true",
                        help=("If enabled, collateral information will be used as"
                              " equally qualified data. Otherwise, such data will"
                              " be reported separatedly. Use this if your set of"
                              " trees are not overlaping. "))

    parser.add_argument("--skip_dup_detection", dest="skip_dup_detection",
                        action="store_true",
                        help=('If used, duplications will be expected to be annotated'
                              ' in the source gene trees with the evoltype="D" tag.'
                              ' Otherwise they will be inferred on the fly using'
                              ' the species overlap algorithm.'))

    parser.add_argument("--spoverlap", dest="species_overlap",
                        type=float, default=0.0,
                        help=("Species overlap cutoff. A number between 0 and 1 "
                        "representing the percentage of species that should be "
                        "shared between two sister partitions to be considered a"
                        " duplication. 0 = any overlap represents a duplication. "))

    parser.add_argument("--debug", dest="debug",
                        action="store_true",
                        help=("generate an image of every input gene tree tree, so the result can be inspected"))

    parser.add_argument("--snapshot_step", dest="snapshot_step",
                        type=int, default=1000,
                        help=("How many trees should be processed between snapshots dumps?"))

    parser.add_argument("--reftree_constraint", dest="reftree_constraint",
                        type=str,
                        help=("A python module from from which a function called "
                              "*is_valid_treeid(treeid, refbranch)* should be importable. "
                              "The function will be used to decide if the info of a given "
                              "source tree is informative or not for each reftree branch. "))

    parser.add_argument("-o", dest="output",
                        type=str, required=True,
                        help=("output tag name (extensions will be added)"))

    parser.add_argument("--cpu", dest="cpu",
                        type=int, default=1,
                        help=("enable parallel computation"))

    parser.add_argument("--img_report", dest="img_report",
                        action="store_true",
                        help=("If true, it generates a summary image results with all the computed data"))

    parser.add_argument("--report_supports", dest="report_supports",
                        action="store_true",
                        help=("If used, supported ref tree branches are individually reported for each gene tree "))

    args = parser.parse_args(argv)
    if args.plot_newick:
        # plotting mode: render the given tree and stop
        t = Tree(args.plot_newick)
        ts = TreeStyle()
        ts.layout_fn = info_layout
        t.render("tree_analysis.png", tree_style=ts)
        sys.exit(0)

    SPNAME_FIELD, SPNAME_DELIMITER = args.spname_field, args.spname_delimiter
    USE_COLLATERAL = args.use_collateral
    DETECT_DUPLICATIONS = not args.skip_dup_detection
    REPORT_PER_TREE_SUPPORTS = bool(args.report_supports)
    SP_OVERLAP = args.species_overlap
    DEBUG = args.debug
    IMG_REPORT = args.img_report
    reftree = PhyloTree(args.reftree, sp_naming_function=None)
    # tag every reference node with a running index in traversal order
    for nid, n in enumerate(reftree.traverse()):
        n.add_features(nid = nid)
    REFTREE_SPECIES = set(reftree.get_leaf_names())
    print(__DESCRIPTION__)

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE = open("%s.gentree_supports" %args.output, "w")
        REPORT_SUPPORT_FILE.write('#'+'\t'.join(map(str, ["treeId", "spCoverage", "mean_support",  "mean_coll_support", "tested_branches", 'tested_coll_branches'])) + '\n')

    # Number of newline-terminated lines plus one -- the same figure the
    # former `wc -l` shell call produced, but without building a shell
    # command from the file name (which broke on paths containing spaces or
    # shell metacharacters).
    with open(args.source_trees) as source_fh:
        TOTAL_TREES = sum(1 for line in source_fh if line.endswith("\n")) + 1
    sys.stderr.write("Processing %d source trees\n" %TOTAL_TREES)
    if args.reftree_constraint:
        import imp
        constraint = imp.load_source('constraint', args.reftree_constraint)
        IS_VALID_TREEID = constraint.is_valid_treeid
    else:
        IS_VALID_TREEID = None

    if args.cpu > 1:
        MONITOR_STEP = 0
        #return (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
        #       coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports)
        # The output of the process_trees function are 9 dictionaries in which keys are refbranches
        target_dicts = [{} for x in range(9)]
        def merge_dict_results(target, source):
            """Merge the sequence of dicts `source` into `target`, position by position."""
            def merge_dict(target, source):
                # new keys are copied; lists, sets and ints are accumulated
                for k, v in source.iteritems():
                    if k not in target:
                        target[k] = v
                    elif isinstance(v, list):
                        target[k].extend(v)
                    elif isinstance(v, set):
                        target[k].update(v)
                    elif isinstance(v, int):
                        target[k] += v
                    else:
                        raise ValueError("Impossible to merge str results")
            for index in xrange(len(target)):
                # read from the `source` argument; the previous code reached
                # for the enclosing `out` variable and only worked because
                # the caller happened to pass that same object
                merge_dict(target[index], source[index])

        from multiprocessing import Process, Queue
        from Queue import Empty as QueueEmpty
        outputs_queue = Queue()
        if TOTAL_TREES > args.cpu:
            # ceil(TOTAL_TREES / cpu) lines per worker
            trees_per_cpu = TOTAL_TREES // args.cpu
            trees_per_cpu += 1 if TOTAL_TREES % args.cpu else 0
        else:
            # fewer lines than workers: one line per worker
            trees_per_cpu = 1
            args.cpu = TOTAL_TREES

        all_workers = set()
        for cpu_num in xrange(args.cpu):
            # each worker gets the half-open line range [sline, eline)
            sline = (cpu_num*trees_per_cpu)
            eline = (cpu_num*trees_per_cpu) + trees_per_cpu
            data_iter = tree_iterator(args.source_trees,
                                      restrict_species=REFTREE_SPECIES,
                                      start_line=sline,
                                      end_line=eline)
            sys.stderr.write("Launching worker %d from %d to %d\n" %(cpu_num, sline, eline))
            worker = Process(target=run_parallel,
                             args=(cpu_num, outputs_queue, process_trees, data_iter, reftree, trees_per_cpu))
            worker.name = "Worker_%d" %cpu_num
            all_workers.add(worker)
            worker.start()

        while all_workers:
            # clear done threads
            for w in list(all_workers):
                if not w.is_alive():
                    sys.stderr.write("%s thread is done!\n" %w.name)
                    all_workers.discard(w)
            # get and merge results
            while 1:
                try:
                    out = outputs_queue.get(False)
                except QueueEmpty:
                    break
                else:
                    # This merge depends on process_trees return output!!!!!
                    merge_dict_results(target_dicts, out)
                    # Dump a snapshot
                    dump_results(reftree, *target_dicts)
                time.sleep(0.1)
            if all_workers:
                time.sleep(1)
        # collected data
        (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
         coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch,
         coll_refbranch_supports) = target_dicts
    else:
        MONITOR_STEP = args.snapshot_step
        data_iter = tree_iterator(args.source_trees, restrict_species=REFTREE_SPECIES)

        (informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
         coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch,
         coll_refbranch_supports) = process_trees(data_iter, reftree, TOTAL_TREES)

    if REPORT_PER_TREE_SUPPORTS:
        REPORT_SUPPORT_FILE.close()

    dump_results(reftree, informed_branches, dup_per_branch, losses_per_branch, losses_per_dup_branch, refbranch_supports,
                 coll_dup_per_branch, coll_losses_per_branch, coll_losses_per_dup_branch, coll_refbranch_supports)

    sys.stderr.write("Dumping full analysis...\n")
    # Full dump, including duplication details; the with-block guarantees
    # the pickle is flushed and the handle closed
    with open("%s.pkl"%args.output, "w") as pickle_fh:
        cPickle.dump(reftree, pickle_fh)
示例#15
0
def main(argv):

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("target_trees",
                        metavar='target_trees',
                        type=str,
                        nargs="*",
                        help='a list of target tree files')

    parser.add_argument(
        "--targets_file",
        dest="targets_file",
        type=str,
        help="""path to a file containing target trees, one per line""")

    parser.add_argument("-o",
                        dest="output",
                        type=str,
                        help="""Path to the tab delimited report file""")

    parser.add_argument("-r",
                        dest="reftree",
                        type=str,
                        required=True,
                        help="""Reference tree""")

    parser.add_argument(
        "--outgroup",
        dest="outgroup",
        nargs="+",
        help=
        """outgroup used to root reference and target trees before distance computation"""
    )

    parser.add_argument("--expand_polytomies",
                        dest="polytomies",
                        action="store_true",
                        help="""expand politomies if necessary""")

    parser.add_argument("--unrooted",
                        dest="unrooted",
                        action="store_true",
                        help="""compare trees as unrooted""")

    parser.add_argument(
        "--min_support",
        dest="min_support",
        type=float,
        default=0.0,
        help=
        ("min support value for branches to be counted in the distance computation (RF, treeko and refTree/targeGene compatibility)"
         ))

    parser.add_argument(
        "--extract_species",
        dest="extract_species",
        action="store_true",
        help=
        """When used, reference tree is assumed to contain species names, while target trees as expected to be gene trees. Species name will be extracted from gene tree nodes and treeko will be used if duplication events are found."""
    )

    parser.add_argument("--spname_delimiter",
                        dest="spname_delimiter",
                        type=str,
                        default="_",
                        help=("species code delimiter in node names"))

    parser.add_argument(
        "--spname_field",
        dest="spname_field",
        type=int,
        default=-1,
        help=
        ("position of the species code extracted from node names. -1 = last field"
         ))

    parser.add_argument("--collateral",
                        dest="collateral",
                        action='store_true',
                        help=(""))

    parser.add_argument("--ref_attr",
                        dest="ref_attr",
                        type=str,
                        help=("attribute in ref tree used as leaf name"))

    parser.add_argument("--target_attr",
                        dest="target_attr",
                        type=str,
                        help=("attribute in target tree used as leaf name"))

    args = parser.parse_args(argv)
    print __DESCRIPTION__
    reftree = args.reftree
    if args.targets_file and args.target_trees:
        print >> sys.stderr, 'The use of targets_file and targets at the same time is not supported.'
        sys.exit(1)

    if args.targets_file:
        target_trees = tree_iterator(args.targets_file)
    else:
        target_trees = args.target_trees

    t = Tree(reftree)

    if args.ref_attr:
        for lf in t.iter_leaves():
            lf._origname = lf.name
            if args.ref_attr not in lf.features:
                print lf
            lf.name = getattr(lf, args.ref_attr)

    if args.outgroup:
        if len(args.outgroup) > 1:
            out = t.get_common_ancestor(args.outgroup)
        else:
            out = t.search_nodes(name=args.outgroup[0])[0]
        t.set_outgroup(out)

    ref_names = set(t.get_leaf_names())
    reftree_len = len(t)
    reftree_edges = (reftree_len * 2) - 2
    ncollapsed_branches = len([
        n for n in t.traverse() if n.children and n.support < args.min_support
    ])
    #reftree_edges -= ncollapsed_branches
    #if ncollapsed_branches:
    #    print '%d branches collapsed in reference tree' %ncollapsed_branches

    HEADER = ("target tree", 'dups', 'subtrees', 'used trees', 'treeko', "RF",
              "maxRF", 'normRF', "%reftree", "%genetree", "avgSize", "minSize",
              "common tips", "refSize", "targetSize")
    if args.output:
        OUT = open(args.output, "w")
        print >> OUT, '# ' + ctime()
        print >> OUT, '# ' + ' '.join(sys.argv)
        print >> OUT, '#' + '\t'.join(HEADER)
    else:
        print '# ' + ctime()
        print '# ' + ' '.join(sys.argv)
        COL_WIDTHS = [20, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
        print_table([HEADER], fix_col_width=COL_WIDTHS, wrap_style='wrap')

    prev_tree = None

    for counter, tfile in enumerate(target_trees):
        if args.targets_file:
            seedid, tfile = tfile
        else:
            seedid = None

        if args.extract_species:
            tt = PhyloTree(tfile,
                           sp_naming_function=lambda name: name.split(
                               args.spname_delimiter)[args.spname_field])
        else:
            tt = Tree(tfile)

        if args.target_attr:
            for lf in tt.iter_leaves():
                lf._origname = lf.name
                lf.name = getattr(lf, args.target_attr)

        if args.outgroup:
            if len(args.outgroup) > 1:
                out = tt.get_common_ancestor(args.outgroup)
            else:
                out = tt.search_nodes(name=args.outgroup[0])[0]
            tt.set_outgroup(out)

        if args.target_trees:
            fname = os.path.basename(tfile)
        else:
            fname = '%05d' % counter

        max_size, min_size, avg_size, common = -1, -1, -1, -1
        total_rf, max_rf, norm_rf = -1, -1, -1
        treeko_d = -1
        ref_branches_in_target, target_branches_in_ref = -1, -1
        target_tree_len = -1
        used_subtrees = -1
        if args.extract_species:
            orig_target_size = len(tt)
            ntrees, ndups, sp_trees = tt.get_speciation_trees(
                autodetect_duplications=True, newick_only=True)

            if ntrees < 1000:
                all_rf = []
                ref_found = []
                target_found = []
                tree_sizes = []
                all_max_rf = []
                common_names = 0

                for subtree_nw in sp_trees:
                    if seedid and not args.collateral and (seedid
                                                           not in subtree_nw):
                        continue
                    subtree = PhyloTree(
                        subtree_nw,
                        sp_naming_function=lambda name: name.split(
                            args.spname_delimiter)[args.spname_field])

                    # only necessary if rf function is going to filter by support value. It slows downs the analysis, obviously
                    if args.min_support:
                        subtree_content = subtree.get_cached_content(
                            store_attr='name')
                        for n in subtree.traverse():
                            if n.children:
                                n.support = tt.get_common_ancestor(
                                    subtree_content[n]).support

                    rf, maxr, common, p1, p2, d1, d2 = t.robinson_foulds(
                        subtree,
                        expand_polytomies=args.polytomies,
                        unrooted_trees=args.unrooted,
                        attr_t2='species',
                        min_support_t2=args.min_support)
                    if maxr > 0 and p1 and p2:
                        all_rf.append(rf)
                        tree_sizes.append(len(common))
                        all_max_rf.append(maxr)
                        common_names = max(common_names, len(common))

                        ref_found.append(float(len(p2 & p1)) / reftree_edges)
                        p2bis = set([
                            p for p in (p2 - d2)
                            if len(p[0]) > 1 and len(p[1]) > 1
                        ])  # valid edges in target not leaves
                        if p2bis:
                            incompatible_target_branches = float(
                                len((p2 - d2) - p1))
                            target_found.append(1 -
                                                (incompatible_target_branches /
                                                 (len(p2 - d2))))

                        # valid_target = p2-d2
                        # valid_ref = p1-d1
                        # ref_found.append(float(len(valid_target & valid_ref)) / reftree_edges)

                        # p2bis = set([p for p in (p2-d2) if len(p[0])>1 and len(p[1])>1])
                        # if p2bis-d2:
                        #     incompatible_target_branches = float(len((p2-d2) - p1))
                        #     target_found.append(1 - (incompatible_target_branches / (len(p2-d2))))

                if all_rf:
                    # Treeko speciation distance
                    alld = [(all_rf[i] / float(all_max_rf[i]))
                            for i in xrange(len(all_rf))]
                    a = numpy.sum(
                        [alld[i] * tree_sizes[i] for i in xrange(len(all_rf))])
                    b = float(numpy.sum(tree_sizes))
                    treeko_d = a / b
                    total_rf = numpy.mean(all_rf)
                    norm_rf = numpy.mean([(all_rf[i] / float(all_max_rf[i]))
                                          for i in xrange(len(all_rf))])
                    max_rf = numpy.max(all_max_rf)
                    ref_branches_in_target = numpy.mean(ref_found)
                    target_branches_in_ref = numpy.mean(
                        target_found) if target_found else -1
                    target_tree_len = numpy.mean(tree_sizes)
                    used_subtrees = len(all_rf)
        else:
            target_tree_len = len(tt)
            ndups, ntrees, used_subtrees = 0, 1, 1
            treeko_d = -1
            total_rf, max_rf, common, p1, p2, d1, d2 = tt.robinson_foulds(
                t,
                expand_polytomies=args.polytomies,
                unrooted_trees=args.unrooted)
            common_names = len(common)
            if max_rf:
                norm_rf = total_rf / float(max_rf)
            if p1 and p2:
                sizes = [len(p) for p in p2 ^ p1]
                if sizes:
                    avg_size = sum(sizes) / float(len(sizes))
                    max_size, min_size = max(sizes), min(sizes)
                else:
                    max_size, min_size, avg_size = 0, 0, 0

                ref_branches_in_target = float(len(p2 & p1)) / reftree_edges
                #if p2-d2:
                #    incompatible_target_branches = float(len((p2-d2) - p1))
                #    target_found.append(1 - (incompatible_target_branches / (len(p2-d2))))
            else:
                ref_branches_in_target = 0.0
                target_branches_in_ref = 0.0
                max_size, min_size, avg_size = -1, -1, -1

        if args.output:
            print >> OUT, '\t'.join(
                map(str, (fname, ndups, ntrees, used_subtrees, treeko_d,
                          total_rf, max_rf, norm_rf, ref_branches_in_target,
                          target_branches_in_ref, avg_size, min_size,
                          common_names, reftree_len, target_tree_len)))
        else:
            print_table([
                map(istr,
                    (fname[-30:], ndups, ntrees, used_subtrees, treeko_d,
                     total_rf, max_rf, norm_rf,
                     '%0.4f' % ref_branches_in_target,
                     '%0.4f' % target_branches_in_ref, avg_size, min_size,
                     common_names, reftree_len, target_tree_len))
            ],
                        fix_col_width=COL_WIDTHS,
                        wrap_style='cut')

    if args.output:
        OUT.close()
示例#16
0
def main(argv):

    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",
                        dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    parser.add_argument("-t",
                        "--taxid",
                        dest="taxid",
                        nargs="+",
                        type=int,
                        help="""taxids (space separated)""")

    parser.add_argument(
        "-tf",
        "--taxid_file",
        dest="taxid_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-r",
                        "--reftree",
                        dest="reftree",
                        type=str,
                        help="""tree file containing taxids as node names.""")

    parser.add_argument("--reftree_attr",
                        dest="reftree_attr",
                        type=str,
                        default="name",
                        help="""Where taxid should be read from""")

    parser.add_argument("-n",
                        "--name",
                        dest="names",
                        nargs="+",
                        type=str,
                        help="""species or taxa names (comma separated)""")

    parser.add_argument(
        "-nf",
        "--names_file",
        dest="names_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-x",
                        "--taxonomy",
                        dest="taxonomy",
                        action="store_true",
                        help=("returns a pruned version of the NCBI taxonomy"
                              " tree containing target species"))

    parser.add_argument(
        "--show_tree",
        dest="show_tree",
        action="store_true",
        help="""shows the NCBI taxonomy tree of the provided species""")

    parser.add_argument("--collapse_subspecies",
                        dest="collapse_subspecies",
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    parser.add_argument("--rank_limit",
                        dest="rank_limit",
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))

    parser.add_argument("--full_lineage",
                        dest="full_lineage",
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))

    parser.add_argument("-i",
                        "--info",
                        dest="info",
                        action="store_true",
                        help="""shows NCBI information about the species""")

    parser.add_argument("--fuzzy",
                        dest="fuzzy",
                        type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    if not args.taxonomy and not args.info and not args.reftree:
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set([])
    all_taxids = []

    if args.names_file:
        all_names.update(
            map(strip,
                open(args.names_file, "rU").read().split("\n")))
    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))
    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            for name in not_found:
                # enable extension loading
                c.enable_load_extension(True)
                c.execute("select load_extension('%s')" % os.path.join(
                    module_path, "SQLite-Levenshtein/levenshtein.sqlext"))
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print "\t".join(
                map(str,
                    [score, name, realname.capitalize(), taxid]))

    if args.taxid_file:
        all_taxids.extend(
            map(strip,
                open(args.taxid_file, "rU").read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend(
            list(
                set([
                    getattr(n, args.reftree_attr)
                    for n in reftree.iter_leaves()
                ])))

    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" % len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print "\t".join(
                map(str, [
                    merge_conversion.get(int(taxid), taxid), name,
                    named_lineage, lineage
                ]))

        for notfound in set(map(str, all_taxids)) - set(
                str(k) for k in translator.iterkeys()):
            print >> sys.stderr, notfound, "NOT FOUND"

    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" % len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [
                n for n in t.traverse() if n.rank == "species"
                if int(n.taxid) in all_taxids
            ]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        print "\n\n  ===== Newick files saved as 'your_taxa_query.*' ===== "
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9,
                features=[
                    "taxid", "name", "rank", "bgcolor", "sci_name",
                    "collapse_subspecies", "named_lineage"
                ],
                outfile="your_ncbi_query.extended.nw")
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print reftree.write(features=["taxid", "sci_name", "ncbi_track"])
示例#17
0
def main(argv):
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections.
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().

    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")

    input_gr.add_argument(
        'tree',
        metavar='tree_file',
        type=str,
        nargs=1,
        help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml",
                          dest="raxml",
                          action="store_true",
                          help="""Process newick as raxml bootstrap values""")

    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m",
                        "--mode",
                        dest="mode",
                        choices=["c", "r"],
                        default="r",
                        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument(
        "-i",
        "--image",
        dest="image",
        type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")

    img_gr.add_argument(
        "--Iw",
        "--width",
        dest="width",
        type=int,
        default=0,
        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument(
        "--Ih",
        "--height",
        dest="height",
        type=int,
        default=0,
        help="height of the rendered image in pixels (see --size-units).")

    img_gr.add_argument("--Ir",
                        "--resolution",
                        dest="resolution",
                        type=int,
                        default=300,
                        help="Resolution if the tree image (DPI)")

    img_gr.add_argument("--Iu",
                        "--size-units",
                        dest="size_units",
                        choices=["px", "mm", "in"],
                        default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument(
        "-mbs",
        "--min-branch-separation",
        dest="branch_separation",
        type=int,
        default=3,
        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument("--ss",
                        "--show-support",
                        dest="show_support",
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl",
                        "--branch-length",
                        dest="show_branch_length",
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument(
        "--ft",
        "--force-topology",
        dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln",
                        "--hide-leaf-names",
                        dest="hide_leaf_names",
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument(
        "--sin",
        "--show-internal-names",
        dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument(
        "-r",
        "--root",
        dest="root",
        type=str,
        nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    edit_gr.add_argument("-s",
                         "--sort-branches",
                         dest="sort",
                         action="store_true",
                         help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l",
                         "--ladderize",
                         dest="ladderize",
                         action="store_true",
                         help="""Sort branches by partition size.""")

    edit_gr.add_argument("--color_by_rank",
                         dest="color_by_rank",
                         type=str,
                         nargs="+",
                         help="""If the attribute rank is present in nodes """)

    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument("--alg",
                          dest="alg",
                          type=str,
                          help="""Multiple sequence alignment.""")

    phylo_gr.add_argument(
        "--alg-format",
        dest="alg_format",
        type=str,
        default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument(
        "--sp-discovery",
        dest="species_discovery_regexp",
        type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    phylo_gr.add_argument(
        "--dump-subtrees",
        dest="subtrees_output_file",
        type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")

    phylo_gr.add_argument(
        "--newick",
        dest="newick",
        type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)

    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    if args.raxml:
        nw = re.sub(":(\d+\.\d+)\[(\d+)\]", ":\\1[&&NHX:support=\\2]",
                    open(tfile).read())
        t = PhyloTree(nw)
        #for n in t.traverse():
        #n.support = getattr(n, "bootstrap", -1)
        #
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO

    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        print >> sys.stderr, "Found %d duplication nodes. Dumping %d sutrees..." % (
            ndups, ntrees)
        OUT = open(args.subtrees_output_file, "w")
        for tree in treeiter:
            print >> OUT, tree.write()
        OUT.close()

    # VISUALIZATION

    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        del LEAF_ATTRIBUTES["name"]

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    # scale the tree
    if not args.height:
        args.height = None
    if not args.width:
        args.width = None

    ts.layout_fn = master_layout
    if args.image:
        t.render(args.image,
                 tree_style=ts,
                 w=args.width,
                 h=args.height,
                 units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        print "Processed Newick dumped into", args.newick
示例#18
0
def main(argv):
    parser = argparse.ArgumentParser(description=__DESCRIPTION__, 
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # name or flags - Either a name or a list of option strings, e.g. foo or -f, --foo.
    # action - The basic type of action to be taken when this argument is encountered at the command line. (store, store_const, store_true, store_false, append, append_const, version)
    # nargs - The number of command-line arguments that should be consumed. (N, ? (one or default), * (all 1 or more), + (more than 1) )
    # const - A constant value required by some action and nargs selections. 
    # default - The value produced if the argument is absent from the command line.
    # type - The type to which the command-line argument should be converted.
    # choices - A container of the allowable values for the argument.
    # required - Whether or not the command-line option may be omitted (optionals only).
    # help - A brief description of what the argument does.
    # metavar - A name for the argument in usage messages.
    # dest - The name of the attribute to be added to the object returned by parse_args().
    
    parser.add_argument("--show", dest="show_tree",
                        action="store_true", 
                        help="""Display tree after the analysis.""")
    
    parser.add_argument("--render", dest="render",
                        action="store_true", 
                        help="""Render tree.""")

    parser.add_argument("--dump", dest="dump",
                        action="store_true", 
                        help="""Dump analysis""")

    parser.add_argument("--explore", dest="explore",
                        type=str,
                        help="""Reads a previously analyzed tree and visualize it""")
    
    input_args = parser.add_mutually_exclusive_group()
    input_args.required=True
    input_args.add_argument("-t", "--tree", dest="target_tree",  nargs="+",
                        type=str,
                        help="""Tree file in newick format""")

    input_args.add_argument("-tf", dest="tree_list_file",
                        type=str, 
                        help="File with the list of tree files")
    
    parser.add_argument("--tax", dest="tax_info", type=str,
                        help="If the taxid attribute is not set in the"
                        " newick file for all leaf nodes, a tab file file"
                        " with the translation of name and taxid can be"
                        " provided with this option.")

    parser.add_argument("--sp_delimiter", dest="sp_delimiter", type=str,
                        help="If taxid is part of the leaf name, delimiter used to split the string")

    parser.add_argument("--sp_field", dest="sp_field", type=int, default=0,
                        help="field position for taxid after splitting leaf names")
    
    parser.add_argument("--ref", dest="ref_tree", type=str,
                        help="Uses ref tree to compute robinson foulds"
                        " distances of the different subtrees")

    parser.add_argument("--rf-only", dest="rf_only",
                        action = "store_true",
                        help="Skip ncbi consensus analysis")

    parser.add_argument("--outgroup", dest="outgroup",
                        type=str, nargs="+",
                        help="A list of node names defining the trees outgroup")
    
    parser.add_argument("--is_sptree", dest="is_sptree",
                        action = "store_true",
                        help="Assumes no duplication nodes in the tree")
    
    parser.add_argument("-o", dest="output", type=str,
                        help="Writes result into a file")

    parser.add_argument("--tax2name", dest="tax2name", type=str,
                        help="")
    
    parser.add_argument("--tax2track", dest="tax2track", type=str,
                        help="")
    
    parser.add_argument("--dump_tax_info", dest="dump_tax_info", action="store_true",
                        help="")
    
    args = parser.parse_args(argv)

    if args.sp_delimiter:
        GET_TAXID = lambda x: x.split(args.sp_delimiter)[args.sp_field]
    else:
        GET_TAXID = None
    
    reftree_name = os.path.basename(args.ref_tree) if args.ref_tree else ""
    if args.explore:
        print >>sys.stderr, "Reading tree from file:", args.explore
        t = cPickle.load(open(args.explore))
        ts = TreeStyle()
        ts.force_topology = True
        ts.show_leaf_name = False
        ts.layout_fn = ncbi_layout 
        ts.mode = "r"
        t.show(tree_style=ts)
        print >>sys.stderr, "dumping color config"
        cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))
        sys.exit()
    
    if args.output:
        OUT = open(args.output, "w")
    else:
        OUT = sys.stdout

    print >>sys.stderr, "Dumping results into", OUT
    target_trees = []
    if args.tree_list_file:
        target_trees = [line.strip() for line in open(args.tree_list_file)]
    if args.target_tree:
        target_trees += args.target_tree
    prev_tree = None
    if args.tax2name:
        tax2name = cPickle.load(open(args.tax2name))
    else:
        tax2name = {}

    if args.tax2track:
        tax2track = cPickle.load(open(args.tax2track))
    else:
        tax2track = {}
    print len(tax2track), len(tax2name)
    header = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Clade sizes", "RF (avg)", "RF (med)", "RF (std)", "RF (max)", "Shared tips")
    print >>OUT, '|'.join([h.ljust(15) for h in header])
    if args.ref_tree:
        print >>sys.stderr, "Reading ref tree from", args.ref_tree
        reft = Tree(args.ref_tree, format=1)
    else:
        reft = None

    SHOW_TREE = False
    if args.show_tree or args.render:
        SHOW_TREE = True

        
    prev_broken = set()
    ENTRIES = []
    ncbi.connect_database()
    for tfile in target_trees:
        #print tfile
        t = PhyloTree(tfile, sp_naming_function=None)
        if GET_TAXID:
            for n in t.iter_leaves():
                n.name = GET_TAXID(n.name)
        
        if args.outgroup:
            if len(args.outgroup) == 1:
                out = t & args.outgroup[0]
            else:
                out = t.get_common_ancestor(args.outgroup)
                if set(out.get_leaf_names()) ^ set(args.outgroup):
                    raise ValueError("Outgroup is not monophyletic")
                
            t.set_outgroup(out)
        t.ladderize()

        if prev_tree:
            tree_compare(t, prev_tree)
        prev_tree = t
       
        
        if args.tax_info:
            tax2name, tax2track = annotate_tree_with_taxa(t, args.tax_info, tax2name, tax2track)
            if args.dump_tax_info:
                cPickle.dump(tax2track, open("tax2track.pkl", "w"))
                cPickle.dump(tax2name, open("tax2name.pkl", "w"))
                print "Tax info written into pickle files"
        else:
            for n in t.iter_leaves():
                spcode = n.name
                n.add_features(taxid=spcode)
                n.add_features(species=spcode)
            tax2name, tax2track = annotate_tree_with_taxa(t, None, tax2name, tax2track)
            
        # Split tree into species trees
        #subtrees =  t.get_speciation_trees()
        if not args.rf_only:
            #print "Calculating tree subparts..."
            t1 = time.time()
            if not args.is_sptree:
                subtrees =  t.split_by_dups()
                #print "Subparts:", len(subtrees), time.time()-t1
            else:
                subtrees = [t]

          
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = analyze_subtrees(t, subtrees, show_tree=SHOW_TREE)
            
            #print valid_subtrees, broken_subtrees, ncbi_mistakes, total_rf
        else:
            subtrees = []
            valid_subtrees, broken_subtrees, ncbi_mistakes, broken_branches, total_rf, broken_clades, broken_sizes = 0, 0, 0, 0, 0, 0
            
        ndups = 0
        nsubtrees = len(subtrees)
           
        rf = 0
        rf_max = 0
        rf_std = 0
        rf_med = 0
        common_names = 0
        max_size = 0
        if reft and len(subtrees) == 1:
            rf = t.robinson_foulds(reft, attr_t1="realname")
            rf_max = rf[1]
            rf = rf[0]
            rf_med = rf
            
        elif reft:
            #print "Calculating avg RF..."
            nsubtrees, ndups, subtrees = t.get_speciation_trees(map_features=["taxid"])
            #print len(subtrees), "Sub-Species-trees found"
            avg_rf = []
            rf_max = 0.0 # reft.robinson_foulds(reft)[1]
            sum_size = 0.0
            print nsubtrees, "subtrees", ndups, "duplications"

            for ii, subt in enumerate(subtrees):
                print "\r%d" %ii,
                sys.stdout.flush()
                try:
                    partial_rf = subt.robinson_foulds(reft, attr_t1="taxid")
                except ValueError:
                    pass
                else:
                    sptree_size = len(set([n.taxid for n in subt.iter_leaves()]))
                    sum_size += sptree_size
                    avg_rf.append((partial_rf[0]/float(partial_rf[1])) * sptree_size)
                    common_names = len(partial_rf[3])
                    max_size = max(max_size, sptree_size)
                    rf_max = max(rf_max, partial_rf[1])
                #print  partial_rf[:2]
            rf = numpy.sum(avg_rf) / float(sum_size) # Treeko dist
            rf_std = numpy.std(avg_rf)
            rf_med = numpy.median(avg_rf)

        sizes_info = "%0.1f/%0.1f +- %0.1f" %( numpy.mean(broken_sizes), numpy.median(broken_sizes), numpy.std(broken_sizes))
        iter_values = [os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, rf, rf_med,
                       rf_std, rf_max, common_names] 
        print >>OUT, '|'.join(map(lambda x: str(x).strip().ljust(15), iter_values)) 
        fixed = sorted([n for n in prev_broken if n not in broken_clades])
        new_problems =  sorted(broken_clades - prev_broken)
        fixed_string = color(', '.join(fixed), "green") if fixed else ""
        problems_string = color(', '.join(new_problems), "red") if new_problems else ""
        OUT.write("    Fixed clades: %s\n" %fixed_string) if fixed else None
        OUT.write("    New broken:   %s\n" %problems_string) if new_problems else None
        prev_broken = broken_clades
        ENTRIES.append([os.path.basename(tfile), nsubtrees, ndups,
                        broken_subtrees, ncbi_mistakes, broken_branches, sizes_info, fixed_string, problems_string])
        OUT.flush()
        if args.show_tree or args.render:
            ts = TreeStyle()
            ts.force_topology = True
            #ts.tree_width = 500
            ts.show_leaf_name = False
            ts.layout_fn = ncbi_layout 
            ts.mode = "r"
            t.dist = 0
            if args.show_tree:
                #if args.hide_monophyletic:
                #    tax2monophyletic = {}
                #    n2content = t.get_node2content()
                #    for node in t.traverse():
                #        term2count = defaultdict(int)
                #        for leaf in n2content[node]:
                #            if leaf.lineage:
                #                for term in leaf.lineage:
                #                    term2count[term] += 1
                #        expected_size = len(n2content)
                #        for term, count in term2count.iteritems():
                #            if count > 1
                    
                print "Showing tree..."
                t.show(tree_style=ts)
            else:
                t.render("img.svg", tree_style=ts, dpi=300)
            print "dumping color config"
            cPickle.dump(name2color, open("ncbi_colors.pkl", "w"))

        if args.dump:
            cPickle.dump(t, open("ncbi_analysis.pkl", "w"))
            
    print
    print
    HEADER = ("TargetTree", "Subtrees", "Ndups", "Broken subtrees", "Broken clades", "Broken branches", "Clade sizes", "Fixed Groups", "New Broken Clades")
    print_table(ENTRIES, max_col_width = 50, row_line=True, header=HEADER)
            
    if args.output:
        OUT.close()