示例#1
0
def main(argv):
    """Parse ``argv``, load and edit a tree, then render, show or dump it.

    Steps, in order: parse the options; load the newick input (with
    ``--raxml`` the file is read and ``:length[support]`` annotations are
    rewritten as NHX ``support`` tags first); optionally link an alignment;
    install the species naming function; ladderize or sort; reroot;
    optionally dump the speciation subtrees; build a ``TreeStyle`` and
    either render it to ``--image`` or call ``t.show()``; optionally write
    the edited newick to ``--newick``.

    Args:
        argv: list of command-line tokens (without the program name).

    Raises:
        ValueError: if both --sort-branches and --ladderize are given.
        SystemExit: raised by argparse on an invalid command line.
    """
    # Without this declaration the assignment further down would only bind
    # a function-local name, and the compiled pattern would be discarded.
    # NOTE(review): assumes user_species_naming_function reads the
    # module-level SPCODE_REGEXP -- confirm against its definition.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # ---- input -----------------------------------------------------------
    input_gr = parser.add_argument_group(
        "TREE INPUT OPTIONS\n=================")

    input_gr.add_argument(
        'tree', metavar='tree_file', type=str, nargs=1,
        help='A tree file (or text string) in newick format.')

    input_gr.add_argument(
        "--raxml", dest="raxml", action="store_true",
        help="""Process newick as raxml bootstrap values""")

    # ---- image -----------------------------------------------------------
    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument(
        "-m", "--mode", dest="mode", choices=["c", "r"], default="r",
        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument(
        "-i", "--image", dest="image", type=str,
        help="Render tree image instead of showing it. A filename "
        " should be provided. PDF, SVG and PNG file extensions are"
        " supported (i.e. -i tree.svg)")

    img_gr.add_argument(
        "--Iw", "--width", dest="width", type=int, default=0,
        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument(
        "--Ih", "--height", dest="height", type=int, default=0,
        help="height of the rendered image in pixels (see --size-units).")

    # NOTE(review): args.resolution is parsed but never read below, so the
    # option currently has no effect -- confirm whether it should be
    # forwarded to t.render().
    img_gr.add_argument(
        "--Ir", "--resolution", dest="resolution", type=int, default=300,
        help="Resolution if the tree image (DPI)")

    img_gr.add_argument(
        "--Iu", "--size-units", dest="size_units",
        choices=["px", "mm", "in"], default="px",
        help="Units used to specify the size of the image."
        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument(
        "-mbs", "--min-branch-separation", dest="branch_separation",
        type=int, default=3,
        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument(
        "--ss", "--show-support", dest="show_support", action="store_true",
        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument(
        "--sbl", "--branch-length", dest="show_branch_length",
        action="store_true",
        help="""Show branch lengths.""")

    img_gr.add_argument(
        "--ft", "--force-topology", dest="force_topology",
        action="store_true",
        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument(
        "--hln", "--hide-leaf-names", dest="hide_leaf_names",
        action="store_true",
        help="""Hide leaf names.""")

    img_gr.add_argument(
        "--sin", "--show-internal-names", dest="show_internal_names",
        action="store_true",
        help="""Show the name attribute of all internal nodes.""")

    # ---- edit ------------------------------------------------------------
    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument(
        "-r", "--root", dest="root", type=str, nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    edit_gr.add_argument(
        "-s", "--sort-branches", dest="sort", action="store_true",
        help="""Sort branches according to node names.""")

    edit_gr.add_argument(
        "-l", "--ladderize", dest="ladderize", action="store_true",
        help="""Sort branches by partition size.""")

    # NOTE(review): args.color_by_rank is parsed but never read below.
    edit_gr.add_argument(
        "--color_by_rank", dest="color_by_rank", type=str, nargs="+",
        help="""If the attribute rank is present in nodes """)

    # ---- phylogenetics ---------------------------------------------------
    phylo_gr = parser.add_argument_group(
        "PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument(
        "--alg", dest="alg", type=str,
        help="""Multiple sequence alignment.""")

    phylo_gr.add_argument(
        "--alg-format", dest="alg_format", type=str, default="fasta",
        help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument(
        "--sp-discovery", dest="species_discovery_regexp", type=str,
        default="^[^_]+_(.+)",
        help="Perl regular expression to capture species"
        " code from node names. By default, node names"
        " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    phylo_gr.add_argument(
        "--dump-subtrees", dest="subtrees_output_file", type=str,
        help="Returns a file containing all possible species subtrees"
        " contained in a given gene tree ")

    phylo_gr.add_argument(
        "--newick", dest="newick", type=str,
        help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)

    # nargs=1 yields a one-element list.
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError(
            "--sort-branches and --ladderize options are mutually exclusive")

    # LOAD
    if args.raxml:
        # Read through a context manager so the handle is always closed.
        with open(tfile) as handle:
            raw_newick = handle.read()
        # ":0.12[95]" -> ":0.12[&&NHX:support=95]"
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]",
                    raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # EDIT
    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write(
            "Found %d duplication nodes. Dumping %d sutrees...\n"
            % (ndups, ntrees))
        # The context manager closes the file even if a write fails midway.
        with open(args.subtrees_output_file, "w") as out:
            for tree in treeiter:
                out.write("%s\n" % tree.write())

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        # pop() instead of del: no KeyError when "name" is already absent.
        LEAF_ATTRIBUTES.pop("name", None)

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    ts.layout_fn = master_layout
    if args.image:
        # A width/height of 0 (the option default) is passed on as None.
        t.render(args.image,
                 tree_style=ts,
                 w=args.width or None,
                 h=args.height or None,
                 units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        sys.stdout.write("Processed Newick dumped into %s\n" % args.newick)
示例#2
0
文件: ete_view.py 项目: daisieh/ete
def main(argv):
    """Parse ``argv``, load and edit a tree, then render, show or dump it.

    Steps, in order: parse the options; load the newick input (with
    ``--raxml`` the file is read and ``:length[support]`` annotations are
    rewritten as NHX ``support`` tags first); optionally link an alignment;
    install the species naming function; ladderize or sort; reroot;
    optionally dump the speciation subtrees; build a ``TreeStyle`` and
    either render it to ``--image`` or call ``t.show()``; optionally write
    the edited newick to ``--newick``.

    Args:
        argv: list of command-line tokens (without the program name).

    Raises:
        ValueError: if both --sort-branches and --ladderize are given.
        SystemExit: raised by argparse on an invalid command line.
    """
    # Without this declaration the assignment further down would only bind
    # a function-local name, and the compiled pattern would be discarded.
    # NOTE(review): assumes user_species_naming_function reads the
    # module-level SPCODE_REGEXP -- confirm against its definition.
    global SPCODE_REGEXP

    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    # ---- input -----------------------------------------------------------
    input_gr = parser.add_argument_group("TREE INPUT OPTIONS\n=================")

    input_gr.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                          help='A tree file (or text string) in newick format.')

    input_gr.add_argument("--raxml", dest="raxml",
                          action="store_true",
                          help="""Process newick as raxml bootstrap values""")

    # ---- image -----------------------------------------------------------
    img_gr = parser.add_argument_group("TREE IMAGE OPTIONS\n=================")

    img_gr.add_argument("-m", "--mode", dest="mode",
                        choices=["c", "r"], default="r",
                        help="""(r)ectangular or (c)ircular visualization""")

    img_gr.add_argument("-i", "--image", dest="image",
                        type=str,
                        help="Render tree image instead of showing it. A filename "
                        " should be provided. PDF, SVG and PNG file extensions are"
                        " supported (i.e. -i tree.svg)")

    img_gr.add_argument("--Iw", "--width", dest="width",
                        type=int, default=0,
                        help="width of the rendered image in pixels (see --size-units).")

    img_gr.add_argument("--Ih", "--height", dest="height",
                        type=int, default=0,
                        help="height of the rendered image in pixels (see --size-units).")

    # NOTE(review): args.resolution is parsed but never read below, so the
    # option currently has no effect -- confirm whether it should be
    # forwarded to t.render().
    img_gr.add_argument("--Ir", "--resolution", dest="resolution",
                        type=int, default=300,
                        help="Resolution if the tree image (DPI)")

    img_gr.add_argument("--Iu", "--size-units", dest="size_units",
                        choices=["px", "mm", "in"], default="px",
                        help="Units used to specify the size of the image."
                        " (px:pixels, mm:millimeters, in:inches). ")

    img_gr.add_argument("-mbs", "--min-branch-separation", dest="branch_separation",
                        type=int, default=3,
                        help="Min number of pixels to separate branches vertically.")

    img_gr.add_argument("--ss", "--show-support", dest="show_support",
                        action="store_true",
                        help="""Shows branch bootstrap/support values""")

    img_gr.add_argument("--sbl", "--branch-length", dest="show_branch_length",
                        action="store_true",
                        help="""Show branch lengths.""")

    img_gr.add_argument("--ft", "--force-topology", dest="force_topology",
                        action="store_true",
                        help="""Force branch length to have a minimum length in the image""")

    img_gr.add_argument("--hln", "--hide-leaf-names", dest="hide_leaf_names",
                        action="store_true",
                        help="""Hide leaf names.""")

    img_gr.add_argument("--sin", "--show-internal-names", dest="show_internal_names",
                        action="store_true",
                        help="""Show the name attribute of all internal nodes.""")

    # ---- edit ------------------------------------------------------------
    edit_gr = parser.add_argument_group("TREE EDIT OPTIONS\n=================")

    edit_gr.add_argument("-r", "--root", dest="root",
                         type=str, nargs="*",
                         help="Roots the tree to the node grouping the list"
                         " of node names provided (space separated). In example:"
                         "'--root human rat mouse'")

    edit_gr.add_argument("-s", "--sort-branches", dest="sort",
                         action="store_true",
                         help="""Sort branches according to node names.""")

    edit_gr.add_argument("-l", "--ladderize", dest="ladderize",
                         action="store_true",
                         help="""Sort branches by partition size.""")

    # NOTE(review): args.color_by_rank is parsed but never read below.
    edit_gr.add_argument("--color_by_rank", dest="color_by_rank",
                         type=str, nargs="+",
                         help="""If the attribute rank is present in nodes """)

    # ---- phylogenetics ---------------------------------------------------
    phylo_gr = parser.add_argument_group("PHYLOGENETIC OPTIONS\n=================")

    phylo_gr.add_argument("--alg", dest="alg",
                          type=str,
                          help="""Multiple sequence alignment.""")

    phylo_gr.add_argument("--alg-format", dest="alg_format",
                          type=str, default="fasta",
                          help="""fasta, phylip, iphylip, relaxed_iphylip, relaxed_phylip.""")

    phylo_gr.add_argument("--sp-discovery", dest="species_discovery_regexp",
                          type=str, default="^[^_]+_(.+)",
                          help="Perl regular expression to capture species"
                          " code from node names. By default, node names"
                          " are expected to follow the NAME_SPCODE format = '^[^_]+_(.+)' ")

    phylo_gr.add_argument("--dump-subtrees", dest="subtrees_output_file",
                          type=str,
                          help="Returns a file containing all possible species subtrees"
                               " contained in a given gene tree ")

    phylo_gr.add_argument("--newick", dest="newick",
                          type=str,
                          help="dumps newick file after applying editing options")

    args = parser.parse_args(argv)

    # nargs=1 yields a one-element list.
    tfile = args.tree[0]

    if args.ladderize and args.sort:
        raise ValueError("--sort-branches and --ladderize options are mutually exclusive")

    # LOAD
    if args.raxml:
        # Read through a context manager so the handle is always closed.
        with open(tfile) as handle:
            raw_newick = handle.read()
        # ":0.12[95]" -> ":0.12[&&NHX:support=95]"
        nw = re.sub(r":(\d+\.\d+)\[(\d+)\]", r":\1[&&NHX:support=\2]", raw_newick)
        t = PhyloTree(nw)
    else:
        t = PhyloTree(tfile)

    if args.alg:
        t.link_to_alignment(args.alg, alg_format=args.alg_format)
        LEAF_ATTRIBUTES["sequence"] = 1

    if args.species_discovery_regexp:
        SPCODE_REGEXP = re.compile(args.species_discovery_regexp)
        t.set_species_naming_function(user_species_naming_function)

    # EDIT
    if args.ladderize:
        t.ladderize()
    if args.sort:
        t.sort_descendants()

    if args.root:
        if len(args.root) > 1:
            outgroup = t.get_common_ancestor(args.root)
        else:
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    # EXTRACT INFO
    if args.subtrees_output_file:
        ntrees, ndups, treeiter = t.get_speciation_trees()
        sys.stderr.write("Found %d duplication nodes. Dumping %d sutrees...\n" % (ndups, ntrees))
        # The context manager closes the file even if a write fails midway.
        with open(args.subtrees_output_file, "w") as out:
            for tree in treeiter:
                out.write("%s\n" % tree.write())

    # VISUALIZATION
    ts = TreeStyle()
    ts.mode = args.mode
    ts.show_leaf_name = False
    ts.branch_vertical_margin = args.branch_separation
    if args.show_support:
        ts.show_branch_support = True
    if args.show_branch_length:
        ts.show_branch_length = True
    if args.force_topology:
        ts.force_topology = True

    if args.hide_leaf_names:
        # pop() instead of del: no KeyError when "name" is already absent.
        LEAF_ATTRIBUTES.pop("name", None)

    if args.show_internal_names:
        INTERNAL_ATTRIBUTES["name"] = 1

    ts.layout_fn = master_layout
    if args.image:
        # A width/height of 0 (the option default) is passed on as None.
        t.render(args.image, tree_style=ts, w=args.width or None,
                 h=args.height or None, units=args.size_units)
    else:
        t.show(None, tree_style=ts)

    if args.newick:
        t.write(features=[], outfile=args.newick)
        sys.stdout.write("Processed Newick dumped into %s\n" % args.newick)
示例#3
0
def main(argv):
    """Query NCBI taxonomy data for the names/taxids given on ``argv``.

    Depending on the options this prints name-to-taxid translations,
    prints per-taxid lineage information (``--info``), writes pruned
    taxonomy trees to ``your_ncbi_query.*`` files (``--taxonomy``), and/or
    prints a reference tree annotated with ``taxid``, ``sci_name`` and
    ``ncbi_track`` features (``--reftree``).

    Args:
        argv: list of command-line tokens (without the program name).

    Raises:
        SystemExit: with code 0 after printing usage when none of
            --taxonomy, --info or --reftree is given; also raised by
            argparse on an invalid command line.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db", dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    parser.add_argument("-t", "--taxid", dest="taxid", nargs="+",
                        type=int,
                        help="""taxids (space separated)""")

    parser.add_argument("-tf", "--taxid_file", dest="taxid_file",
                        type=str,
                        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-r", "--reftree", dest="reftree",
                        type=str,
                        help="""tree file containing taxids as node names.""")

    parser.add_argument("--reftree_attr", dest="reftree_attr",
                        type=str, default="name",
                        help="""Where taxid should be read from""")

    parser.add_argument("-n", "--name", dest="names", nargs="+",
                        type=str,
                        help="""species or taxa names (comma separated)""")

    # The lines of this file are added to the set of names (not taxids),
    # so the help text says so.
    parser.add_argument("-nf", "--names_file", dest="names_file",
                        type=str,
                        help="""file containing a list of species or taxa names (one per line)""")

    parser.add_argument("-x", "--taxonomy", dest="taxonomy",
                        action="store_true",
                        help=("returns a pruned version of the NCBI taxonomy"
                              " tree containing target species"))

    parser.add_argument("--show_tree", dest="show_tree",
                        action="store_true",
                        help="""shows the NCBI taxonomy tree of the provided species""")

    parser.add_argument("--collapse_subspecies", dest="collapse_subspecies",
                        action="store_true",
                        help=("When used, all nodes under the the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    parser.add_argument("--rank_limit", dest="rank_limit",
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))

    parser.add_argument("--full_lineage", dest="full_lineage",
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))

    parser.add_argument("-i", "--info", dest="info",
                        action="store_true",
                        help="""shows NCBI information about the species""")

    parser.add_argument("--fuzzy", dest="fuzzy", type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    if not args.taxonomy and not args.info and not args.reftree:
        # Nothing to do: show usage and leave.
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        # NOTE(review): on this path ncbi.connect_database() is never called,
        # yet ncbi.* helpers are used below -- confirm that is intended.
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set()
    all_taxids = []

    # COLLECT NAMES
    if args.names_file:
        with open(args.names_file, "rU") as handle:
            all_names.update(map(strip, handle.read().split("\n")))
    if args.names:
        all_names.update(map(strip, " ".join(args.names).split(",")))
    all_names.discard("")

    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            # Load the Levenshtein extension once, not once per name.
            # MODULE_PATH is the name already used on this same fuzzy path
            # when opening the connection; the original spelled it
            # `module_path` here.
            # NOTE(review): confirm both spellings were meant to be the
            # same directory.
            c.enable_load_extension(True)
            c.execute("select load_extension('%s')" % os.path.join(
                MODULE_PATH, "SQLite-Levenshtein/levenshtein.sqlext"))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            row = [score, name, realname.capitalize(), taxid]
            sys.stdout.write("\t".join(map(str, row)) + "\n")

    # COLLECT TAXIDS
    # NOTE(review): taxids read from a file are strings while --taxid values
    # are ints; the `int(n.taxid) in all_taxids` test further down only
    # matches the int form -- verify against ncbi.* expectations.
    if args.taxid_file:
        with open(args.taxid_file, "rU") as handle:
            all_taxids.extend(map(strip, handle.read().split("\n")))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        all_taxids.extend({getattr(n, args.reftree_attr)
                           for n in reftree.iter_leaves()})

    # --info
    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" % len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            row = [merge_conversion.get(int(taxid), taxid), name,
                   named_lineage, lineage]
            sys.stdout.write("\t".join(map(str, row)) + "\n")

        # Compare as strings so int and str taxids are matched alike.
        known = set(str(k) for k in translator)
        for notfound in set(map(str, all_taxids)) - known:
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    # --taxonomy
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" % len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # Order matters: taxid/sci_name are taken from n.name before
            # n.name is rewritten as "<name>{<taxid>}".
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            species_nodes = [n for n in t.traverse()
                             if n.rank == "species" and int(n.taxid) in all_taxids]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # Build a copy of the species node carrying the same
                    # features; it is attached as a child alongside the
                    # node's former descendants.
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        # The message names the prefix actually used for the files below.
        sys.stdout.write(
            "\n\n  ===== Newick files saved as 'your_ncbi_query.*' ===== \n")
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9,
                features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"],
                outfile="your_ncbi_query.extended.nw")
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    # --reftree
    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        sys.stdout.write(
            "%s\n" % reftree.write(features=["taxid", "sci_name", "ncbi_track"]))
示例#4
0
def main(argv):
    """Run the NCBI taxonomy query command-line tool.

    The options in ``argv`` select one or more of three outputs:

    * ``--info``: print one tab-separated line per taxid (taxid, name,
      named lineage, numeric lineage); taxids without a translation are
      reported on stderr.
    * ``--taxonomy``: build the taxonomy topology of the requested taxids
      and write it to four ``your_ncbi_query.*`` newick files in the
      current directory (optionally opening the tree viewer first).
    * ``--reftree``: annotate the leaves of the given reference tree with
      ``taxid``, ``sci_name`` and ``ncbi_track`` and print it as newick.

    Species names given with ``-n`` / ``-nf`` are always translated and
    printed (score, query name, matched name, taxid).

    Args:
        argv: list of command-line argument strings handed to the parser.

    Exits with status 0 after printing the usage line when none of
    ``--taxonomy``, ``--info`` or ``--reftree`` is requested.
    """
    parser = ArgumentParser(description=__DESCRIPTION__)

    parser.add_argument("--db",
                        dest="dbfile",
                        type=str,
                        help="""NCBI sqlite3 db file.""")

    parser.add_argument("-t",
                        "--taxid",
                        dest="taxid",
                        nargs="+",
                        type=int,
                        help="""taxids (space separated)""")

    parser.add_argument(
        "-tf",
        "--taxid_file",
        dest="taxid_file",
        type=str,
        help="""file containing a list of taxids (one per line)""")

    parser.add_argument("-r",
                        "--reftree",
                        dest="reftree",
                        type=str,
                        help="""tree file containing taxids as node names.""")

    parser.add_argument("--reftree_attr",
                        dest="reftree_attr",
                        type=str,
                        default="name",
                        help="""Where taxid should be read from""")

    parser.add_argument("-n",
                        "--name",
                        dest="names",
                        nargs="+",
                        type=str,
                        help="""species or taxa names (comma separated)""")

    parser.add_argument(
        "-nf",
        "--names_file",
        dest="names_file",
        type=str,
        help="""file containing a list of species or taxa names"""
        """ (one per line)""")

    parser.add_argument("-x",
                        "--taxonomy",
                        dest="taxonomy",
                        action="store_true",
                        help=("returns a pruned version of the NCBI taxonomy"
                              " tree containing target species"))

    parser.add_argument(
        "--show_tree",
        dest="show_tree",
        action="store_true",
        help="""shows the NCBI taxonomy tree of the provided species""")

    parser.add_argument("--collapse_subspecies",
                        dest="collapse_subspecies",
                        action="store_true",
                        help=("When used, all nodes under the species rank"
                              " are collapsed, so all species and subspecies"
                              " are seen as sister nodes"))

    parser.add_argument("--rank_limit",
                        dest="rank_limit",
                        type=str,
                        help=("When used, all nodes under the provided rank"
                              " are discarded"))

    parser.add_argument("--full_lineage",
                        dest="full_lineage",
                        action="store_true",
                        help=("When used, topology is not pruned to avoid "
                              " one-child-nodes, so the complete lineage"
                              " track leading from root to tips is kept."))

    parser.add_argument("-i",
                        "--info",
                        dest="info",
                        action="store_true",
                        help="""shows NCBI information about the species""")

    parser.add_argument("--fuzzy",
                        dest="fuzzy",
                        type=float,
                        help=("Tries a fuzzy (and SLOW) search for those"
                              " species names that could not be translated"
                              " into taxids. A float number must be provided"
                              " indicating the minimum string similarity."))

    args = parser.parse_args(argv)
    if not args.taxonomy and not args.info and not args.reftree:
        # Nothing to output: show how the tool is meant to be called.
        parser.print_usage()
        sys.exit(0)

    if args.fuzzy:
        # NOTE(review): in fuzzy mode ncbi.connect_database() is not called
        # here -- confirm the ncbi helpers used below do not rely on it.
        import pysqlite2.dbapi2 as sqlite3
        c = sqlite3.connect(os.path.join(MODULE_PATH, args.dbfile))
    else:
        ncbi.connect_database(args.dbfile)

    all_names = set([])
    all_taxids = []

    # --- collect the species/taxa names to translate ---------------------
    if args.names_file:
        # "with" guarantees the handle is closed once the names are read.
        with open(args.names_file, "rU") as names_handle:
            all_names.update(
                line.strip() for line in names_handle.read().split("\n"))
    if args.names:
        # argparse splits on spaces; re-join so multi-word names survive and
        # split on commas, which is the documented separator.
        all_names.update(
            part.strip() for part in " ".join(args.names).split(","))
    all_names.discard("")
    #all_names = set([n.lower() for n in all_names])
    not_found = set()
    name2realname = {}
    name2score = {}
    if all_names:
        log.info("Dumping name translations:")
        name2id = ncbi.get_name_translator(all_names)
        not_found = all_names - set(name2id.keys())

        if args.fuzzy and not_found:
            log.info("%s unknown names", len(not_found))
            # Enable and load the Levenshtein extension once, instead of on
            # every iteration. MODULE_PATH is the same constant already used
            # to locate the database above (the lowercase spelling used here
            # before does not appear anywhere else in this function).
            c.enable_load_extension(True)
            c.execute("select load_extension('%s')" % os.path.join(
                MODULE_PATH, "SQLite-Levenshtein/levenshtein.sqlext"))
            for name in not_found:
                tax, realname, sim = ncbi.get_fuzzy_name_translation(
                    name, args.fuzzy)
                if tax:
                    name2id[name] = tax
                    name2realname[name] = realname
                    name2score[name] = "Fuzzy:%0.2f" % sim

        for name in all_names:
            taxid = name2id.get(name, "???")
            realname = name2realname.get(name, name)
            score = name2score.get(name, "Exact:1.0")
            print("\t".join(
                map(str,
                    [score, name, realname.capitalize(), taxid])))

    # --- collect the taxids to work with ---------------------------------
    if args.taxid_file:
        with open(args.taxid_file, "rU") as taxids_handle:
            all_taxids.extend(
                line.strip() for line in taxids_handle.read().split("\n"))
    if args.taxid:
        all_taxids.extend(args.taxid)

    reftree = None
    if args.reftree:
        reftree = PhyloTree(args.reftree)
        # Every distinct value of the chosen leaf attribute is a taxid.
        all_taxids.extend(
            list(
                set([
                    getattr(n, args.reftree_attr)
                    for n in reftree.iter_leaves()
                ])))

    # --- --info: tabular dump of names and lineages ----------------------
    if all_taxids and args.info:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        all_taxids, merge_conversion = ncbi.translate_merged(all_taxids)
        log.info("Dumping %d taxid translations:" % len(all_taxids))
        all_taxids.discard("")
        translator = ncbi.get_taxid_translator(all_taxids)
        for taxid, name in translator.iteritems():
            lineage = ncbi.get_sp_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage = ','.join(map(str, lineage))
            print("\t".join(
                map(str, [
                    merge_conversion.get(int(taxid), taxid), name,
                    named_lineage, lineage
                ])))

        # Compare as strings: requested taxids may be ints or strings.
        for notfound in set(map(str, all_taxids)) - set(
                str(k) for k in translator.iterkeys()):
            sys.stderr.write("%s NOT FOUND\n" % notfound)

    # --- --taxonomy: build, annotate and save the taxonomy tree ----------
    if all_taxids and args.taxonomy:
        all_taxids = set(all_taxids)
        all_taxids.discard("")
        log.info("Dumping NCBI taxonomy of %d taxa:" % len(all_taxids))

        t = ncbi.get_topology(all_taxids, args.full_lineage, args.rank_limit)
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            # The node name is used as the taxid before being replaced by
            # the "<scientific name>{<taxid>}" label.
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            if n.rank in COLOR_RANKS:
                n.add_features(bgcolor=COLOR_RANKS[n.rank])
            n.name = "%s{%s}" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_sp_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))

        if args.collapse_subspecies:
            # Requested taxids are ints when given with -t but strings when
            # read from a file or a reference tree, so membership is tested
            # on a normalised string form; an int-only test would silently
            # skip every species in the latter cases.
            requested = set(str(taxid) for taxid in all_taxids)
            species_nodes = [
                n for n in t.traverse()
                if n.rank == "species" and str(int(n.taxid)) in requested
            ]
            for sp_node in species_nodes:
                bellow = sp_node.get_descendants()
                if bellow:
                    # creates a copy of the species node
                    connector = sp_node.__class__()
                    for f in sp_node.features:
                        connector.add_feature(f, getattr(sp_node, f))
                    connector.name = connector.name + "{species}"
                    # Re-attach every descendant directly under the species
                    # node, tagging its name with its own rank.
                    for n in bellow:
                        n.detach()
                        n.name = n.name + "{%s}" % n.rank
                        sp_node.add_child(n)
                    sp_node.add_child(connector)
                    sp_node.add_feature("collapse_subspecies", "1")

        if args.show_tree:
            t.show()

        # The message now names the prefix actually used for the files.
        print("\n\n  ===== Newick files saved as 'your_ncbi_query.*' ===== ")
        t.write(format=9, outfile="your_ncbi_query.nw")
        t.write(format=8, outfile="your_ncbi_query.named.nw")
        t.write(format=9,
                features=[
                    "taxid", "name", "rank", "bgcolor", "sci_name",
                    "collapse_subspecies", "named_lineage"
                ],
                outfile="your_ncbi_query.extended.nw")
        # Last file uses bare taxids as leaf names.
        for i in t.iter_leaves():
            i.name = i.taxid
        t.write(format=9, outfile="your_ncbi_query.taxids.nw")

    # --- --reftree: annotate and print the reference tree ----------------
    if all_taxids and reftree:
        translator = ncbi.get_taxid_translator(all_taxids)
        for n in reftree.iter_leaves():
            if not hasattr(n, "taxid"):
                n.add_features(taxid=int(getattr(n, args.reftree_attr)))
            n.add_features(sci_name=translator.get(int(n.taxid), n.name))
            lineage = ncbi.get_sp_lineage(n.taxid)
            named_lineage = '|'.join(ncbi.translate_to_names(lineage))
            n.add_features(ncbi_track=named_lineage)

        print(reftree.write(features=["taxid", "sci_name", "ncbi_track"]))
示例#5
0
def main(argv):
    """Export the evolutionary events of a newick gene tree as orthoXML.

    Loads the tree given on the command line as a ``PhyloTree``, optionally
    re-roots it (``--root``), detects speciation/duplication events unless
    ``--skip_ortholog_detection`` is set, optionally displays the tree
    (``--ascii``, ``--newick``, ``--show``) and finally hands it to
    ``export_as_orthoXML``.

    Args:
        argv: list of command-line argument strings to parse. ``None``
            makes argparse fall back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('tree',
                        metavar='tree_file',
                        type=str,
                        nargs=1,
                        help='A tree file (or text string) in newick format.')

    parser.add_argument(
        "--sp_delimiter",
        dest="species_delimiter",
        type=str,
        default="_",
        help=("When species names are guessed from node names,"
              " this argument specifies how to split node name to guess"
              " the species code"))

    parser.add_argument(
        "--sp_field",
        dest="species_field",
        type=int,
        default=1,
        help=("When species names are guessed from node names,"
              " this argument specifies the position of the species"
              " name code relative to the name splitting delimiter"))

    parser.add_argument(
        "--root",
        dest="root",
        type=str,
        nargs="*",
        help="Roots the tree to the node grouping the list"
        " of node names provided (space separated). In example:"
        "'--root human rat mouse'")

    parser.add_argument(
        "--skip_ortholog_detection",
        dest="skip_ortholog_detection",
        action="store_true",
        help=
        ("Skip automatic detection of"
         " speciation and duplication events, thus relying in the"
         " correct annotation of the provided tree using"
         " the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"
         ))

    parser.add_argument(
        "--evoltype_attr",
        dest="evoltype_attr",
        type=str,
        default="evoltype",
        help=(
            "When orthology detection is disabled,"
            " the attribute name provided here will be expected to exist"
            " in all internal nodes and read from the extended newick format"))

    parser.add_argument("--database",
                        dest="database",
                        type=str,
                        default="",
                        help=("Database name"))

    # The three display flags below are plain booleans: "store_true" already
    # defaults to False, so no explicit default is needed.
    parser.add_argument(
        "--show",
        dest="show",
        action="store_true",
        help=(
            "Show the tree and its evolutionary events before orthoXML export"
        ))

    parser.add_argument(
        "--ascii",
        dest="ascii",
        action="store_true",
        help=(
            "Show the tree using ASCII representation and all its evolutionary"
            " events before orthoXML export"))

    parser.add_argument(
        "--newick",
        dest="newick",
        action="store_true",
        help=("print the extended newick format for provided tree using"
              " ASCII representation and all its evolutionary events"
              " before orthoXML export"))

    # Parse the arguments this function was actually given rather than
    # silently reading sys.argv, so main() can be driven programmatically.
    args = parser.parse_args(argv)
    newick = args.tree[0]

    # NOTE(review): there is no ``global`` statement, so these two names are
    # function locals and nothing outside main() can see them. If
    # extract_spname is supposed to honour --sp_field / --sp_delimiter, that
    # wiring needs to be confirmed; behaviour is left unchanged here.
    SPECIES_NAME_POS = args.species_field
    SPECIES_NAME_DELIMITER = args.species_delimiter

    # load a phylomeDB Tree provided as a newick file in the command line
    t = PhyloTree(newick, sp_naming_function=extract_spname)

    if args.root:
        if len(args.root) > 1:
            # Several names: root on the node grouping all of them.
            outgroup = t.get_common_ancestor(args.root)
        else:
            # Single name: look the node up by name.
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    if not args.skip_ortholog_detection:
        # detect speciation and duplication events using the species overlap
        # algorithm used in phylomeDB
        t.get_descendant_evol_events()

    if args.ascii:
        print(t.get_ascii(attributes=[args.evoltype_attr, "name"],
                          show_internal=True))

    if args.newick:
        print(t.write(features=[args.evoltype_attr], format_root_node=True))

    if args.show:
        t.show()

    export_as_orthoXML(t, args.database, args.evoltype_attr)
示例#6
0
def main(argv):
    """Command-line entry point: turn a newick gene tree into orthoXML.

    Steps, in order: parse ``argv``; load the tree as a ``PhyloTree``;
    re-root it when ``--root`` is given; detect speciation and duplication
    events unless ``--skip_ortholog_detection`` is set; print or show the
    tree when ``--ascii``, ``--newick`` or ``--show`` are set; call
    ``export_as_orthoXML`` with the tree, the database name and the name of
    the event-type attribute.

    Args:
        argv: list of command-line argument strings to parse. Passing
            ``None`` lets argparse read ``sys.argv[1:]`` instead.
    """
    parser = argparse.ArgumentParser(description=__DESCRIPTION__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('tree', metavar='tree_file', type=str, nargs=1,
                        help='A tree file (or text string) in newick format.')

    parser.add_argument("--sp_delimiter", dest="species_delimiter",
                        type=str, default="_",
                        help=("When species names are guessed from node names,"
                              " this argument specifies how to split node name to guess"
                              " the species code"))

    parser.add_argument("--sp_field", dest="species_field",
                        type=int, default=1,
                        help=("When species names are guessed from node names,"
                              " this argument specifies the position of the species"
                              " name code relative to the name splitting delimiter"))

    parser.add_argument("--root", dest="root",
                        type=str, nargs="*",
                        help="Roots the tree to the node grouping the list"
                        " of node names provided (space separated). In example:"
                        "'--root human rat mouse'")

    parser.add_argument("--skip_ortholog_detection", dest="skip_ortholog_detection",
                        action="store_true",
                        help=("Skip automatic detection of"
                              " speciation and duplication events, thus relying in the"
                              " correct annotation of the provided tree using"
                              " the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"))

    parser.add_argument("--evoltype_attr", dest="evoltype_attr",
                        type=str, default="evoltype",
                        help=("When orthology detection is disabled,"
                              " the attribute name provided here will be expected to exist"
                              " in all internal nodes and read from the extended newick format"))

    parser.add_argument("--database", dest="database",
                        type=str, default="",
                        help=("Database name"))

    # "store_true" flags default to False on their own; the former
    # default="" was only ever tested for truthiness.
    parser.add_argument("--show", dest="show",
                        action="store_true",
                        help=("Show the tree and its evolutionary events before orthoXML export"))

    parser.add_argument("--ascii", dest="ascii",
                        action="store_true",
                        help=("Show the tree using ASCII representation and all its evolutionary"
                              " events before orthoXML export"))

    parser.add_argument("--newick", dest="newick",
                        action="store_true",
                        help=("print the extended newick format for provided tree using"
                              " ASCII representation and all its evolutionary events"
                              " before orthoXML export"))

    # Use the argument list passed by the caller; it was previously ignored
    # in favour of sys.argv.
    args = parser.parse_args(argv)
    newick = args.tree[0]

    # NOTE(review): without a ``global`` declaration these are local
    # variables that are never read again inside main(). Confirm whether
    # extract_spname is expected to pick up --sp_field / --sp_delimiter;
    # the assignments are kept as they were.
    SPECIES_NAME_POS = args.species_field
    SPECIES_NAME_DELIMITER = args.species_delimiter

    # load a phylomeDB Tree provided as a newick file in the command line
    t = PhyloTree(newick, sp_naming_function=extract_spname)

    if args.root:
        if len(args.root) > 1:
            # More than one name: the outgroup is their common ancestor.
            outgroup = t.get_common_ancestor(args.root)
        else:
            # Exactly one name: fetch that node by name.
            outgroup = t & args.root[0]
        t.set_outgroup(outgroup)

    if not args.skip_ortholog_detection:
        # detect speciation and duplication events using the species overlap
        # algorithm used in phylomeDB
        t.get_descendant_evol_events()

    if args.ascii:
        print(t.get_ascii(attributes=[args.evoltype_attr, "name"], show_internal=True))

    if args.newick:
        print(t.write(features=[args.evoltype_attr], format_root_node=True))

    if args.show:
        t.show()

    export_as_orthoXML(t, args.database, args.evoltype_attr)