def runDNADIST(mali, pairs, options):
    """Run PHYLIP ``dnadist`` on *mali* and write the result.

    mali
        multiple alignment handed to the PHYLIP wrapper.
    pairs
        accepted for signature compatibility with the other runners;
        not used here.
    options
        command line options; ``distance`` and ``dump`` are read here and
        the whole object is passed on to ``writePhylipResult``.
    """
    runner = WrapperPhylip.Phylip()
    runner.setProgram("dnadist")
    runner.setMali(mali)

    # Number of "D" key presses sent for each distance; any other distance
    # sends none. Presumably each press advances dnadist's model menu by
    # one entry - confirm against the PHYLIP documentation.
    presses = {"K80": 1, "JC69": 2, "LogDet": 3}
    menu_keys = ["D"] * presses.get(options.distance, 0)
    menu_keys.append("Y")
    runner.setOptions(menu_keys)

    if options.dump:
        runner.setLogLevel(2)

    writePhylipResult(runner.run(), options)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads Newick trees from stdin and combines them according to
    ``--method``:

    * ``non-redundant``: write one representative for each group of
      trees that ``TreeTools.IsCompatible`` reports as compatible.
    * ``select-largest``: group trees by name (optionally reduced with
      ``--regex-id``) and write the tree picked by ``getBestTree`` for
      each group.
    * ``consensus``: build a consensus tree with PHYLIP ``consense``.
    * ``min``, ``max``, ``sum``, ``mean``, ``counts``, ``stddev``,
      ``median``: aggregate branch lengths node by node onto the first
      tree and write that tree.

    Raises ValueError for an unknown aggregation method, for a node without
    any values when ``--error-branchlength`` is not set, for a tree name
    that ``--regex-id`` does not match, and when no tree was read for an
    aggregation method.

    NOTE(review): *argv* is accepted but never forwarded to ``E.Start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean",
                               "median", "stddev", "non-redundant",
                               "consensus", "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values", type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength", dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values=None,
        error_branchlength=None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    if options.method == "non-redundant":
        # compute non-redundant trees: every tree is either counted towards
        # the first compatible template or becomes a new template itself.
        template_trees = []
        template_counts = []
        ntree = 0
        for tree in nexus.trees:

            for x in range(0, len(template_trees)):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template_trees[x])
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                # no template matched
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" % (ntree, len(template_trees)))

            ntree += 1

        for x in range(0, len(template_trees)):
            if options.loglevel >= 1:
                # percentages are relative to the number of input trees
                # (the original referenced an undefined name here).
                options.stdlog.write("# tree: %i, counts: %i, percent=%5.2f\n" %
                                     (x, template_counts[x],
                                      template_counts[x] * 100.0 / ninput))
            options.stdout.write(
                TreeTools.Tree2Newick(template_trees[x]) + "\n")
            noutput += 1

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                match = re.search(options.regex_id, n)
                if match is None:
                    raise ValueError(
                        "pattern %s does not match tree name %s" %
                        (options.regex_id, n))
                n = match.groups()[0]

            clusters.setdefault(n, []).append(x)

        new_trees = []

        for name, cluster in clusters.items():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster], options.method))

        for x in range(0, len(new_trees)):
            options.stdout.write(">%s\n" % new_trees[x].name)
            options.stdout.write(TreeTools.Tree2Newick(new_trees[x],) + "\n")
            noutput += 1

        nskipped = ninput - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" % (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if not nexus.trees:
            raise ValueError("no trees read from stdin")

        if options.method in ("min", "max", "sum", "mean", "counts"):

            # the first tree accumulates the result
            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in options.filtered_branch_lengths:
                    xtree.node(n).data.branchlength = 0

            # number of values seen per node; the first tree counts once.
            # NOTE(review): indexed by node id, so this assumes node ids
            # run from 0 to len(chain) - 1 - confirm.
            ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method == "sum":
                f = lambda x, y: x + y
            elif options.method == "mean":
                # summed here, divided by the per-node count below
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                # NOTE(review): filtered lengths were reset to 0 above, so
                # they no longer test as filtered here - confirm intent.
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise ValueError("unknown option %s" % options.method)

            for tree in nexus.trees[1:]:
                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in options.filtered_branch_lengths:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength,
                            tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is not None:
                            xtree.node(n).data.branchlength = options.error_branchlength
                            if options.loglevel >= 1:
                                options.stdlog.write(
                                    "# no counts for node %i - set to %f\n" %
                                    (n, options.error_branchlength))
                            nerrors += 1
                        else:
                            raise ValueError("no counts for node %i" % n)

        else:
            # collect all values for trees
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in options.filtered_branch_lengths:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = scipy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = scipy.median(values[n])
                else:
                    if options.error_branchlength is not None:
                        node.data.branchlength = options.error_branchlength
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" %
                                (n, options.error_branchlength))
                        nerrors += 1
                    else:
                        raise ValueError("no counts for node %i" % n)

            if options.write_values:
                # one line per node: the sorted leaves below the node and
                # the sorted values collected for it.
                with open(options.write_values, "w") as outfile:
                    for n, node in tree.chain.items():
                        values[n].sort()
                        leaf_id = options.separator.join(
                            sorted(TreeTools.GetLeaves(tree, n)))
                        outfile.write("%s\t%s\n" %
                                      (leaf_id, ";".join(map(str, values[n]))))

        # only the first tree, which now holds the aggregate, is written
        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" % (
            ninput, nskipped, noutput, nerrors))

    E.Stop()
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a batch of trees and write them.

    lines
        Newick input lines; the last character of each line (the line
        terminator) is dropped before parsing.
    other_trees
        trees used by ``divide-by-tree``: either one tree for all inputs
        or one per input tree, indexed by the running tree number.
    options
        command line options. ``options.parameters`` is consumed from the
        front by methods that take an argument (``phylip``, ``rename``,
        ``extract-with-pattern``, ``remove-pattern``).
    map_old2new
        mapping of taxon names used by ``rename`` and filled by
        ``build-map``.
    ntree
        running number of the first tree in this batch.

    Returns a tuple ``(ntotal, nskipped, ntree)``: the number of trees
    read, the number of ``divide-by-tree`` applications skipped because
    the trees were not identical, and the updated running tree number.
    """
    nexus = TreeTools.Newick2Nexus([line[:-1] for line in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    phylip_executable = None
    phylip_options = None

    index = 0

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" % (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                # the executable and its "@"-separated options are read
                # from the parameters once and reused for later trees
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                phylip = WrapperPhylip.Phylip()
                phylip.setProgram(phylip_executable)
                phylip.setOptions(phylip_options)
                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                # a value of 0 means: normalize by the longest branch
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    options.stdlog.write(
                        "# Warning: tree %i: maximum branch length is zero - not normalized\n" % index)
                else:
                    # divide by the computed value; the original divided
                    # by options.value and so always failed for value 0
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n" %
                            (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:
                    with open(options.parameters[0], "r") as infile:
                        map_old2new = IOTools.ReadMap(infile, columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                # taxa without a new name are pruned from the tree
                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # consume only the pattern; the original deleted the
                    # whole parameter list attribute
                    del options.parameters[0]

                # taxa that do not match the pattern are handed to PruneTree
                taxa = [tree.node(n).data.taxon
                        for n in tree.get_terminals()
                        if not species2remove.search(tree.node(n).data.taxon)]

                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        # all taxa are always written, as in the original code
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one or more distance matrices from stdin and builds a tree for
    each with the PHYLIP program selected by ``--method``. Lines starting
    with ``#`` are dropped. A line starting with ``>`` begins a new matrix
    and is echoed before the corresponding tree unless
    ``--skip-separators`` is given. Without any ``>`` line the whole input
    is one matrix.

    NOTE(review): *argv* is accepted but never forwarded to ``E.Start``.
    """

    def parse_rows(matrix):
        """Split matrix lines (without the count line) into ids and string cells."""
        ids, rows = [], []
        for line in matrix[1:]:
            fields = re.split(r"\s+", line[:-1])
            ids.append(fields[0])
            rows.append([field.strip() for field in fields[1:]])
        return ids, rows

    def format_rows(ids, rows):
        """Rebuild matrix lines, count line first, from ids and cells."""
        formatted = ["%i\n" % len(rows)]
        for row_id, row in zip(ids, rows):
            formatted.append(row_id + " " + " ".join(row) + "\n")
        return formatted

    def fill_zeros(rows):
        """Replace off-diagonal "0" cells by one small random value per pair."""
        size = len(rows)
        for row in range(size - 1):
            for col in range(row + 1, size):
                if rows[row][col] == "0":
                    v = str(random.random() / 10000.0)
                    rows[row][col] = v
                    rows[col][row] = v

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: matrix2tree.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-i", "--invert-map", dest="invert_map", action="store_true",
                      help="""invert map.""")

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("phylip", "full"),
                      help="""input format.""")

    parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string",
                      help="""filename with tree to fit.""")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("nj", "kitsch", "fitch"),
                      help="""algorithm to run.""")

    parser.add_option("-e", "--replicates", dest="replicates", action="store_true",
                      help="replicates.")

    parser.add_option("-r", "--root", dest="root", action="store_true",
                      help="midpoint root (if it is not rooted).")

    parser.add_option("-u", "--unroot", dest="unroot", action="store_true",
                      help="unroot tree (if it is rooted).")

    parser.add_option("--skip-separators", dest="write_separators", action="store_false",
                      help="do not echo separators (starting with >)")

    # parser.add_option("-i", "--iterations", dest="iterations", type="int",
    #                   help="number of iterations." )

    parser.add_option("-p", "--power", dest="power", type="float",
                      help="power.")

    parser.add_option(
        "--prune-tree", dest="prune_tree", action="store_true",
        help="prune tree such to include only taxa which are part of the input matrix."
    )

    parser.add_option(
        "--add-random", dest="add_random", action="store_true",
        help="add small random value to off-diagonal zero elements in matrix.")

    parser.add_option(
        "--pseudo-replicates", dest="pseudo_replicates", action="store_true",
        help="add small random value to off-diagonal zero elements in matrix, even if they have no replicates."
    )

    parser.add_option("--debug", dest="debug", action="store_true",
                      help="dump debug information.")

    parser.set_defaults(
        value=0,
        method="nj",
        input_format="phylip",
        filename_tree=None,
        outgroup=None,
        replicates=False,
        root=False,
        unroot=False,
        power=0,
        write_separators=True,
        prune_tree=False,
        add_random=False,
        debug=False,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setPruneTree(options.prune_tree)

    # real lists are needed below (indexing, append), so no filter()/map()
    lines = [line for line in sys.stdin.readlines()
             if not line.startswith("#")]

    # positions of the separator lines; a sentinel marks the end of input
    chunks = [i for i, line in enumerate(lines) if line.startswith(">")]

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        matrix = lines[chunks[chunk] + 1:chunks[chunk + 1]]

        # parse phylip matrix
        if options.add_random:
            ids, mm = parse_rows(matrix)
            d = len(mm)

            if options.replicates:
                # with replicates each cell is a (value, replicates) pair
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc] == "0" and mm[row][cc + 1] != "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v
            else:
                fill_zeros(mm)

            matrix = format_rows(ids, mm)

        # parse phylip matrix
        if options.pseudo_replicates:
            ids, mm = parse_rows(matrix)
            d = len(mm)

            if options.replicates:
                # pairs without replicates get a random value and one
                # replicate, all others are set to 100 replicates
                for row in range(d - 1):
                    for col in range(row + 1, d):
                        cc = col * 2
                        rr = row * 2
                        if mm[row][cc + 1] == "0":
                            mm[row][cc + 1] = "1"
                            mm[col][rr + 1] = "1"
                            v = str(random.random() / 10000.0)
                            mm[row][cc] = v
                            mm[col][rr] = v
                        else:
                            mm[row][cc + 1] = "100"
                            mm[col][rr + 1] = "100"
            else:
                fill_zeros(mm)

            matrix = format_rows(ids, mm)

        phylip.setMatrix(matrix)
        phylip_options = []

        if options.filename_tree:
            with open(options.filename_tree, "r") as infile:
                nexus = TreeTools.Newick2Nexus(infile)
                ref_tree = nexus.trees[0]
            phylip.setTree(ref_tree)
            phylip_options.append("U")
        else:
            ref_tree = None

        if options.method == "nj":
            phylip.setProgram("neighbor")

        elif options.method == "fitch":
            phylip.setProgram("fitch")

        elif options.method == "kitsch":
            phylip.setProgram("kitsch")

        if options.replicates:
            phylip_options.append("S")

        if options.power > 0:
            phylip_options.append("P")
            phylip_options.append("%f" % options.power)

        phylip_options.append("Y")

        phylip.setOptions(phylip_options)

        result = phylip.run()

        # root with outgroup
        if options.root:
            if options.outgroup:
                pass
            # midpoint root
            else:
                for tree in result.mNexus.trees:
                    tree.root_midpoint()

        # explicitly unroot
        elif options.unroot:
            phylip.setOptions(("Y", "W", "U", "Q"))
            phylip.setProgram("retree")
            # a separate index is used so that the chunk counter needed
            # for the separator below is not overwritten
            for t in range(len(result.mNexus.trees)):
                phylip.setTree(result.mNexus.trees[t])
                xresult = phylip.run()
                result.mNexus.trees[t] = xresult.mNexus.trees[0]

        if options.write_separators:
            options.stdout.write(lines[chunks[chunk]])

        if result.mNexus:
            options.stdout.write(TreeTools.Nexus2Newick(result.mNexus) + "\n")

        if options.loglevel >= 1:
            if ref_tree:
                nref = len(ref_tree.get_terminals())
            else:
                nref = 0
            for tree in result.mNexus.trees:
                options.stdlog.write(
                    "# ninput=%i, nreference=%i, noutput=%i\n" %
                    (len(matrix) - 1, nref, len(tree.get_terminals())))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment (or, with ``--pairwise``, a sequence
    collection), determines the sequence pairs to compare and writes
    pairwise distances. Nucleotide alignments are handed to
    ``runBaseML``, ``runDNADIST`` or ``runXrate`` depending on
    ``--method`` and ``--distance``, or are computed in this script.
    Amino acid alignments use PHYLIP ``protdist`` or the PID/POVL
    calculations.

    Raises ValueError for an uneven number of sequences in pairwise
    mode, for an unknown ``--sites`` value and for a distance that the
    selected code path does not implement.

    NOTE(review): *argv* is accepted but never forwarded to ``E.Start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-s", "--sites", dest="sites", type="string",
        help="sites to use [default=%default].",
    )

    parser.add_option(
        "-f", "--file", dest="filename", type="string",
        help="filename of multiple alignment (- for stdin) [default=%default].",
        metavar="FILE")

    parser.add_option("-o", "--format", dest="format", type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d", "--distance", dest="distance", type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU",
                 "JTT", "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m", "--min-sites", dest="min_sites", type="int",
        help="minimum number of sites for output[default=%default].",
    )

    parser.add_option(
        "-a", "--alphabet", dest="alphabet", type="choice",
        choices=("aa", "na", "auto"),
        help="alphabet to use.",
    )

    parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha", dest="alpha", type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha", dest="fix_alpha", action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa", dest="kappa", type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump output.")

    parser.add_option("--test", dest="test", action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise", dest="pairwise", action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data", dest="clean_data", type="choice",
        choices=("0", "1"),
        help="PAML should cleanup data: 0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option(
        "--with-counts", dest="with_counts", action="store_true",
        help="output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w", "--write", dest="write", type="choice", action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern", dest="output_pattern", type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment", dest="xrate_min_increment", type="float",
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    try:
        # read multiple alignment
        if options.pairwise:
            # read sequences, but not as a multiple alignment. This permits
            # multiple names.
            mali = Mali.SequenceCollection()
            options.iteration = "pairwise"
        else:
            mali = Mali.Mali()

        mali.readFromFile(infile, format=options.input_format)

        ids = mali.getIdentifiers()

        if options.alphabet == "auto":
            # nucleotides if fewer than 10% of all characters remain once
            # a, c, g, t, x and n have been removed
            s = "".join(map(lambda x: x.mString, mali.values())).lower()
            ss = re.sub("[acgtxn]", "", s)
            if float(len(ss)) < (len(s) * 0.1):
                options.alphabet = "na"
                if mali.getNumColumns() % 3 == 0:
                    options.is_codons = True
            else:
                options.alphabet = "aa"

            if options.loglevel >= 1:
                options.stdlog.write("# autodetected alphabet: %s\n" % options.alphabet)
    finally:
        # stdin is left open; a file opened here is closed even on errors
        if options.filename != "-":
            infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    pairs = []
    if options.iteration == "all-vs-all":
        for x in range(len(ids) - 1):
            for y in range(x + 1, len(ids)):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise" % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and options.distance in ("F84", "K80", "JC69", "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        # the fourfold degenerate sites are only used to
                        # decide whether the pair is long enough
                        seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x], mali[id_y],
                                                                 position=3,
                                                                 degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        raise ValueError("unknown sites %s" % options.sites)

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])
                else:
                    # previously this left distance unbound
                    raise ValueError(
                        "distance %s is not implemented for method %s" %
                        (options.distance, options.method))

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y,
                                  options.format % distance,
                                  options.format % variance,
                                  info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura", "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            # number of menu key presses per model; JTT sends none.
            # The original compared against "PMG", which is not a valid
            # choice, so PMB silently ran without any key press.
            # NOTE(review): the key sent is "D" as for dnadist; confirm
            # that this is the key protdist uses to switch models.
            presses = {"PMB": 1, "PAM": 2, "Kimura": 3, "CategoriesModel": 4}
            phylip_options = ["D"] * presses.get(options.distance, 0)

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])

                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                else:
                    # previously this left distance unbound
                    raise ValueError(
                        "distance %s is not implemented for amino acid sequences" %
                        options.distance)

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y,
                         options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n" %
            (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a tab-separated table from stdin (first row: headers, first
    column: taxon) and, for every tree in ``--filename-tree``, runs
    PHYLIP ``contrast`` on it. Each ``--method`` then writes its own
    output:

    * ``contrasts``: the contrasts reported by PHYLIP.
    * ``pearson`` / ``spearman``: for each pair of columns the
      correlation reported by PHYLIP and the p-value of a linear
      regression through the origin fitted with R (both names run the
      same code).
    * ``compute``: contrasts computed in this script while walking the
      tree.

    Raises ValueError if no trees were supplied.

    NOTE(review): *argv* is accepted but never forwarded to ``E.Start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")

    parser.add_option("-t", "--filename-tree", dest="filename_tree", type="string",
                      help="filename with tree(s).")

    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--write-header", dest="write_header", action="store_true",
                      help="write header and exit.")

    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")

    parser.add_option("--display-tree", dest="display_tree", action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # convert 1-based column numbers to 0-based indices
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        with open(options.filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    # main loop
    ##########################################################
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                # imported here so that rpy is only required for these
                # methods, and only once instead of once per column pair
                import rpy
                from rpy import r as R

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # contrasts per column
                columns = []
                for c in range(ncolumns):
                    columns.append([row[c] for row in result.mContrasts])

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        # Various ways to calculate r. It is not possible to use
                        # cor.test or lsfit directly, as you have to perform a
                        # regression through the origin.

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(R("y ~ x - 1"),
                                            data=R.data_frame(x=columns[x],
                                                              y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p,
                             code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    # rows end with a plain newline; the original wrote
                    # "\n " and so indented every following row
                    options.stdout.write(
                        "\t".join([options.value_format % x for x in d]) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2

                variances = [None] * max_index
                # one slot per data column for each node (the original
                # sized the rows by the number of data rows)
                values = [[None] * ncolumns for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):
                    """store value, contrast and branch length of a node with children c1 and c2."""
                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # square root of the summed child branch lengths,
                    # used to normalize the contrast
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(node_id, node.data.branchlength, c1, c2)
                        else:
                            # a trifurcation is only accepted at the root;
                            # its third child is paired with the root
                            # under the dummy node
                            assert (node_id == tree.root)
                            assert (len(node.succ) == 3)
                            update_data(node_id, node.data.branchlength,
                                        node.succ[0], node.succ[1])
                            update_data(max_index - 1, node.data.branchlength,
                                        node_id, node.succ[2])
                    else:
                        # leaf: take the values from the input table
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])
                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" % "\t".join(headers))
                for node_id in range(max_index):
                    # leaves and unused ids have no contrast
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join([options.value_format % x
                                   for x in contrasts[node_id]]),
                    ))

    E.Stop()