def run(self, grammar, tree=None, dump=0, test=False, options=None):
    """Run the external executable on *grammar* and return the parsed output.

    The grammar (and, if given, the tree) is written into a fresh temporary
    directory, the executable is started inside that directory and its
    standard output is passed line by line to ``self.parseOutput``.

    Arguments
    ---------
    grammar
        object providing ``getGrammar()``; the result is written verbatim
        to the grammar file.
    tree
        optional. A string that starts with ``(`` and ends with ``)`` or
        ``;`` is written as is; any other string is taken as the name of a
        file with a tree, which is read and re-written in newick format.
        For a non-string value the tree file is created but left empty.
    dump
        if true, echo the standard output of the executable.
    test
        if true, report the temporary directory and do not remove it.
    options
        unused; only kept so that existing callers continue to work.

    Returns
    -------
    whatever ``self.parseOutput`` returns.

    Raises
    ------
    UsageError
        if the executable finishes with a non-zero exit code. The temporary
        directory is kept in this case and named in the message.
    """
    self.mTempdir = tempfile.mkdtemp()
    self.mFilenameGrammar = "grammar.eg"
    self.mFilenameTree = "tree.nh"
    self.mFilenameOutput = None
    self.mWarnings = []

    if test:
        print("# temporary directory is %s" % self.mTempdir)

    # ``with`` guarantees that the handles are released even if writing fails
    with open(self.mTempdir + "/" + self.mFilenameGrammar, "w") as outfile:
        outfile.write(grammar.getGrammar())

    if tree:
        with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
            # check what kind of tree is given.
            if isinstance(tree, StringType):
                t = tree.strip()
                # startswith/endswith do not fail on a whitespace-only string
                if t.startswith("(") and t.endswith((")", ";")):
                    outfile.write("%s\n" % t)
                else:
                    with open(tree, "r") as infile:
                        nexus = TreeTools.Newick2Nexus(infile)
                        t = nexus.trees[0]
                    outfile.write("%s\n" % TreeTools.Tree2Newick(t))

    # use your own random seed. Time won't do, if simgram
    # is called in quick succession.
    # Are there any restrictions on seeds? Ian using an even number.
    statement = "%s -rndseed %i -g %s -t %s" % (
        self.mExecutable,
        random.randint(0, 4294967296),
        self.mFilenameGrammar,
        self.mFilenameTree)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         cwd=self.mTempdir,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        # the temporary directory is deliberately left behind for inspection
        raise UsageError(
            "Error in running %s \n%s\n%s\nTemporary directory in %s" % (
                self.mExecutable, err, out, self.mTempdir))

    if dump:
        print("# stdout output of %s:\n%s\n######################################" % (
            self.mExecutable, out))

    if not test:
        shutil.rmtree(self.mTempdir)

    return self.parseOutput(out.split("\n"))
def main(argv=None):
    """script main.

    Reads trees from stdin and writes one line per taxon to
    ``options.stdout``. With a single input tree only the taxon is written;
    with several trees the running tree number is added and, unless
    ``--skip-trees`` is given, the name of the tree as well.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2taxa.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--skip-trees", dest="skip_trees", action="store_true",
        help="do not output tree names in third field [default=%default].")

    parser.set_defaults(skip_trees=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    ntotal = len(nexus.trees)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees from stdin.\n" % ntotal)

    # the header depends on the number of columns that will follow
    if ntotal == 1:
        header = "taxon\n"
    elif options.skip_trees:
        header = "taxon\ttree\n"
    else:
        header = "taxon\ttree\tname\n"
    options.stdout.write(header)

    # trees are numbered starting from 1
    for ntree, tree in enumerate(nexus.trees, 1):
        taxa = TreeTools.GetTaxa(tree)
        if ntotal == 1:
            for taxon in taxa:
                options.stdout.write("%s\n" % (taxon))
        elif options.skip_trees:
            for taxon in taxa:
                options.stdout.write("%s\t%i\n" % (taxon, ntree))
        else:
            for taxon in taxa:
                options.stdout.write("%s\t%i\t%s\n" % (taxon, ntree, tree.name))

    if options.loglevel >= 1:
        options.stdlog.write("# ntotal=%i\n" % (ntotal))

    E.Stop()
def main(argv=None):
    """script main.

    Reads a reference tree from the file given with ``--reference`` and
    sample trees from stdin. Each sample tree is checked with
    ``TreeTools.IsCompatible`` against the reference tree and a one-line
    summary of the counts (total, compatible, failed and failures by
    reason) is printed.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        if no reference tree has been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/evaluate_trees.py 2781 2009-09-10 11:33:14Z andreas $")

    # NOTE(review): the long option is registered with a trailing "=";
    # left untouched here as it is part of the command line interface.
    parser.add_option("-r", "--reference=", dest="filename_reference_tree",
                      help="filename with reference tree.", type="string")

    parser.set_defaults(filename_reference_tree=None)

    (options, args) = E.Start(parser)

    # previously only a message was printed and the script then failed
    # when trying to open ``None``.
    if not options.filename_reference_tree:
        raise ValueError("please supply reference tree.")

    if options.loglevel >= 1:
        print("# reading reference tree.")

    with open(options.filename_reference_tree, "r") as infile:
        nexus = TreeTools.Newick2Nexus(infile)
        reference_tree = nexus.trees[0]

    if options.loglevel >= 1:
        print("# reading sample trees.")

    nexus2 = TreeTools.Newick2Nexus(sys.stdin)

    ntotal, nok, nfailed = 0, 0, 0
    ntopology, ntaxa, nleaves = 0, 0, 0

    for t in nexus2.trees:
        ntotal += 1
        is_ok, reason = TreeTools.IsCompatible(reference_tree, t)
        if is_ok:
            nok += 1
            continue

        nfailed += 1
        if reason == "topology":
            ntopology += 1
        elif reason == "taxa":
            ntaxa += 1
        elif reason == "leaves":
            nleaves += 1

    print("# total=%i, compatible=%i, failed=%i, topology=%i, taxa=%i, leaves=%i" %
          (ntotal, nok, nfailed, ntopology, ntaxa, nleaves))

    E.Stop()
def main(argv=None):
    """script main.

    Parses the reference tree given with ``--reference-tree``, determines
    the output order of the OTUs (``--sort-order``, or the terminals of the
    reference tree if no order is given) and prints the patterns returned by
    ``TreeTools.calculatePatternsFromTree``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        if no reference tree has been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree", type="string",
                      help="reference tree to read.")

    parser.add_option("-s", "--sort-order", dest="sort_order", type="string",
                      help="output order of OTU.")

    parser.set_defaults(
        reference_tree=None,
        sort_order=[],
    )

    (options, args) = E.Start(parser)

    # The reference tree has to be parsed first: the default sort order is
    # derived from it (it used to be accessed before it was assigned).
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    if options.loglevel >= 3:
        print("# reference tree:")
        print(reference_tree.display())

    # NOTE(review): ``tree`` is not assigned anywhere in this function; it
    # has to exist at module level, otherwise this raises a NameError.
    # Possibly ``reference_tree`` was meant - confirm before changing.
    patterns = TreeTools.calculatePatternsFromTree(tree, options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
def WriteTree(self, tree):
    """write tree to file.

    *tree* is parsed with ``TreeTools.Newick2Nexus`` and the first tree is
    used. Its taxa are renamed via ``self.mMapOld2New`` before a header line
    (``self.mNumSequences`` followed by ``1``) and the tree in newick format
    are written to ``self.mFilenameTree`` inside ``self.mTempdir``.
    """
    nexus = TreeTools.Newick2Nexus(tree)
    t = nexus.trees[0]
    TreeTools.MapTaxa(t, self.mMapOld2New)

    # ``with`` closes the file even if formatting or serialising fails
    with open(self.mTempdir + "/" + self.mFilenameTree, "w") as outfile:
        outfile.write("%i 1\n" % self.mNumSequences)
        outfile.write("%s\n" % TreeTools.Tree2Newick(t))
def processChunk(lines, map_strain2species, options):
    """Merge strains into species in all trees contained in *lines*.

    Every tree parsed from *lines* is passed through
    ``getSpeciesTreeMergers`` / ``applySpeciesTreeMergers``. Trees left with
    at most one terminal are counted as skipped, all others are written to
    ``options.stdout`` in ``options.output_format``.

    The module level counters ``ninput``, ``noutput``, ``nskipped`` and
    ``nmerged`` are updated.
    """
    global ninput, noutput, nskipped, nmerged

    verbose = options.loglevel >= 3

    for tree in TreeTools.Newick2Nexus(lines).trees:
        ninput += 1

        if verbose:
            tree.display()

        mergers = getSpeciesTreeMergers(tree, map_strain2species, options)

        if verbose:
            options.stdlog.write(
                "# found %i nodes in the tree that will be merged.\n" %
                (len(mergers)))

        if len(mergers) > 0:
            nmerged += 1

        applySpeciesTreeMergers(tree, mergers, map_strain2species, options)

        # nothing worth writing if the tree collapsed to a single leaf
        if len(tree.get_terminals()) > 1:
            tree.writeToFile(options.stdout, format=options.output_format)
            noutput += 1
        else:
            nskipped += 1
def trainMali(mali, options):
    """train a grammar on a multiple alignment.

    Visible steps: optionally clean the alignment, prepare the input
    grammar, shorten identifiers to the part before ``options.separator``,
    map the identifiers and - if ``options.input_filename_tree`` is set -
    read a tree and relabel it with the mapped identifiers.

    Raises
    ------
    KeyError
        if the names in the tree and in the alignment are not congruent.
    """
    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=1)

    length = mali.getNumColumns()

    input_model = prepareGrammar(options)

    # ``identifier`` instead of ``id`` to not shadow the builtin
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    ids = mali.getIdentifiers()

    if options.input_filename_tree:
        # close the tree file as soon as it has been parsed
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        try:
            tree.relabel(map_old2new, warn=True)
        except KeyError as msg:
            raise KeyError(
                "names in mali and tree are not congruent: %s" % msg)
def plotLines(self):
    """Add the branch elements of the tree.

    The tree is walked depth first from the root. For every node other than
    the root whose start and end widths differ, the elements of a
    horizontal branch are added; for each successor of a node the elements
    of a vertical branch from the node's height to the successor's height
    are added. All coordinates are offset by the header width/height.
    """

    def add_branches(node_id):
        x_start = self.mNodeWidthsStart[node_id]
        x_end = self.mNodeWidthsEnd[node_id]
        y = self.mNodeHeights[node_id]
        children = self.mTree.node(node_id).succ

        if x_end != x_start and node_id != self.mTree.root:
            x_from = self.getHeaderWidth() + x_start
            x_to = self.getHeaderWidth() + x_end
            y_at = self.getHeaderHeight() + y
            self.addElements(
                self.mDecoratorHorizontalBranches.getElements(
                    node_id, x_from, x_to, y_at))

        for child in children:
            y_child = self.mNodeHeights[child]
            x_at = self.getHeaderWidth() + x_end
            y_from = self.getHeaderHeight() + y
            y_to = self.getHeaderHeight() + y_child
            self.addElements(
                self.mDecoratorVerticalBranches.getElements(
                    node_id, x_at, y_from, y_to))

    # plot tree in dfs manner
    TreeTools.TreeDFS(self.mTree, self.mTree.root, pre_function=add_branches)
def main(argv=None):
    """script main.

    Reads a tree from stdin (lines starting with ``#`` are ignored), and
    prints the result of ``TreeGraph.Run`` for the first tree.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2plot.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.set_defaults()

    (options, args) = E.Start(parser, add_pipe_options=True)

    # drop comment lines before handing the input to the parser
    tree_lines = [line for line in sys.stdin.readlines() if line[0] != "#"]
    input_tree = TreeTools.Newick2Nexus(tree_lines).trees[0]

    plotter = TreeGraph(support=None, loglevel=options.loglevel)
    print(plotter.Run(input_tree))

    E.Stop()
def TranslateNode(node, tree, terminals, options):
    """Return the label to use for *node*.

    If ``options.do_translate`` is set, the label is the sorted list of
    leaves below *node* (``TreeTools.GetLeaves``) joined by
    ``options.separator``. Otherwise it is the taxon of the node if the node
    is listed in *terminals*, and the string representation of *node* for
    all other nodes.
    """
    if not options.do_translate:
        if node in terminals:
            return tree.node(node).data.taxon
        return str(node)

    leaves = list(TreeTools.GetLeaves(tree, node))
    leaves.sort()
    return options.separator.join(leaves)
def rerootTree(gene_tree, extract_species, options):
    """Reroot *gene_tree* in place with its outgroup taxa.

    Outgroup taxa are those taxa of the tree for which
    ``extract_species(taxon)`` is contained in ``options.outgroup_species``.
    If ``gene_tree.is_monophyletic`` accepts them, the tree is rooted with
    all of them, otherwise with the first one only. If the tree contains no
    outgroup taxon it is left unchanged.

    Raises
    ------
    ValueError
        if an AttributeError occurs while selecting the outgroup taxa.
    """
    otus = TreeTools.GetTaxa(gene_tree)

    # find monophyletic trees of outgroup_species
    try:
        outgroup_taxa = [
            x for x in otus
            if extract_species(x) in options.outgroup_species]
    except AttributeError:
        # string exceptions are not supported any more; raise a real one
        raise ValueError("error while rerooting tree in tree %s with %s" % (
            gene_tree.name, str(otus)))

    if not outgroup_taxa:
        # nothing to root with; previously this could fail with an
        # IndexError on ``outgroup_taxa[0]``
        r = []
    # NOTE(review): Biopython's Tree.is_monophyletic returns a node id or -1
    # rather than a boolean. If gene_tree uses that implementation, this
    # truth test is wrong - confirm against the tree class in use.
    elif gene_tree.is_monophyletic(outgroup_taxa):
        r = outgroup_taxa
    else:
        r = [outgroup_taxa[0], ]

    if r:
        if options.loglevel >= 1:
            options.stdlog.write("# tree %s: rerooting with %i outgroups: %s.\n" % (
                gene_tree.name, len(r), ",".join(r)))
            options.stdlog.flush()
        gene_tree.root_with_outgroup(r)
    else:
        # do what the message says: leave the tree alone
        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: no outgroup found, tree will not be rerooted.\n" %
                gene_tree.name)
            options.stdlog.flush()

    if options.loglevel >= 5:
        gene_tree.display()
def parseOutput(self, lines, out, err):
    """Build a ``Result`` from the output of a run.

    *lines* are concatenated, all whitespace and all bracketed sections
    (``[...]``) are removed and the remainder is parsed as a tree. The taxa
    of the first tree are mapped with ``self.mMapNew2Old``.

    Returns a ``Result`` with the tree in ``mTree`` and *out* / *err* in
    ``mLog`` / ``mErr``.
    """
    text = "".join(lines)
    text = re.sub(r"\s", "", text)
    # remove everything within square brackets
    text = re.sub(r"\[[^\]]+\]", "", text)

    nexus = TreeTools.Newick2Nexus(text)
    result = Result()

    tree = nexus.trees[0]
    TreeTools.MapTaxa(tree, self.mMapNew2Old)

    result.mTree = tree
    result.mLog = out
    result.mErr = err
    return result
def filterTree(tree, options, map_id2location=None):
    """Remove taxa at junk locations from *tree* in place.

    If ``options.remove_unplaced`` is set, every taxon whose location
    (lower-cased ``mShortName`` of its entry in *map_id2location*) is
    listed in ``MAP_CONTIG2JUNK`` is removed. Taxa without an entry in
    *map_id2location* are kept and reported as a warning. If
    *map_id2location* is ``None`` no location is known, so all taxa are
    reported and none is removed.

    The tree is only pruned if at least one taxon is dropped.
    """
    otus = TreeTools.GetTaxa(tree)

    to_remove = set()
    if options.remove_unplaced:
        # the default of None used to fail in the membership test below
        locations = {} if map_id2location is None else map_id2location

        for otu in otus:
            if otu not in locations:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: unknown location for id %s.\n" % otu)
                continue

            if locations[otu].mShortName.lower() in MAP_CONTIG2JUNK:
                to_remove.add(otu)

        if options.loglevel >= 3:
            options.stdlog.write(
                "# tree %s: removing %i entries because of location: %s\n" %
                (tree.name, len(to_remove), ";".join(to_remove)))

    new_otus = list(set(otus).difference(to_remove))

    if len(new_otus) != len(otus):
        TreeTools.PruneTree(tree, new_otus, keep_distance_to_root=True)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# tree %s: filtering: before=%i, remove=%i, after=%i, final=%i\n" %
            (tree.name, len(otus), len(to_remove), len(new_otus),
             len(TreeTools.GetTaxa(tree))))
        options.stdlog.flush()
def getBestTree(trees, method="select-largest"):
    """select best tree out of a set of trees.

    The only method implemented is ``select-largest``: the tree with the
    largest number of taxa is returned; if several trees share the largest
    size, the last one of them in *trees* wins.

    At a log level of 3 or above (module level ``options``) the trees that
    were not selected are written to ``options.stdlog``.

    Raises
    ------
    ValueError
        if *method* is not known (previously this surfaced as a NameError).
    IndexError
        if *trees* is empty.
    """
    if method != "select-largest":
        raise ValueError(
            "unknown method '%s' for selecting the best tree" % method)

    # sort by (size, position) so that ties are resolved by position
    sizes = sorted((len(tree.get_taxa()), index)
                   for index, tree in enumerate(trees))
    best_tree = sizes[-1][1]

    if options.loglevel >= 3:
        for index, tree in enumerate(trees):
            if index == best_tree:
                continue
            options.stdlog.write(
                "# skipped tree: %s: %s\n" %
                (tree.name, TreeTools.Tree2Newick(tree)))

    return trees[best_tree]
def main(argv=None):
    """script main.

    Parses the reference tree given with ``--reference-tree``, determines
    the output order of the OTUs (``--sort-order``, or the terminals of the
    reference tree if no order is given) and prints the patterns returned by
    ``TreeTools.calculatePatternsFromTree``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        if no reference tree has been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: tree2patterns.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree", type="string",
                      help="reference tree to read.")

    parser.add_option("-s", "--sort-order", dest="sort_order", type="string",
                      help="output order of OTU.")

    parser.set_defaults(
        reference_tree=None,
        sort_order=[],
    )

    (options, args) = E.Start(parser)

    # The reference tree has to be parsed first: the default sort order is
    # derived from it (it used to be accessed before it was assigned).
    if not options.reference_tree:
        raise ValueError("no reference tree defined.")

    nexus = TreeTools.Newick2Nexus(options.reference_tree)
    reference_tree = nexus.trees[0]

    if options.sort_order:
        options.sort_order = options.sort_order.split(",")
    else:
        options.sort_order = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    if options.loglevel >= 3:
        print("# reference tree:")
        print(reference_tree.display())

    # NOTE(review): ``tree`` is not assigned anywhere in this function; it
    # has to exist at module level, otherwise this raises a NameError.
    # Possibly ``reference_tree`` was meant - confirm before changing.
    patterns = TreeTools.calculatePatternsFromTree(tree, options.sort_order)

    for p in patterns:
        print(p)

    E.Stop()
def ParseTree(reference_tree, rx_species):
    """Parse *reference_tree* and number its taxa.

    Returns a tuple of the first parsed tree and a dictionary mapping each
    terminal taxon to a number in order of appearance. The additional key
    ``unknown`` receives the next free number. *rx_species* is not used.

    Output is controlled by the module level ``param_loglevel``.
    """
    parsed_tree = TreeTools.Newick2Nexus(reference_tree).trees[0]

    if param_loglevel >= 3:
        print("# reference tree:")
        parsed_tree.display()

    map_taxon2id = {}
    for node_id in parsed_tree.get_terminals():
        taxon = parsed_tree.node(node_id).get_data().taxon
        # ids are assigned densely, based on the current size of the map
        map_taxon2id[taxon] = len(map_taxon2id)
        if param_loglevel >= 2:
            print("# %s\t%i" % (taxon, map_taxon2id[taxon]))

    map_taxon2id["unknown"] = len(map_taxon2id)

    return parsed_tree, map_taxon2id
def testGetMergers(self):
    """Check that every merger found is part of the reference.

    A merger matches if the pair of (strain, gene) tuples is listed in the
    reference in either orientation.

    TODO: add testing for transcripts
    """
    print("testGetMergers()")

    for lines, reference, map_strain2species, options in self.mTestData:
        tree = TreeTools.Newick2Nexus(lines).trees[0]
        mergers = tree_strain2species.getMergers(
            tree, map_strain2species, options)

        for node_id, species, strain_x, gene_x, strain_y, gene_y in mergers:
            first = (strain_x, gene_x)
            second = (strain_y, gene_y)
            forward = (first, second)
            backward = (second, first)
            if forward in reference or backward in reference:
                continue
            self.fail("%s not in reference %s" %
                      (str(forward), str(reference)))
def processChunk(lines, map_strain2species, options):
    """Merge genes of strains in all trees contained in *lines*.

    Every tree parsed from *lines* is passed through ``getMergers`` /
    ``applyMergers``. Trees left with at most one terminal are counted as
    skipped. For all other trees the mapping of (strain, gene) to the new
    name is written to ``output_genes`` and the tree itself to
    ``options.stdout`` in ``options.output_format``.

    A warning is logged if a (strain, gene) pair has been seen before.

    The module level counters ``ninput``, ``noutput``, ``nskipped``,
    ``nmerged`` and ``nwarnings`` are updated; ``counters``, ``merged`` and
    ``output_genes`` are read from module level as well.
    """
    # nwarnings is assigned below and thus has to be declared global, too
    global ninput, noutput, nskipped, nmerged, nwarnings

    nexus = TreeTools.Newick2Nexus(lines)

    for tree in nexus.trees:
        ninput += 1

        if options.loglevel >= 3:
            tree.display()

        mergers = getMergers(tree, map_strain2species, options)

        if options.loglevel >= 3:
            options.stdlog.write(
                "# found %i pairs of genes that will be merged.\n" %
                (len(mergers)))

        if len(mergers) > 0:
            nmerged += 1

        n = applyMergers(tree, mergers, counters, map_strain2species, options)

        if len(tree.get_terminals()) <= 1:
            nskipped += 1
            continue

        for new_name, values in n.items():
            for strain, gene in values:
                if (strain, gene) in merged:
                    # the message needs three values; the tree of the first
                    # occurrence is the one remembered in ``merged``
                    options.stdlog.write(
                        "# warning: strain %s and gene %s already appeared in tree %s\n" %
                        (strain, gene, merged[(strain, gene)]))
                    nwarnings += 1
                # remember the tree so that the warning above can name it
                merged[(strain, gene)] = tree.name
                output_genes.write("%s\t%s\n" % (
                    options.separator.join((strain, gene)), new_name))

        tree.writeToFile(options.stdout, format=options.output_format)
        noutput += 1
def GetPrunedReferenceTree(mask, present_orgs, reference_tree):
    """Return a freshly parsed copy of *reference_tree* restricted to *present_orgs*.

    The tree is parsed anew on every call because it is modified here.
    Every terminal whose taxon is not contained in *present_orgs* is removed
    with ``Prune``. *mask* is not used.
    """
    # reread and process species tree: has to be done for every new pass,
    # because the tree is modified later on.
    pruned_tree = TreeTools.Newick2Nexus(reference_tree).trees[0]

    # keep only those taxa, which are present in the cluster.
    for node_id in pruned_tree.get_terminals():
        taxon = pruned_tree.node(node_id).get_data().taxon
        if taxon in present_orgs:
            continue
        Prune(pruned_tree, taxon)

    if param_loglevel >= 3:
        print("# pruned reference tree for %s:" % (",".join(present_orgs.keys())))
        pruned_tree.display()

    return pruned_tree
sort_order = [], ) (options, args) = E.Start( parser ) if not options.sort_order: for nx in reference_tree.get_terminals(): options.sort_order.append( reference_tree.node(nx).get_data().taxon ) else: options.sort_order = options.sort_order.split(",") if not options.reference_tree: raise "no reference tree defined." nexus = TreeTools.Newick2Nexus( options.reference_tree ) reference_tree = nexus.trees[0] if options.loglevel >= 3: print "# reference tree:" print reference_tree.display() patterns = TreeTools.calculatePatternsFromTree( tree, options.sort_order ) for p in patterns: print p E.Stop()
def main(argv=None):
    """script main.

    Reads a tab-separated table of duplications from stdin (lines starting
    with ``#`` are ignored) and summarises it according to ``--method``:

    counts, medians
        matrix of species1 x species2 with the number / median of heights.
    lists, lists-union
        genes (taxa of the trees) per species pair / per species.
    hists, fit-decay
        histograms of heights per species / parameters returned by ``fit``.
    pairs
        one line per input row, grouped by species pair.
    links
        one tree per cluster and species pair.

    Output goes to stdout or, if ``--filename-output`` is given, to files
    whose names are built from that pattern.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        if no species have been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use.")

    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="prefix to use for temporary files.")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [counts|lists|hists|links].")

    parser.add_option("-o", "--filename-output", dest="filename_output", type="string",
                      help="output filename.")

    parser.add_option("-f", "--functions", dest="functions", type="string",
                      help="functions to grep [functional|pseudo|all].")

    parser.add_option("-l", "--locations", dest="locations", type="string",
                      help="locations to grep [local|nojunk|all|...].")

    # parsed as float: the default is a float and the value is used as the
    # increment of the histogram
    parser.add_option("-b", "--bin-size", dest="bin_size", type="float",
                      help="bin size.")

    parser.add_option("-i", "--fit", dest="fit", type="string",
                      help="fitting method [decay|power]")

    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")

    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")

    parser.add_option("--use-relative-height", dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")

    parser.add_option(
        "--reverse", dest="reverse", action="store_true",
        help="reverse species. \n"
        "Histograms will show the age of duplications for duplicates in other genomes.")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    # Check before splitting: "".split(",") yields [""] so that a test on
    # the length of the list can never detect a missing option.
    if not options.species:
        raise ValueError("please supply list of species.")

    options.species = options.species.split(",")
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    # not used below, but establishes the database connection as before
    dbhandle = pgdb.connect(options.psql_connection)

    input_data = [line[:-1].split("\t")
                  for line in sys.stdin.readlines()
                  if line[0] != "#"]

    # remove header
    if options.header and input_data:
        del input_data[0]

    # decide which columns to take
    # 1st column: species1: this is the species in which duplications have occured.
    # 2nd column: species2: this is the species with respect to which duplications occured.
    # 3rd column: clusterid
    # 4th column: chromosomes
    # 5th column: function
    # 6th column: height
    # 7th column: relative height
    # 8th column: locations
    # 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    for x in range(len(input_data)):
        input_data[x] = tuple([input_data[x][y] for y in take])

    map_pos2species = []
    map_species2pos = {}
    for x, species in enumerate(options.species):
        map_species2pos[species] = x
        map_pos2species.append(species)

    nspecies = len(options.species)
    outfile = None

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
        elif options.method == "medians":
            func = numpy.median

        for location in options.locations:
            for function in options.functions:

                # numpy.float64 replaces the obsolete numpy.Float alias
                matrix = numpy.zeros((nspecies, nspecies), numpy.float64)

                data = GetSubset(input_data, location, function)

                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None
                values = []
                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:
                        if len(values) > 0:
                            matrix[map_species2pos[last_species1],
                                   map_species2pos[last_species2]] = func(values)
                        values = []
                        last_species1 = species1
                        last_species2 = species2

                    values.append(float(height))

                if len(values) > 0:
                    matrix[map_species2pos[last_species1],
                           map_species2pos[last_species2]] = func(values)

                if options.filename_output:
                    params = {"f": function, "l": location}
                    outfile = open(options.filename_output % params, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                if options.method == "medians":
                    fmt = "%6.4f"
                elif options.method == "counts":
                    fmt = "%i"

                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=fmt,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        # write lists of duplicated genes in species1 as compared to species2
        # according to location/function
        # First field : gene name
        # Second field: cluster id
        # Third field : number of other genes in cluster
        # Fourth field: location of gene

        written = {}
        for location in options.locations:
            for function in options.functions:

                data = GetSubset(input_data, location, function)

                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # write trees per cluster
                        if options.filename_output:
                            if options.method == "lists":
                                if outfile:
                                    outfile.close()
                                params = {"f": function, "l": location,
                                          "s": species1, "o": species2}
                                written = {}
                                outfile = open(
                                    options.filename_output % params, "w")
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    if outfile:
                                        outfile.close()
                                    params = {"f": function, "l": location,
                                              "s": species1}
                                    written = {}
                                    outfile = open(
                                        options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            if options.method == "lists":
                                outfile.write(
                                    "location: %s, function: %s, species1: %s, species2: %s\n" %
                                    (location, function, species1, species2))
                                written = {}
                            elif options.method == "lists-union":
                                if last_species1 != species1:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n" %
                                        (location, function, species1))
                                    written = {}

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written[t] = 1

        # the file opened last is not closed inside the loop
        if options.filename_output and outfile:
            outfile.close()

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:
            for function in options.functions:

                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)

                data.sort()

                ###############################################################
                # convert to matrix of list
                # values[x][y] contains heights of duplications in species x
                # with reference to y
                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    try:
                        values[map_species2pos[species1]][
                            map_species2pos[species2]].append(float(height))
                    except KeyError:
                        # rows for species that were not requested are ignored
                        continue

                ###############################################################
                # calculate histograms per species
                ###############################################################
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        params = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % params, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x in range(nspecies):

                        if options.reverse:
                            # duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            # duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if len(vv) == 0:
                            continue

                        headers.append(options.species[x])
                        h = Histogram.Calculate(vv,
                                                increment=options.bin_size,
                                                min_value=options.min_value,
                                                max_value=options.max_value,
                                                no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n" % (
                                        "function",
                                        s,
                                        options.species[x],
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")

                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        # get branches with 0 branchlength
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()
                last_species1, last_species2, last_cluster_id = None, None, None

                for species1, species2, cluster_id, l, f, height, locations, tree in data:

                    if last_species1 != species1 or last_species2 != species2:

                        # write trees per cluster
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            params = {"f": function, "l": location,
                                      "s": species1, "o": species2}
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2
                        last_cluster_id = None

                    # NOTE(review): the cluster is tracked, but nothing is
                    # done when it changes - looks like unfinished code.
                    if last_cluster_id != cluster_id:
                        last_cluster_id = cluster_id

                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

        # the file opened last is not closed inside the loop
        if options.filename_output and outfile:
            outfile.close()

    elif options.method == "links":

        # write a tree for each species pair:
        # each node is a gene+location, the weight of the vertex is the height
        # further info added: cluster_id for the duplication
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write("with %i data points\n" % len(data))
                    options.stdlog.flush()

                # stores duplications within first species as compared to
                # second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                for species1, species2, cluster_id, l, f, height, locations, tree in data:
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("# processing species %s\n" % s)

                    for x in range(nspecies):

                        if map_pos2species[x] == s:
                            continue

                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        # write trees per cluster
                        if options.filename_output:
                            params = {"f": function, "l": location,
                                      "s": s, "o": map_pos2species[x]}
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, s, map_pos2species[x]))

                        # only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    E.Stop()
def main(argv=None):
    """script main.

    Reads a reference tree (``--reference-tree``, either a newick string
    starting with ``(`` or a filename) and gene trees from stdin. The
    reference tree is pruned to the species found in the gene trees and
    ``writeOrthologSets`` is called once for every requested method.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        if no reference tree is given, if method ``outgroup`` is chosen
        without ``--outgroups`` or if species can not be extracted from the
        taxa of a tree.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2sets.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree", type="string",
                      help="reference tree to read.")

    parser.add_option("-e", "--enumeration", dest="enumeration", type="choice",
                      choices=("monophyletic", "full", "pairwise",
                               "exhaustive", "explicit", "lineage"),
                      help="enumeration of ortholog groups.")

    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")

    parser.add_option("-p", "--filename-patterns", dest="filename_patterns", type="string",
                      help="filename with patterns to output.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary", type="string",
                      help="filename with summary to output.")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=("strict", "degenerate", "any",
                               "outgroup", "lineage"),
                      help="sets to extract.")

    parser.add_option("-s", "--species-set", dest="species_set", type="string",
                      help="comma separated list of species.")

    parser.add_option("-g", "--outgroups", dest="outgroups", type="string",
                      help="comma separated list of outgroup species.")

    parser.add_option(
        "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--reroot", dest="reroot", type="choice",
                      choices=("outgroup", "midpoint"),
                      help="reroot trees before computing sets.")

    parser.set_defaults(
        reference_tree=None,
        enumeration="full",
        column2org=None,
        separator="|",
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_summary=None,
        methods=[],
        species_set=None,
        outgroups=None,
        reroot=None,
    )

    (options, args) = E.Start(parser)

    if len(options.methods) == 0:
        options.methods.append("strict")

    if options.species_set:
        options.species_set = options.species_set.split(",")
        options.enumeration = "explicit"

    #######################################################################
    # warning: outgroup method is useless, as it requires
    # only a single outgroup per tree and the tree rooted
    # with the outgroup.
    if "outgroup" in options.methods and not options.outgroups:
        raise ValueError(
            "please supply --outgroups if method 'outgroup' is chosen.")

    if options.outgroups:
        options.outgroups = options.outgroups.split(",")

    #######################################################################
    # read the reference tree
    if not options.reference_tree:
        raise ValueError("please supply a reference tree")

    if options.reference_tree[0] == "(":
        nexus = TreeTools.Newick2Nexus(options.reference_tree)
        reference_tree = nexus.trees[0]
    else:
        # close the file as soon as the tree has been parsed
        with open(options.reference_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            reference_tree = nexus.trees[0]

    if options.loglevel >= 3:
        options.stdlog.write("# reference tree:\n%s\n" %
                             reference_tree.display())

    #######################################################################
    # read all trees
    nexus = TreeTools.Newick2Nexus(sys.stdin)

    #######################################################################
    # sort out reference tree
    # The compiled expressions are not used below; compiling them here
    # still rejects malformed expressions early.
    rs = re.compile(options.species_regex)
    rg = re.compile(options.gene_regex)

    extract_species = lambda x: parseIdentifier(x, options)[0]
    extract_gene = lambda x: parseIdentifier(x, options)[2]

    # prune reference tree to species present
    species_set = set()
    for tree in nexus.trees:
        try:
            species_set.update([extract_species(x) for x in tree.get_taxa()])
        except AttributeError:
            raise ValueError(
                "parsing error while extracting species from %s" %
                str(tree.get_taxa()))

    TreeTools.PruneTree(reference_tree, species_set)

    if options.loglevel >= 1:
        options.stdlog.write("# reference tree after pruning has %i taxa.\n" %
                             len(reference_tree.get_taxa()))

    if options.column2org:
        options.column2org = options.column2org.split(",")
    elif reference_tree:
        options.column2org = [
            reference_tree.node(nx).get_data().taxon
            for nx in reference_tree.get_terminals()]

    # for duplicated organisms the last column wins, as before
    options.org2column = {}
    for column, org in enumerate(options.column2org):
        options.org2column[org] = column

    for method in options.methods:
        ###################################################################
        # print out a list of ortholog clusters
        ###################################################################
        writeOrthologSets(options.stdout,
                          nexus,
                          extract_species,
                          extract_gene,
                          options=options,
                          reference_tree=reference_tree,
                          method=method,
                          outgroups=options.outgroups)

    E.Stop()
def writeOrthologSets(outfile, nexus, extract_species, extract_gene, options,
                      reference_tree=None, method="strict", outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    For every tree in ``nexus.trees`` and every species set derived from
    ``options.enumeration``, one row per ortholog node is written to
    ``options.stdout``. A summary table with the number of clusters per
    species set follows; it goes to ``options.filename_summary`` if that
    is set, otherwise to ``options.stdout`` after a ``//`` separator line.

    Arguments:
        outfile: never read by this function (rows go to
            ``options.stdout``); kept so existing callers keep working.
        nexus: object providing a ``trees`` list.
        extract_species: callable mapping a taxon label to its species.
        extract_gene: callable mapping a taxon label to its gene.
        options: option object. Attributes read here: ``column2org``,
            ``org2column``, ``enumeration``, ``species_set`` (only for
            "explicit" enumeration), ``reroot``, ``loglevel``, ``stdout``,
            ``stdlog`` and ``filename_summary``.
        reference_tree: species tree; required for "monophyletic"
            enumeration.
        method: selector name handed to getOrthologNodes().
        outgroups: optional iterable of species names; translated to
            column indices before being handed to getOrthologNodes().

    Raises:
        ValueError: if "monophyletic" enumeration is requested without a
            reference tree.
    """

    ######################################################################
    # build species set to compare
    sets = []
    species = options.column2org
    nspecies = len(species)

    if options.enumeration == "monophyletic":
        if not reference_tree:
            # string exceptions are not valid; raise a real exception
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)
    elif options.enumeration == "exhaustive":
        for size in range(2, len(species)):
            sets += list(SetTools.xuniqueCombinations(species, size))
        sets.append(species)
    elif options.enumeration == "pairwise":
        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))
    elif options.enumeration == "full":
        sets.append(species)
    elif options.enumeration == "lineage":
        for s in species:
            sets.append((s, ))
    elif options.enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets += list(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information (species -> column index)
    xsets = [frozenset(options.org2column[s] for s in members)
             for members in sets]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set(options.org2column[x] for x in outgroups)
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        for c in range(len(xsets)):
            # numbered species set: 0,1,...
            sn = xsets[c]
            # literal species set: species1, species2, ...
            sl = sets[c]

            ortholog_nodes = getOrthologNodes(tree, sn, options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # a real list: it is consumed twice below (gene extraction
                # and output), which an iterator would not survive
                otus = [taxon for taxon in tree.get_taxa(node_id)
                        if extract_species(taxon) in sl]
                genes = set(map(extract_gene, otus))

                shared_genes = found_genes.intersection(genes)
                if shared_genes:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n" %
                                (tree.name, n, node_id, str(shared_genes)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n" %
                                (tree.name, n, node_id, str(shared_genes)))

                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id,
                     "".join(pattern), "\t".join(xpattern),
                     node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information
    if options.filename_summary:
        summary = open(options.filename_summary, "w")
    else:
        summary = options.stdout
        # separator between cluster table and summary on the same stream
        summary.write("//\n")

    try:
        summary.write("cluster\tpattern\tcounts\t%s\n" %
                      ("\t".join(species)))

        for c in range(len(xsets)):
            pattern = buildPattern(nspecies, xsets[c])
            summary.write("%i\t%s\t%i\t%s\n" %
                          (c, "".join(pattern), counts[c],
                           "\t".join(pattern)))
    finally:
        # only close what was opened here, also when writing fails
        if summary is not options.stdout:
            summary.close()
def getOrthologNodes(tree, positive_set, options, selector="strict",
                     outgroups=None):
    """get all ortholog nodes in tree for species in positive_set.

    Depending on the selector function, different sets are returned:

    If selector is "strict", only strict orthologs are returned. These
    contain exactly one gene per species for all species in the
    positive_set.

    If selector is "degenerate", only degenerate orthologs are returned.
    These contain at least one gene per species for species in the
    positive_set.

    Collect genes in tree for each species. Returns the node_id for which
    a set fulfills the criteria and the set for which it fulfills it.

    Avoid double counting: if you are interested in species A and B, any
    branches involving others species should be ignored. Make sure to only
    count once and not every time a discarded branch is removed. Thus, as
    soon as A and B merge, any node further up the tree have to be ignored.

    total_genes_function: if true, node is recorded
    total_species_function: if true, iteration stops

    Arguments:
        tree: gene tree; must provide ``node()``, ``root``, ``dfs()`` and
            ``display()``.
        positive_set: collection of species column indices of interest.
        options: option object; reads ``org2column``, ``loglevel`` and
            ``stdlog`` (and is passed on to parseIdentifier()).
        selector: one of "strict", "degenerate", "lineage", "any" or
            "outgroup".
        outgroups: collection of species column indices; required for the
            "outgroup" selector.

    Returns:
        list of ``(node_id, genes)`` tuples in the order the nodes were
        recorded by the post-order traversal; ``genes`` is a list with one
        set of gene names per species column.

    Raises:
        ValueError: for an unknown selector or if the "outgroup" selector
            is chosen without outgroups.
    """

    nspecies = len(options.org2column)

    if selector == "strict":
        # strict orthologs: at most one gene per species
        exit_function = lambda num_genes_for_species: num_genes_for_species > 1
        keep_function = lambda num_genes_for_species: num_genes_for_species == 1
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: \
            num_genes_at_node == num_species_in_pattern
        total_species_function = lambda num_species_at_node, num_species_in_pattern: \
            num_species_at_node == num_species_in_pattern
        check_outgroup_function = lambda x: False
        negative_set = set()
    elif selector == "degenerate":
        # degenerate orthologs: any number of genes per species,
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: num_genes_for_species > 0
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: \
            num_genes_at_node > num_species_in_pattern
        total_species_function = lambda num_species_at_node, num_species_in_pattern: \
            num_species_at_node == num_species_in_pattern
        check_outgroup_function = lambda x: False
        negative_set = set()
    elif selector == "lineage":
        # lineage specific duplications: at least 1 gene
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: num_genes_for_species > 1
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: \
            num_genes_at_node >= num_species_in_pattern
        total_species_function = lambda num_species_at_node, num_species_in_pattern: False
        check_outgroup_function = lambda x: False
        negative_set = set(range(nspecies)).difference(positive_set)
    elif selector == "any":
        # any number of orthologs, including
        # orphans
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: True
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: True
        total_species_function = lambda num_species_at_node, num_species_in_pattern: \
            num_species_at_node == num_species_in_pattern
        check_outgroup_function = lambda x: False
        negative_set = set()
    elif selector == "outgroup":
        # group selector
        exit_function = lambda num_genes_for_species: False
        keep_function = lambda num_genes_for_species: num_genes_for_species > 0
        total_genes_function = lambda num_genes_at_node, num_species_in_pattern: False
        total_species_function = lambda num_species_at_node, num_species_in_pattern: False
        # check for outgroup: needs to have outgroup and at least one other
        # species, i.e.: the number of genes in the outgroups is positive
        # but smaller than the number of genes over all species
        if not outgroups:
            raise ValueError(
                "usage error: please supply outgroups if "
                "'outgroup'-selector is chosen.")
        check_outgroup_function = lambda node_genes: \
            0 < sum(len(node_genes[x]) for x in outgroups) < \
            sum(len(g) for g in node_genes)
        negative_set = set()
    else:
        raise ValueError("unknown selector %s" % selector)

    # work here: set genes[node_id] to None,
    # 1. if the gene count for a species of interest is > 1
    # 2. if the gene count for all species of interest is 1 in
    #    the child node.
    if options.loglevel >= 5:
        options.stdlog.write("# gene tree\n")
        tree.display()

    n = TreeTools.GetSize(tree) + 1

    # genes[node_id][species_column] = set of genes below that node;
    # genes[node_id] is replaced by None once a node is terminated.
    genes = [[set() for _ in range(nspecies)] for _ in range(n)]

    ortholog_nodes = []

    def count_genes(node_id):
        """record number of genes per species for each node
        """
        node = tree.node(node_id)

        if options.loglevel >= 6:
            options.stdlog.write("# node_id=%i\n" % node_id)
            if options.loglevel >= 10:
                options.stdlog.write("# sets=%s\n" % (str(genes)))

        # species in pattern
        num_species_in_pattern = len(positive_set)

        if node.succ:
            # process non-leaf node
            for s in node.succ:

                # propagate: terminated nodes force upper nodes to terminate
                # (assigned to None).
                if not genes[s]:
                    genes[node_id] = None
                    return

                # The counters are reset for every child; as the gene sets
                # accumulate over children, the values after the last child
                # describe the complete node.
                # total number of genes at node
                num_genes_at_node = 0
                # total number of species at node
                num_species_at_node = 0

                # compute new gene set for each species at node
                for x in positive_set:
                    genes[node_id][x] = genes[node_id][x].union(genes[s][x])

                    num_genes_for_species = len(genes[node_id][x])
                    if exit_function(num_genes_for_species):
                        genes[node_id] = None
                        return
                    num_genes_at_node += num_genes_for_species
                    if num_genes_for_species:
                        num_species_at_node += 1

            if options.loglevel >= 6:
                # single-argument print(): identical output under
                # python 2 and python 3
                print(" ".join(map(str, (
                    "node=", node_id,
                    "species_at_node", num_species_at_node,
                    "genes_at_node=", num_genes_at_node,
                    "num_genes_for_species=", num_genes_for_species,
                    "ngenes=", sum(len(g) for g in genes[node_id])))))
                options.stdlog.write("# genes at node %i\t%s\n" %
                                     (node_id, genes[node_id]))
                if outgroups:
                    print(sum([len(genes[node_id][x]) for x in outgroups]))
                    print(check_outgroup_function(genes[node_id]))

            # check stop criterion
            if total_species_function(num_species_at_node,
                                      num_species_in_pattern):
                # check if positive requirements are fulfilled
                for x in positive_set:
                    if not keep_function(len(genes[node_id][x])):
                        if options.loglevel >= 6:
                            options.stdlog.write(
                                "# keep function false for species %i\n" % x)
                        break
                else:
                    if total_genes_function(num_genes_at_node,
                                            num_species_in_pattern):
                        if options.loglevel >= 6:
                            # report the node, not the last species index
                            options.stdlog.write(
                                "# recording node %i\n" % node_id)
                        ortholog_nodes.append((node_id, genes[node_id]))
                # all species of the pattern have merged: terminate here so
                # that no node further up the tree is counted again.
                genes[node_id] = None
                return
            elif check_outgroup_function(genes[node_id]):
                ortholog_nodes.append((node_id, genes[node_id]))
                genes[node_id] = None
                return
            elif negative_set:
                if total_genes_function(num_genes_at_node,
                                        num_species_in_pattern):
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# recording node %i\n" % node_id)
                    ortholog_nodes.append((node_id, genes[node_id]))
        else:
            # process leaf
            s, t, g, q = parseIdentifier(node.data.taxon, options)
            c = options.org2column[s]
            if c in positive_set:
                genes[node_id][c].add(g)
            elif c in negative_set:
                # a leaf of an unwanted species terminates this lineage
                genes[node_id] = None

    tree.dfs(tree.root, post_function=count_genes)

    return ortholog_nodes
def processMali(mali, options):
    """estimate rates per block of a multiple alignment and write them out.

    The alignment is optionally split into an "N" and a "C" block via a
    "STATE" annotation, identifiers are reduced to their species part, a
    grammar is prepared with prepareGrammar() and for each block one
    tab-separated row (code, block, dN, dS, omega, ..., kappa, log
    likelihood, ..., number of columns, message) is written to
    ``options.stdout``.

    Arguments:
        mali: multiple alignment object; modified in place (annotation,
            renamed identifiers, optionally removed columns).
        options: option object. Attributes read here: ``block_size``,
            ``separator``, ``xrate_min_increment``, ``clean_mali``,
            ``input_filename_tree``, ``loglevel``, ``stdlog``, ``stdout``,
            ``shared_rates``, ``value_format`` and ``with_rho``.

    Raises:
        ValueError: if the alignment has no columns.
        AssertionError: if the "STATE" annotation contains characters
            other than 'N' and 'C'.
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # fractional block size: proportion of codons
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # absolute block size given in codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # close the tree file as soon as it has been parsed
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block")
        # sets cannot be indexed - take the only element
        blocks = (("B0_", next(iter(chars))), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(xgram, mali, tree, map_old2new,
                                       blocks, options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # For each value of options.shared_rates: does the parameter name of
    # (Rs, Rn, Ri, Rv) carry the block prefix? Without the prefix the same
    # parameter name is looked up for every block. Any other value of
    # shared_rates uses the block prefix for all four parameters.
    block_specific = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }
    use_block = block_specific.get(options.shared_rates,
                                   (True, True, True, True))

    for block, code in blocks:
        terminals = ("%sCOD0" % block,
                     "%sCOD1" % block,
                     "%sCOD2" % block)

        pi = pis[terminals]

        prefixes = [block if flag else "" for flag in use_block]
        rate_prefix_rs, rate_prefix_rn, rate_prefix_ri, rate_prefix_rv = \
            prefixes

        rs = trained_model.mGrammar.getParameter('%sRs' % rate_prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % rate_prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % rate_prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % rate_prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)
            # reference matrix with the Rs/Rn difference averaged out
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # reference matrix with the Ri/Rv difference averaged out
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # report the whole block as not available
            o_kappa = "na"
            o_omega = "na"
            o_dn = "na"
            o_ds = "na"
            o_rn = "na"
            o_rs = "na"
            o_rn0 = "na"
            o_rs0 = "na"
            o_t = "na"
            o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" + "\t".join(map(str, (o_rn, o_rs, o_t,
                                           o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
def calculateCoordinates(self):
    """compute plot coordinates for all nodes of ``self.mTree``.

    Fills ``self.mNodeHeights`` (vertical position per node),
    ``self.mNodeWidthsStart`` / ``self.mNodeWidthsEnd`` (horizontal start
    and end of the branch leading to each node) and
    ``self.mMaxNodeHeight``.

    If ``self.mHeightScaleFactor`` or ``self.mBranchScaleFactor`` is 0,
    the respective axis is rescaled to fit ``self.mDefaultHeight`` /
    ``self.mDefaultWidth`` and the scale factor is updated accordingly.
    """
    # safety margin (in plot units) added to all heights at the end
    safety_margin = 45

    self.mNodeHeights = [0] * self.mNNodes
    self.mNodeWidthsStart = [0] * self.mNNodes
    self.mNodeWidthsEnd = [0] * self.mNNodes

    # if no scales are given, try to do best fit
    if self.mHeightScaleFactor == 0:
        rescale_height = True
        self.mHeightScaleFactor = 1
    else:
        rescale_height = False

    if self.mBranchScaleFactor == 0:
        rescale_width = True
        self.mBranchScaleFactor = 100
    else:
        rescale_width = False

    ##########################################################
    # Get Vertical coordinates
    # Label nodes by their height. Terminal nodes are stacked in the
    # order they are visited. Internal nodes are placed at the average
    # of their children.

    # one-element list so that the nested function can update the value
    counter = [0]

    def updateHeights(node_id):
        children = self.mTree.node(node_id).succ
        if children:
            # set node height for internal node: uncorrected average of
            # the children. Heights of symbols are not taken into
            # account here.
            total = 0
            for child in children:
                total += self.mNodeHeights[child]
            self.mNodeHeights[node_id] = float(total) / float(len(children))
        else:
            # set node height for external node
            self.mNodeHeights[node_id] = counter[0]
            counter[0] += max(
                self.mDecoratorExternalNodes.getHeight(node_id),
                self.mDecoratorHorizontalBranches.getHeight(node_id)) \
                * self.mHeightScaleFactor + self.mTerminalLabelSeparator

    TreeTools.TreeDFS(self.mTree, self.mTree.root,
                      post_function=updateHeights)

    self.mMaxNodeHeight = counter[0]

    ##########################################################
    # Get horizontal coordinates
    def updateWidths(node_id):
        node = self.mTree.node(node_id)
        d = node.data.branchlength

        # set default branchlength to 0.01 for empty branch lengths
        # TODO: deal with trees without branch lengths later.
        if d <= 0.0:
            d = 0.01

        right = self.mNodeWidthsStart[node_id] + \
            int(d * self.mBranchScaleFactor)
        self.mNodeWidthsEnd[node_id] = right
        # children start where the branch to this node ends
        for s in node.succ:
            self.mNodeWidthsStart[s] = right

    TreeTools.TreeDFS(self.mTree, self.mTree.root,
                      pre_function=updateWidths)

    if rescale_height:
        max_height = max(self.mNodeHeights)
        # a tree with a single leaf has maximum height 0: nothing to
        # rescale and no division by zero
        if max_height > 0:
            f = float(self.mDefaultHeight) / max_height
            if 100 * f < 1:
                f = 0.01
            self.mHeightScaleFactor = 100 * f
            # lists (not iterators): the heights are updated by index below
            self.mNodeHeights = [int(h * f) for h in self.mNodeHeights]
            self.mMaxNodeHeight *= f

    if rescale_width:
        max_width = max(self.mNodeWidthsEnd)
        if max_width > 0:
            f = float(self.mDefaultWidth) / max_width
            self.mBranchScaleFactor = 100 * f
            self.mNodeWidthsStart = [int(w * f)
                                     for w in self.mNodeWidthsStart]
            self.mNodeWidthsEnd = [int(w * f) for w in self.mNodeWidthsEnd]

    # add a safety margin for decorators writing above the line. This
    # is a patch and should be changed such that decorators report
    # their correct height
    for x in range(self.mNNodes):
        self.mNodeHeights[x] += safety_margin
colour_by_species=None, tree=None, branch_scale=0, height_scale=0, ) (options, args) = Experiment.Start(parser, add_pipe_options=True) if options.filename_tree: tree_lines = open(options.filename_tree, "r").readlines() elif options.tree: tree_lines = options.tree else: raise "please supply a species tree." nexus = TreeTools.Newick2Nexus(tree_lines) Tree.updateNexus(nexus) tree = nexus.trees[0] if options.loglevel >= 2: tree.display() plot = SVGTree(tree) plot.setBranchScale(options.branch_scale) plot.setHeightScale(options.height_scale) if options.colour_by_species: rx = re.compile(options.species_regex) extract_species = lambda x: rx.search(x).groups()[0] plot.setDecoratorExternalNodes(
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated records (cluster id, ``;``-separated locations of
    the form ``gene:contig:strand:from:to``, newick tree) from stdin and
    writes a duplication plot to stdout.

    NOTE(review): *argv* is defaulted here but is not forwarded to
    ``E.Start`` by this function.

    Raises:
        ValueError: if no file with contig sizes is supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e", "--headers", dest="headers", action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t", "--title", dest="title", type="string",
                      help="page title.")
    parser.add_option("-f", "--footer", dest="footer", type="string",
                      help="page footer.")
    parser.add_option("-c", "--contig-sizes", dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r", "--radius", dest="radius", type="int",
                      help="radius.")
    parser.add_option("-i", "--increment", dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u", "--url", dest="url", type="string",
                      help="string to build url for annotation.")
    parser.add_option("--min-contig", dest="min_contig_size", type="string",
                      help="minimum contig size to delineate.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum branch length.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={'CG': "circle", 'PG': "circle", 'SG': "circle"},
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF",
                      "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_contig_sizes:
        # everything below needs the contig sizes; fail with a clear
        # message instead of a NameError further down
        raise ValueError(
            "please supply a file with contig sizes (--contig-sizes).")

    with open(options.filename_contig_sizes, "r") as infile:
        map_contig2size = IOTools.ReadMap(infile, map_functions=(str, int))

    def iterate_records(lines):
        """yield (cluster_id, in_locations, in_tree) for each data line."""
        for line in lines:
            if line.startswith("#"):
                continue
            # strip only the line terminator; a final line without a
            # newline must not lose its last character
            fields = line.rstrip("\n").split("\t")
            cluster_id, in_locations, in_tree = fields[:3]
            yield cluster_id, in_locations, in_tree

    def iterate_locations(in_locations):
        """yield (gene_id, contig, strand, from, to) on known contigs."""
        for location in in_locations.split(";"):
            gene_id, contig, strand, sbjct_from, sbjct_to = \
                location.split(":")
            if contig not in map_contig2size:
                continue
            yield gene_id, contig, strand, int(sbjct_from), int(sbjct_to)

    # read data and get contigs that are used (i.e.: remove empty contigs)
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        used_contigs = set()
        for cluster_id, in_locations, in_tree in iterate_records(lines):
            for location in iterate_locations(in_locations):
                used_contigs.add(location[1])

        # iterate over a copy of the keys: the dictionary is modified
        for contig in list(map_contig2size):
            if contig not in used_contigs:
                del map_contig2size[contig]

    contigs = sorted(map_contig2size)

    if len(contigs) == 0:
        E.Stop()
        sys.exit(0)

    if options.sort_by_size:
        # stable sort: contigs of equal size stay in alphabetical order
        contigs.sort(key=lambda contig: map_contig2size[contig])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":
        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications
        for cluster_id, in_locations, in_tree in iterate_records(lines):

            mi, ma = 0, 0
            n = 0
            contigs_in_cluster = set()

            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    iterate_locations(in_locations):
                contigs_in_cluster.add(contig)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                if not mi:
                    mi = xi
                else:
                    mi = min(mi, xi)

                n += 1
                ma = max(ma, xa)

            # no location on a known contig
            if n == 0:
                continue

            # all duplicates of the cluster are on the same contig
            cis = len(contigs_in_cluster) == 1

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))

            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    data.sort()

    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplicates changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for gene_id, contig, strand, sbjct_from, sbjct_to in \
                iterate_locations(in_locations):
            map_gene2location[gene_id] = (contig, strand,
                                          sbjct_from, sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        subsets = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in subsets[:-1]:
            if len(children) == 1:
                continue
            c = [child.split(options.separator) for child in children]
            plot.addDuplication(c, map_gene2location, height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
def processMali(mali, options):
    """prepare a multiple alignment and dispatch it to the chosen method.

    Builds the list of sequence pairs to compare according to
    ``options.iteration``, optionally masks stop codons, checks whether
    any pair fails to reach ``options.min_overlap`` aligned positions and
    finally calls runCodeML() (``options.method == "paml"``) or runXrate()
    (``options.method == "xrate"``). Any other method does nothing.

    Arguments:
        mali: multiple alignment object. With ``options.remove_stops`` the
            sequences are modified in place (upper-cased, stop codons
            replaced by ``NNN``).
        options: option object. Attributes read here: ``gap_chars``,
            ``mask_chars``, ``iteration``, ``remove_stops``,
            ``min_overlap``, ``tree`` and ``method``.

    Raises:
        ValueError: for an unknown iteration mode, or for "pairwise"
            iteration with an uneven number of sequences.
    """

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    invalid_chars = options.gap_chars + options.mask_chars

    has_non_overlaps = False

    nids = len(ids)

    if options.iteration == "all-vs-all":
        pairs = [(x, y) for x in range(nids) for y in range(0, x)]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, nids)]
    elif options.iteration == "pairwise":
        if nids % 2 != 0:
            # string exceptions are not valid; raise a real exception
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise" % nids)
        pairs = [(x, x + 1) for x in range(0, nids, 2)]
    elif options.iteration == "tree":
        pairs = []
    else:
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        for identifier, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)
            entry.mString = "".join(fragments)

    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    # enough overlap for this pair
                    break
        else:
            # end of alignment reached without sufficient overlap; one
            # such pair is enough, so stop checking
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)
    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads trees from stdin and aggregates them according to ``--method``:
    "non-redundant" outputs one tree per group of compatible trees,
    "select-largest" outputs one tree per tree name, "consensus" runs
    phylip consense and the remaining methods combine branch lengths node
    by node into the first tree, which is then written to stdout.

    NOTE(review): *argv* is defaulted here but is not forwarded to
    ``E.Start`` by this function.

    Raises:
        ValueError: if a node has no branch length values and no
            ``--error-branchlength`` was given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: trees2tree.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("counts", "min", "max", "sum", "mean",
                               "median", "stddev", "non-redundant",
                               "consensus", "select-largest"),
                      help="aggregation function.")

    parser.add_option("-r", "--regex-id", dest="regex_id", type="string",
                      help="regex pattern to extract identifier from tree name for the selection functions.")

    parser.add_option("-w", "--write-values", dest="write_values",
                      type="string",
                      help="if processing multiple trees, write values to file.")

    parser.add_option("-e", "--error-branchlength",
                      dest="error_branchlength", type="float",
                      help="set branch length without counts to this value.")

    parser.set_defaults(
        method="mean",
        regex_id=None,
        filtered_branch_lengths=(-999.0, 999.0),
        write_values=None,
        error_branchlength=None,
        separator=":",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.loglevel >= 2:
        options.stdlog.write("# reading trees from stdin.\n")
        options.stdlog.flush()

    nexus = TreeTools.Newick2Nexus(sys.stdin)
    if options.loglevel >= 1:
        options.stdlog.write(
            "# read %i trees from stdin.\n" % len(nexus.trees))

    nskipped = 0
    ninput = len(nexus.trees)
    noutput = 0
    nerrors = 0

    # branch lengths with these values are treated as missing
    filtered = options.filtered_branch_lengths

    if options.method == "non-redundant":
        # compute non-redudant trees
        template_trees = []
        template_counts = []
        ntree = 0
        for tree in nexus.trees:

            for x in range(0, len(template_trees)):
                is_compatible, reason = TreeTools.IsCompatible(
                    tree, template_trees[x])
                if is_compatible:
                    template_counts[x] += 1
                    break
            else:
                # not compatible with any template: start a new one
                template_counts.append(1)
                template_trees.append(tree)

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# tree=%i, ntemplates=%i\n" %
                    (ntree, len(template_trees)))

            ntree += 1

        for x in range(0, len(template_trees)):
            if options.loglevel >= 1:
                # percentage relative to the number of input trees
                options.stdlog.write(
                    "# tree: %i, counts: %i, percent=%5.2f\n" %
                    (x, template_counts[x],
                     template_counts[x] * 100.0 / ninput))
            options.stdout.write(
                TreeTools.Tree2Newick(template_trees[x]) + "\n")

    elif options.method in ("select-largest",):
        # select one of the trees with the same name.
        clusters = {}
        for x in range(0, len(nexus.trees)):
            n = nexus.trees[x].name

            if options.regex_id:
                n = re.search(options.regex_id, n).groups()[0]

            if n not in clusters:
                clusters[n] = []
            clusters[n].append(x)

        new_trees = []

        for name, cluster in clusters.items():
            new_trees.append(
                getBestTree([nexus.trees[x] for x in cluster],
                            options.method))

        for x in range(0, len(new_trees)):
            options.stdout.write(">%s\n" % new_trees[x].name)
            options.stdout.write(TreeTools.Tree2Newick(new_trees[x],) + "\n")
            noutput += 1

        nskipped = ninput - noutput

    elif options.method == "consensus":

        phylip = WrapperPhylip.Phylip()
        phylip.setLogLevel(options.loglevel - 2)
        phylip.setProgram("consense")
        phylip_options = []
        phylip_options.append("Y")

        phylip.setOptions(phylip_options)
        phylip.setTrees(nexus.trees)

        result = phylip.run()

        options.stdout.write(
            "# consensus tree built from %i trees\n" %
            (phylip.mNInputTrees))
        options.stdout.write(
            TreeTools.Tree2Newick(result.mNexus.trees[0]) + "\n")
        noutput = 1

    else:
        if options.method in ("min", "max", "sum", "mean", "counts"):

            # the first tree accumulates the result
            xtree = nexus.trees[0]
            for n in xtree.chain.keys():
                if xtree.node(n).data.branchlength in filtered:
                    xtree.node(n).data.branchlength = 0

            # NOTE(review): every node starts with a count of 1, also when
            # its branch length in the first tree was filtered above, so
            # the "no counts" branch below looks unreachable - confirm
            # whether filtered nodes should start at 0.
            ntotals = [1] * len(xtree.chain.keys())

            if options.method == "min":
                f = min
            elif options.method == "max":
                f = max
            elif options.method == "sum":
                f = lambda x, y: x + y
            elif options.method == "mean":
                f = lambda x, y: x + y
            elif options.method == "counts":
                f = lambda x, y: x + 1
                for n in xtree.chain.keys():
                    if xtree.node(n).data.branchlength not in filtered:
                        xtree.node(n).data.branchlength = 1
                    else:
                        xtree.node(n).data.branchlength = 0
            else:
                raise ValueError("unknown option %s" % options.method)

            for tree in nexus.trees[1:]:
                for n in tree.chain.keys():
                    if tree.node(n).data.branchlength not in filtered:
                        xtree.node(n).data.branchlength = f(
                            xtree.node(n).data.branchlength,
                            tree.node(n).data.branchlength)
                        ntotals[n] += 1

            if options.method == "mean":
                for n in xtree.chain.keys():
                    if ntotals[n] > 0:
                        xtree.node(n).data.branchlength = float(
                            xtree.node(n).data.branchlength) / ntotals[n]
                    else:
                        if options.error_branchlength is not None:
                            xtree.node(n).data.branchlength = \
                                options.error_branchlength
                            if options.loglevel >= 1:
                                options.stdlog.write(
                                    "# no counts for node %i - set to %f\n" %
                                    (n, options.error_branchlength))
                            nerrors += 1
                        else:
                            raise ValueError("no counts for node %i" % n)

        else:
            # numpy provides the functions formerly reached through the
            # deprecated scipy.std / scipy.median aliases
            import numpy

            # collect all values for trees
            values = [[] for x in range(TreeTools.GetSize(nexus.trees[0]))]

            for tree in nexus.trees:
                for n, node in tree.chain.items():
                    if node.data.branchlength not in filtered:
                        values[n].append(node.data.branchlength)

            tree = nexus.trees[0]
            for n, node in tree.chain.items():
                if len(values[n]) > 0:
                    if options.method == "stddev":
                        node.data.branchlength = numpy.std(values[n])
                    elif options.method == "median":
                        node.data.branchlength = numpy.median(values[n])
                else:
                    if options.error_branchlength is not None:
                        node.data.branchlength = options.error_branchlength
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# no counts for node %i - set to %f\n" %
                                (n, options.error_branchlength))
                        nerrors += 1
                    else:
                        raise ValueError("no counts for node %i" % n)

            if options.write_values:
                # one line per node: sorted leaves below the node and the
                # sorted values collected for it
                with open(options.write_values, "w") as outfile:
                    for n, node in tree.chain.items():
                        values[n].sort()
                        leaves = options.separator.join(
                            sorted(TreeTools.GetLeaves(tree, n)))
                        outfile.write(
                            "%s\t%s\n" %
                            (leaves, ";".join(map(str, values[n]))))

        # only the first tree (holding the aggregate) is written
        del nexus.trees[1:]
        options.stdout.write(TreeTools.Nexus2Newick(nexus) + "\n")
        noutput = 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ntotal=%i, nskipped=%i, noutput=%i, nerrors=%i\n" %
            (ninput, nskipped, noutput, nerrors))

    E.Stop()
if options.prefix: prefix_tree = ">%s\n" % options.prefix prefix_header = "prefix\t" prefix_row = "%s\t" % options.prefix else: prefix_tree = "" prefix_header = "" prefix_row = "" for method in options.methods: if method == "write-ks-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKs) + "\n") elif method == "write-ka-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKa) + "\n") elif method == "write-kaks-tree": for result in results: options.stdout.write(prefix_tree + TreeTools.Tree2Newick(result.mTreeKaks) + "\n") elif method == "lrt":
def main(argv=None):
    """Count genes and transcripts per organism for each input cluster.

    Parses the command line, reads clusters from stdin and writes three
    tables:

    * one row per cluster (cluster id, presence pattern, number of
      organisms with genes, total genes, genes per organism, total
      transcripts, transcripts per organism) to ``options.stdout``;
    * the number of clusters per pattern, to ``--filename-patterns`` or
      ``sys.stdout``; with a reference tree an ``isok`` column flags
      patterns found in ``TreeTools.calculatePatternsFromTree``;
    * transcripts/genes/trees per species, to ``--filename-summary`` or
      ``sys.stdout``.

    Input formats (``--format``):

    ``map``
        tab-separated ``member<TAB>cluster`` lines.
    ``trees``
        trees readable by ``TreeTools.Newick2Nexus``; the tree name is the
        cluster and its taxa are the members.
    ``links``
        ``>`` header lines start a cluster, the first two tab-separated
        columns of every other line are members.

    Member identifiers are split on ``options.separator`` and must have
    either four fields (species, transcript, gene and a fourth field that
    is ignored) or two fields (species, gene).

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here --
        confirm whether ``E.Start`` reads ``sys.argv`` itself.
    :raises ValueError: on empty input, on an identifier that does not
        match the species regex or has an unsupported number of fields,
        and on a species missing from the organism list.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")
    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")
    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")
    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")
    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")
    parser.add_option(
        "-s", "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option(
        "-g", "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    # ------------------------------------------------------------------
    # read the optional reference tree (inline newick or a filename)
    # ------------------------------------------------------------------
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            sys.stdout.write("# reference tree:\n")
            sys.stdout.write("%s\n" % reference_tree.display())

    # ------------------------------------------------------------------
    # read clusters: cluster id -> collection of member identifiers
    # ------------------------------------------------------------------
    clusters = {}
    if options.format == "map":
        for line in sys.stdin:
            if line[0] == "#":
                continue
            # rstrip instead of [:-1]: do not eat the last character of a
            # final line that lacks a trailing newline.
            member, cluster_id = line.rstrip("\n").split("\t")
            clusters.setdefault(cluster_id, []).append(member)

    elif options.format == "trees":
        nexus = TreeTools.Newick2Nexus(sys.stdin)
        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line[0] == "#":
                continue
            text = line.rstrip("\n")
            if line[0] == ">":
                # a new header closes the previous cluster
                if cluster_id:
                    clusters[cluster_id] = members
                match = re.match(r">cluster #(\d+)", text)
                if match:
                    cluster_id = match.groups()[0]
                else:
                    cluster_id = text[1:]
                members = set()
                continue
            fields = text.split("\t")[:2]
            members.add(fields[0])
            members.add(fields[1])

        if cluster_id:
            clusters[cluster_id] = members

    if not clusters:
        # string exceptions are not valid in Python >= 2.6
        raise ValueError("empty input.")

    # ------------------------------------------------------------------
    # sort out reference tree and the organism <-> column mapping
    # ------------------------------------------------------------------
    species_rx = re.compile(options.species_regex)
    # The gene regex is not used for extraction below (genes come from
    # splitting on the separator); it is compiled only so that an invalid
    # pattern is still reported.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        match = species_rx.search(identifier)
        if match is None:
            raise ValueError(
                "can not extract species from identifier %s" % identifier)
        return match.groups()[0]

    # prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(member) for member in members)

    if reference_tree:
        TreeTools.PruneTree(reference_tree, species_set)
        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    if options.column2org:
        column2org = options.column2org.split(",")
    elif reference_tree:
        column2org = [reference_tree.node(nx).get_data().taxon
                      for nx in reference_tree.get_terminals()]
    else:
        column2org = list(species_set)

    org2column = dict((org, col) for col, org in enumerate(column2org))

    # keep the derived mappings available on the options object
    options.column2org = column2org
    options.org2column = org2column

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, column2org)

        if options.loglevel >= 3:
            sys.stdout.write("# reference patterns:\n")
            sys.stdout.write(str(reference_patterns) + "\n")

    # ------------------------------------------------------------------
    # per-cluster counts
    # ------------------------------------------------------------------
    notus = len(column2org)
    patterns = {}
    species_counts = [SpeciesCounts() for x in column2org]

    # first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(column2org), "\t".join(column2org)))

    for cluster in sorted(clusters.keys()):
        members = clusters[cluster]

        # Sized by the number of output columns so that every index stored
        # in org2column is valid and rows line up with the header.
        count_genes = [{} for x in range(notus)]
        count_transcripts = [0] * notus

        for member in members:
            fields = member.split(options.separator)
            if len(fields) == 4:
                species, transcript, gene = fields[:3]
            elif len(fields) == 2:
                species, gene = fields
                transcript = gene
            else:
                # previously fell through and reused stale values
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields "
                    "separated by '%s'" % (member, options.separator))

            if species not in org2column:
                raise ValueError("unknown species %s" % species)

            col = org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_org = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_org)
        npresent_genes = len([n for n in genes_per_org if n > 0])

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write(
            "\t".join((cluster,
                       pattern,
                       str(npresent_genes),
                       str(ntotal_genes),
                       "\t".join(map(str, genes_per_org)),
                       str(ntotal_transcripts),
                       "\t".join(map(str, count_transcripts)))) + "\n")

    # ------------------------------------------------------------------
    # write pattern summary
    # ------------------------------------------------------------------
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for col, org in enumerate(column2org):
            outfile.write("# %i = %s\n" % (col, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns.keys()):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    # ------------------------------------------------------------------
    # write summary counts per species
    # ------------------------------------------------------------------
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")
        for species, col in org2column.items():
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species,
                           len(counts.mTranscripts),
                           len(counts.mGenes),
                           len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()
def main(argv=None):
    """Count genes and transcripts per organism for each input cluster.

    Parses the command line, reads clusters from stdin and writes three
    tables:

    * one row per cluster (cluster id, presence pattern, number of
      organisms with genes, total genes, genes per organism, total
      transcripts, transcripts per organism) to ``options.stdout``;
    * the number of clusters per pattern, to ``--filename-patterns`` or
      ``sys.stdout``; with a reference tree an ``isok`` column flags
      patterns found in ``TreeTools.calculatePatternsFromTree``;
    * transcripts/genes/trees per species, to ``--filename-summary`` or
      ``sys.stdout``.

    Input formats (``--format``):

    ``map``
        tab-separated ``member<TAB>cluster`` lines.
    ``trees``
        trees readable by ``TreeTools.Newick2Nexus``; the tree name is the
        cluster and its taxa are the members.
    ``links``
        ``>`` header lines start a cluster, the first two tab-separated
        columns of every other line are members.

    Member identifiers are split on ``options.separator`` and must have
    either four fields (species, transcript, gene and a fourth field that
    is ignored) or two fields (species, gene).

    :param argv: command line; defaults to ``sys.argv``.
        NOTE(review): *argv* is not forwarded to ``E.Start`` here --
        confirm whether ``E.Start`` reads ``sys.argv`` itself.
    :raises ValueError: on empty input, on an identifier that does not
        match the species regex or has an unsupported number of fields,
        and on a species missing from the organism list.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/count_orgs.py 1706 2007-12-11 16:46:11Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--reference-tree", dest="reference_tree",
                      type="string",
                      help="reference tree to read.")
    parser.add_option("-p", "--filename-patterns", dest="filename_patterns",
                      type="string",
                      help="filename with patterns to output.")
    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="filename with summary to output.")
    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("map", "links", "trees"),
                      help="output format.")
    parser.add_option("-o", "--organisms", dest="column2org", type="string",
                      help="sorted list of organisms.")
    parser.add_option(
        "-s", "--species-regex", dest="species_regex", type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option(
        "-g", "--gene-regex", dest="gene_regex", type="string",
        help="regular expression to extract gene from identifier.")

    parser.set_defaults(
        reference_tree=None,
        format="map",
        filename_patterns=None,
        column2org=None,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        separator="|",
        filename_summary=None,
    )

    (options, args) = E.Start(parser)

    # ------------------------------------------------------------------
    # read the optional reference tree (inline newick or a filename)
    # ------------------------------------------------------------------
    reference_tree = None
    if options.reference_tree:
        if options.reference_tree[0] == "(":
            nexus = TreeTools.Newick2Nexus(options.reference_tree)
        else:
            infile = open(options.reference_tree, "r")
            try:
                nexus = TreeTools.Newick2Nexus(infile)
            finally:
                infile.close()
        reference_tree = nexus.trees[0]

        if options.loglevel >= 3:
            sys.stdout.write("# reference tree:\n")
            sys.stdout.write("%s\n" % reference_tree.display())

    # ------------------------------------------------------------------
    # read clusters: cluster id -> collection of member identifiers
    # ------------------------------------------------------------------
    clusters = {}
    if options.format == "map":
        for line in sys.stdin:
            if line[0] == "#":
                continue
            # rstrip instead of [:-1]: do not eat the last character of a
            # final line that lacks a trailing newline.
            member, cluster_id = line.rstrip("\n").split("\t")
            clusters.setdefault(cluster_id, []).append(member)

    elif options.format == "trees":
        nexus = TreeTools.Newick2Nexus(sys.stdin)
        for tree in nexus.trees:
            clusters[tree.name] = tree.get_taxa()

    elif options.format == "links":
        members = set()
        cluster_id = None
        for line in sys.stdin:
            if line[0] == "#":
                continue
            text = line.rstrip("\n")
            if line[0] == ">":
                # a new header closes the previous cluster
                if cluster_id:
                    clusters[cluster_id] = members
                match = re.match(r">cluster #(\d+)", text)
                if match:
                    cluster_id = match.groups()[0]
                else:
                    cluster_id = text[1:]
                members = set()
                continue
            fields = text.split("\t")[:2]
            members.add(fields[0])
            members.add(fields[1])

        if cluster_id:
            clusters[cluster_id] = members

    if not clusters:
        # string exceptions are not valid in Python >= 2.6
        raise ValueError("empty input.")

    # ------------------------------------------------------------------
    # sort out reference tree and the organism <-> column mapping
    # ------------------------------------------------------------------
    species_rx = re.compile(options.species_regex)
    # The gene regex is not used for extraction below (genes come from
    # splitting on the separator); it is compiled only so that an invalid
    # pattern is still reported.
    re.compile(options.gene_regex)

    def extract_species(identifier):
        match = species_rx.search(identifier)
        if match is None:
            raise ValueError(
                "can not extract species from identifier %s" % identifier)
        return match.groups()[0]

    # prune tree to species present
    species_set = set()
    for members in clusters.values():
        species_set.update(extract_species(member) for member in members)

    if reference_tree:
        TreeTools.PruneTree(reference_tree, species_set)
        if options.loglevel >= 1:
            options.stdlog.write("# Tree after pruning: %i taxa.\n" %
                                 len(reference_tree.get_taxa()))

    if options.column2org:
        column2org = options.column2org.split(",")
    elif reference_tree:
        column2org = [reference_tree.node(nx).get_data().taxon
                      for nx in reference_tree.get_terminals()]
    else:
        column2org = list(species_set)

    org2column = dict((org, col) for col, org in enumerate(column2org))

    # keep the derived mappings available on the options object
    options.column2org = column2org
    options.org2column = org2column

    if reference_tree:
        reference_patterns = TreeTools.calculatePatternsFromTree(
            reference_tree, column2org)

        if options.loglevel >= 3:
            sys.stdout.write("# reference patterns:\n")
            sys.stdout.write(str(reference_patterns) + "\n")

    # ------------------------------------------------------------------
    # per-cluster counts
    # ------------------------------------------------------------------
    notus = len(column2org)
    patterns = {}
    species_counts = [SpeciesCounts() for x in column2org]

    # first genes, then transcripts
    options.stdout.write(
        "mali\tpattern\tpresent\tngenes\t%s\tntranscripts\t%s\n" %
        ("\t".join(column2org), "\t".join(column2org)))

    for cluster in sorted(clusters.keys()):
        members = clusters[cluster]

        # Sized by the number of output columns so that every index stored
        # in org2column is valid and rows line up with the header.
        count_genes = [{} for x in range(notus)]
        count_transcripts = [0] * notus

        for member in members:
            fields = member.split(options.separator)
            if len(fields) == 4:
                species, transcript, gene = fields[:3]
            elif len(fields) == 2:
                species, gene = fields
                transcript = gene
            else:
                # previously fell through and reused stale values
                raise ValueError(
                    "can not parse identifier %s: expected 2 or 4 fields "
                    "separated by '%s'" % (member, options.separator))

            if species not in org2column:
                raise ValueError("unknown species %s" % species)

            col = org2column[species]

            count_transcripts[col] += 1
            count_genes[col][gene] = count_genes[col].get(gene, 0) + 1

            species_counts[col].mGenes.add(gene)
            species_counts[col].mTranscripts.add(transcript)
            species_counts[col].mTrees.add(cluster)

        genes_per_org = [len(genes) for genes in count_genes]
        ntotal_transcripts = sum(count_transcripts)
        ntotal_genes = sum(genes_per_org)
        npresent_genes = len([n for n in genes_per_org if n > 0])

        pattern = GetPattern(count_transcripts, notus)
        patterns[pattern] = patterns.get(pattern, 0) + 1

        options.stdout.write(
            "\t".join((cluster,
                       pattern,
                       str(npresent_genes),
                       str(ntotal_genes),
                       "\t".join(map(str, genes_per_org)),
                       str(ntotal_transcripts),
                       "\t".join(map(str, count_transcripts)))) + "\n")

    # ------------------------------------------------------------------
    # write pattern summary
    # ------------------------------------------------------------------
    if options.filename_patterns:
        outfile = open(options.filename_patterns, "w")
    else:
        outfile = sys.stdout

    try:
        for col, org in enumerate(column2org):
            outfile.write("# %i = %s\n" % (col, org))

        if reference_tree:
            outfile.write("pattern\tcounts\tisok\n")
        else:
            outfile.write("pattern\tcounts\n")

        for pattern in sorted(patterns.keys()):
            if reference_tree:
                if pattern in reference_patterns:
                    is_ok = "1"
                else:
                    is_ok = "0"
                outfile.write("%s\t%s\t%s\n" %
                              (pattern, patterns[pattern], is_ok))
            else:
                outfile.write("%s\t%s\n" % (pattern, patterns[pattern]))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    # ------------------------------------------------------------------
    # write summary counts per species
    # ------------------------------------------------------------------
    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = sys.stdout

    try:
        outfile.write("species\tntranscripts\tngenes\tntrees\n")
        for species, col in org2column.items():
            counts = species_counts[col]
            outfile.write("%s\t%i\t%i\t%i\n" %
                          (species,
                           len(counts.mTranscripts),
                           len(counts.mGenes),
                           len(counts.mTrees)))
    finally:
        if outfile is not sys.stdout:
            outfile.close()

    E.Stop()