def update(self, stream=None, msg="progress %2.0f%%"):
    self.pos += 1
    if self.pos > self.prog:
        self.prog += self.step * self.end
        # 100.0 forces float division under Python 2
        if stream is not None:
            print >> stream, msg % (100.0 * self.pos / self.end)
        else:
            util.log(msg % (100.0 * self.pos / self.end))
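# A minimal, self-contained usage sketch for update() above. The constructor
# here is an assumption inferred from the attributes update() touches (pos,
# prog, step, end); the real Progress class is not shown in this snippet.
import sys

class ProgressSketch(object):
    def __init__(self, end, step=0.1):
        self.end = float(end)  # total expected number of update() calls
        self.step = step       # log once per `step` fraction of the total
        self.pos = 0           # calls seen so far
        self.prog = 0.0        # next logging threshold, in calls

    def update(self, stream=None, msg="progress %2.0f%%"):
        self.pos += 1
        if self.pos > self.prog:
            self.prog += self.step * self.end
            if stream is not None:
                print >> stream, msg % (100.0 * self.pos / self.end)

p = ProgressSketch(200)
for i in xrange(200):
    p.update(stream=sys.stderr)  # emits a line roughly every 10% of the loop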
def __init__(self, *args, **dargs):
    Progress.__init__(self, *args)
    self.width = 60
    # 1.0 (not 1) so integer division does not zero the step under Python 2;
    # each step advances the bar by one character
    self.step = 1.0 / self.width
    self.bar = 0

    if "title" in dargs:
        title = dargs["title"]
    else:
        title = "progress"

    util.log("+-" + title + ("-" * (self.width - len(title) - 1)) + "+")
    util.indent()
    util.logExact("|")
    self.printBar()
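# ProgressBar sets step = 1.0 / width so that each update() threshold maps to
# one character of the bar. A hypothetical sketch of the printBar() hook
# called above (the real implementation is not shown in this snippet):

def printBar(self):
    # number of bar characters that should be visible by now
    visible = int(self.width * self.pos / self.end)
    if visible > self.bar:
        util.logExact("=" * (visible - self.bar))
        self.bar = visible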
def blast(prog, databaseFile, queryFile, options="", split=100, resume=None):
    """Executes blastall (program `prog`) in several smaller batches"""

    if not split:
        # do blasting in one call
        pipe = os.popen("blastall -p %s -d %s -i %s -m 8 %s" %
                        (prog, databaseFile, queryFile, options))
        return BlastReader(pipe)
    else:
        # NOTE: split the query file into about 100 sequences each.
        # This is a workaround for an ncbi blastall 2.2.10 problem with
        # outputting in -m 8 mode; the error was
        # "BioseqFindFunc: couldn't uncache".
        seqs = fasta.read_fasta(queryFile)
        closure = {"index": 0,
                   "oldtmp": None,
                   "time": 0.0}

        if resume:
            try:
                closure["index"] = seqs.keys().index(resume)
                util.log("resuming with query '%s' (%d of %d)" %
                         (resume, closure["index"], len(seqs.keys())))
            except ValueError:
                raise Exception(
                    "Could not resume from last query sequence '%s'" % resume)

        def processFunc():
            # remove old query tempfile if one exists
            if closure["oldtmp"] is not None:
                os.remove(closure["oldtmp"])
                elapse = util.toc()
                closure["time"] += elapse

                util.log("blasted %d of %d sequences (%.1f%%), "
                         "elapse %.0f m, left %.0f m" % (
                             closure["index"], len(seqs.keys()),
                             100 * float(closure["index"]) / len(seqs.keys()),
                             closure["time"] / 60.0,
                             elapse / split *
                             (len(seqs.keys()) - closure["index"]) / 60.0))

            util.tic()

            # find new subset of query sequences
            i = closure["index"]
            names = seqs.keys()[i:i+split]

            # if no more sequences then quit
            if len(names) == 0:
                return False

            # start blast
            tmpfile = util.tempfile(".", "blastp", ".fasta")
            seqs.write(tmpfile, names=names)
            pipe = os.popen("blastall -p %s -d %s -i %s -m 8 -e .1 %s" %
                            (prog, databaseFile, tmpfile, options))

            # update variables
            closure["oldtmp"] = tmpfile
            closure["index"] = i + split

            return pipe

        pipe = processFunc()

        if pipe:
            return BlastReader(pipe, processFunc)
        else:
            # no queries left to blast; return a reader on an empty pipe
            return BlastReader(os.popen("less"))
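# Usage sketch for blast(). The database and query file names below are
# placeholders, and the iteration protocol is an assumption: BlastReader is
# taken to yield the tab-separated -m 8 rows of each batch, invoking
# processFunc for a fresh pipe whenever the current batch is exhausted.

reader = blast("blastp", "my_database", "queries.fasta", split=50)
for hit in reader:
    # -m 8 columns start with: query id, subject id, percent identity
    print hit[0], hit[1], hit[2]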
adef = raxml.new_analdef()
raxml.init_adef(adef)
tr = raxml.new_tree()
cmd = "raxmlHPC -t %s -s %s %s" % (treefile, seqfile, options.extra)
raxml.init_program(adef, tr, cmd.split(" "))

util.tic("Optimizing model...")
raxml.optimize_model(adef, tr)
util.toc()

# draw_raxml_tree(tr, adef)

util.tic("Getting parameters for LH...")
bestVector, bestLH, weightSum = raxml.compute_best_LH(tr)
util.log("bestLH: %.3f" % bestLH)
util.toc()

tree = treelib.read_tree(treefile)
for node in tree:
    node.dist = 0
    if "boot" in node.data:
        del node.data["boot"]
treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
treehashes = set([treehash])

for i in xrange(options.niter):
    # propose SPR moves until we reach a topology not yet visited
    while treehash in treehashes:
        util.log("random spr")
        node1, node2 = phylo.propose_random_spr(tree)
        phylo.perform_spr(tree, node1, node2)
        treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True))
    treehashes.add(treehash)
util.tic("Initializing RAXML and optimizing...") module = raxml.RAxML() module.optimize_model(treefile, seqfile, options.extra) util.toc() tree = treelib.read_tree(treefile) for node in tree: node.dist = 0 if "boot" in node.data: del node.data["boot"] treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True)) treehashes = set([treehash]) for i in xrange(options.niter): while treehash in treehashes: util.log("random spr") node1, node2 = phylo.propose_random_spr(tree) phylo.perform_spr(tree, node1, node2) treehash = phylo.hash_tree(treelib.unroot(tree, newCopy=True)) treehashes.add(treehash) tree.write(out, oneline=True); out.write('\n'); out.flush() util.tic("Computing LH...") p, Dlnl = module.compute_lik_test(tree) util.log("pvalue: %.3f, Dlnl: %.3f" % (p, Dlnl)) util.toc() if Dlnl <= 0: util.log("worse likelihood?: %s" % False) # better topology (higher likelihood) else:
def prob_gene_species_alignment_recon(alnfile, partfile, stree, popsizes,
                                      duprate, lossrate, subrate, beta,
                                      pretime, premean,
                                      coal_tree, coal_recon, nsamples_coal,
                                      locus_tree, locus_recon, nsamples_locus,
                                      daughters, rates, freqs, alphas,
                                      threads=1, seed=ALIGNMENT_SEED,
                                      eps=0.1, info=None):
    """
    Evaluate terms that depend on T^G and R^G.

    That is, fix T^L, R^L, and daughters and evaluate the double integral:

        int int P(t^L | T^L, R^L, S, theta)
                * P(T^G, R^G, t^G | t^L, T^L, daughters, R^L, theta)
                * P(A | T^G, t^G) dt^L dt^G

    This is the probability used in the searching process.

    alnfile        -- alignment file
    partfile       -- partition file
    stree          -- species tree
    popsizes       -- population sizes in species tree
    duprate        -- duplication rate
    lossrate       -- loss rate
    subrate        -- substitution rate
    beta           -- regularization parameter
    pretime        -- starting time before species tree
    premean        -- mean starting time before species tree
    coal_tree      -- coalescent tree
    coal_recon     -- reconciliation of coalescent tree to locus tree
    nsamples_coal  -- number of times to sample coal times t^G
    locus_tree     -- locus tree (has dup-loss)
    locus_recon    -- reconciliation of locus tree to species tree
    nsamples_locus -- number of times to sample the locus tree times t^L
    daughters      -- daughter nodes
    rates, freqs, alphas -- optimization parameters
    """
    locus_events = phylo.label_events(locus_tree, locus_recon)

    # rates, freqs, alphas are passed in; they could instead be optimized here:
    # rates, freqs, alphas = pllprob.optimize_parameters(
    #     alnfile, partfile, coal_tree, threads=threads, seed=seed, eps=eps)

    # double integral
    double_integral_list = []
    double_integral = 0.0

    util.tic("recon prob")
    for i in xrange(nsamples_locus):
        # sample t^L; the unit should be in myr
        locus_times = duploss.sample_dup_times(
            locus_tree, stree, locus_recon, duprate, lossrate,
            pretime, premean, events=locus_events)
        treelib.set_dists_from_timestamps(locus_tree, locus_times)

        # calculate P(T^G, R^G | T^L, t^L, daughters, theta)
        topology_prob = prob_locus_coal_recon_topology(
            coal_tree, coal_recon, locus_tree, popsizes, daughters)

        # for a fixed t^L, compute coal_prob: sample t^G for the topology and
        # compute the probability of observing the alignment by Monte Carlo
        # integration
        coal_prob = 0.0
        alignment_prob_MonteCarlo = 0.0
        alignment_prob_list = []

        # check the probability of the lineage counts for this locus tree
        zero_lineage_prob = False

        lineages = coal.count_lineages_per_branch(coal_tree, coal_recon,
                                                  locus_tree)
        for lnode in locus_tree:
            bottom_num, top_num = lineages[lnode]
            if lnode.parent:
                T = lnode.dist
            else:
                T = util.INF
            lineage_prob = prob_coal_counts(bottom_num, top_num, T, popsizes)

            # set zero_lineage_prob = True if one lineage returns
            # zero probability
            if lineage_prob == 0.0:
                zero_lineage_prob = True

        # if any lineage_prob is zero, coal_prob is zero
        if zero_lineage_prob:
            coal_prob = -float("inf")
        # otherwise, we calculate the coal_prob
        else:
            for j in xrange(nsamples_coal):
                # sample coal times and set the coal_tree accordingly.
                # locus tree branch lengths are in myr, so make sure the
                # input popsizes are scaled to fit the time unit
                # (typically myr)
                try:
                    sample_coal_times_topology(coal_tree, coal_recon,
                                               locus_tree, popsizes)
                except (ZeroDivisionError, ValueError):
                    # bad sample; skip it
                    util.log("bad sample")
                    continue

                # (log) probability of observing the alignment:
                # convert branch lengths from myr to sub/site
                for node in coal_tree:
                    node.dist *= subrate

                # weight by the regularization parameter beta
                alignment_prob = beta * prob_alignment(
                    alnfile, partfile, coal_tree, rates, freqs, alphas,
                    threads=threads, seed=seed, eps=eps)
                alignment_prob_list.append(alignment_prob)

            # log_sum_exp: exponentiate the log probabilities of observing
            # the alignment, add them up, and take the log again
            if len(alignment_prob_list) == 0:
                # all samples were bad
                alignment_prob_MonteCarlo = -util.INF
            else:
                alignment_prob_MonteCarlo = (log_sum_exp(alignment_prob_list)
                                             - log(nsamples_coal))

            # P(T^G, R^G | T^L, t^L, daughters, theta)
            #     * int P(t^G | ~) * P(A | T^G, t^G) dt^G
            # coal_prob is a log probability
            coal_prob += topology_prob + alignment_prob_MonteCarlo

        # add coal probability to a list for further processing
        double_integral_list.append(coal_prob)

    # average over the locus time samples, again via log_sum_exp
    double_integral = log_sum_exp(double_integral_list) - log(nsamples_locus)

    # logging info
    if info is not None:
        info["topology_prob"] = topology_prob  # one sample of t^L
        # one sample of t^L, averaged over t^G
        info["alignment_prob"] = alignment_prob_MonteCarlo
        info["coal_prob"] = double_integral
    util.toc()

    return double_integral
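# prob_gene_species_alignment_recon() relies on a log_sum_exp helper that is
# not shown in this snippet. The module's actual helper may differ; this is
# the standard numerically stable formulation:
from math import exp, log

def log_sum_exp(vals):
    """Return log(sum(exp(v) for v in vals)) without underflow.

    Shifting by the maximum keeps at least one exponent at 0, so the sum
    never underflows to zero even when every v is a large negative log.
    """
    m = max(vals)
    if m == -float("inf"):
        # every sample had zero probability
        return -float("inf")
    return m + log(sum(exp(v - m) for v in vals))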