def finish(self):
    """Pick the best model among the finished likelihood jobs.

    Depending on ``self.lk_mode`` ("phyml" or "raxml"), the log-likelihood
    reported by every relevant job is read, the job's tree is annotated
    with ``lk`` and ``model`` features, and a ``[lk, model, tree]`` entry is
    collected.  The entries are sorted in ascending likelihood order and the
    model of the last (highest) one is passed, together with the whole
    sorted list, to ``ModelTesterTask.store_data``.

    Raises:
        AttributeError: if a PhyML stats file has no "Log-likelihood:" line.
        IndexError: if no job produced a likelihood value.
    """
    lks = []
    if self.lk_mode == "phyml":
        for job in self.jobs:
            # NOTE(review): in phyml mode the jobs of interest are the ones
            # tagged "bionj" -- confirm against the job factory.
            if job.jobcat != "bionj":
                continue
            tree_file = pjoin(job.jobdir,
                              self.alg_phylip_file + "_phyml_tree.txt")
            stats_file = pjoin(job.jobdir,
                               self.alg_phylip_file + "_phyml_stats.txt")
            tree = PhyloTree(tree_file)
            # Context manager so the stats file handle is always released.
            with open(stats_file) as stats:
                m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats.read())
            lk = float(m.group(1))
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.args["--model"])
            lks.append([float(tree.lk), tree.model, tree])
    elif self.lk_mode == "raxml":
        for job in self.jobs:
            if job.jobcat != "raxml":
                continue
            log_path = pjoin(job.jobdir, "RAxML_log.%s" % job.args["-n"])
            with open(log_path) as raxml_log:
                # Second whitespace-separated token of the first log line.
                lk = raxml_log.readline().split()[1]
            tree = PhyloTree(job.args["-t"])
            # The feature keeps the raw text token; the numeric value is
            # only used for ranking below.
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.model)
            lks.append([float(tree.lk), tree.model, tree])

    # Sort in ASC order.  Only likelihood and model name take part in the
    # ordering so that ties never fall through to comparing tree objects.
    lks.sort(key=lambda entry: (entry[0], entry[1]))
    # The model with the highest likelihood is the last one in the list.
    best_model = lks[-1][1]
    log.log(22, "%s model selected from the following lk values:\n%s"
            % (best_model, '\n'.join(map(str, lks))))
    ModelTesterTask.store_data(self, best_model, lks)
def finish(self):
    """Parse the jModelTest output of the last job and record the best model.

    The stdout file of ``self.jobs[-1]`` is scanned for the
    "Model selected:" section; the value after "=" on the following
    "partition" line is taken as the best model.  The newick string that
    follows "ML tree (NNI) for the best AIC model =" is loaded as a
    ``PhyloTree``.  The best model is written to ``self.best_model_file``
    and, when ``self.ttype`` is "tree", the parsed tree is written to
    ``self.tree_file``.
    """
    tree_prefix = "ML tree (NNI) for the best AIC model ="
    best_model = None
    best_model_in_next_line = False
    t = None
    # Plain "r" mode: the "U" flag was removed in Python 3.11, and each line
    # is stripped anyway, so trailing "\r" characters never survive.
    with open(self.jobs[-1].stdout_file, "r") as stdout:
        for line in stdout:
            line = line.strip()
            if best_model_in_next_line and line.startswith("Model"):
                # "Model = ..." lines inside the section are ignored; the
                # value is taken from the "partition" line instead.
                pass
            elif best_model_in_next_line and line.startswith("partition"):
                best_model = line.split("=")[1].strip()
                best_model_in_next_line = False
            elif line.startswith("Model selected:"):
                best_model_in_next_line = True
            elif line.startswith(tree_prefix):
                nw = line.replace(tree_prefix, "")
                t = PhyloTree(nw)

    with open(self.best_model_file, "w") as out:
        out.write(best_model)
    log.log(26, "Best model: %s" % best_model)
    if self.ttype == "tree":
        t.write(outfile=self.tree_file)
        # NOTE(review): the flattened source does not show whether this
        # assignment belongs inside the "tree" branch -- confirm.
        self.model = best_model

    ModelTesterTask.finish(self)
def finish(self):
    """Parse the jModelTest output of the last job and record the best model.

    The stdout file of ``self.jobs[-1]`` is scanned for the
    "Model selected:" section; the value after "=" on the following
    "partition" line is taken as the best model.  The newick string that
    follows "ML tree (NNI) for the best AIC model =" is loaded as a
    ``PhyloTree``.  The best model is written to ``self.best_model_file``
    and, when ``self.ttype`` is "tree", the parsed tree is written to
    ``self.tree_file``.
    """
    tree_prefix = "ML tree (NNI) for the best AIC model ="
    best_model = None
    best_model_in_next_line = False
    t = None
    # Plain "r" mode: the "U" flag was removed in Python 3.11, and each line
    # is stripped anyway, so trailing "\r" characters never survive.
    with open(self.jobs[-1].stdout_file, "r") as stdout:
        for line in stdout:
            line = line.strip()
            if best_model_in_next_line and line.startswith("Model"):
                # "Model = ..." lines inside the section are ignored; the
                # value is taken from the "partition" line instead.
                pass
            elif best_model_in_next_line and line.startswith("partition"):
                best_model = line.split("=")[1].strip()
                best_model_in_next_line = False
            elif line.startswith("Model selected:"):
                best_model_in_next_line = True
            elif line.startswith(tree_prefix):
                nw = line.replace(tree_prefix, "")
                t = PhyloTree(nw)

    with open(self.best_model_file, "w") as out:
        out.write(best_model)
    log.log(26, "Best model: %s" % best_model)
    if self.ttype == "tree":
        t.write(outfile=self.tree_file)
        # NOTE(review): the flattened source does not show whether this
        # assignment belongs inside the "tree" branch -- confirm.
        self.model = best_model

    ModelTesterTask.finish(self)
def finish(self):
    """Store the PhyML tree of the first job together with its likelihood.

    Reads the log-likelihood from the ``*_phyml_stats.txt`` file in the job
    directory of ``self.jobs[0]``, loads the ``*_phyml_tree.txt`` file as a
    ``PhyloTree`` and passes its serialization plus ``{"lk": <float>}`` to
    ``TreeTask.store_data``.

    Raises:
        AttributeError: if the stats file has no "Log-likelihood:" line.
    """
    j = self.jobs[0]
    tree_file = os.path.join(j.jobdir,
                             self.alg_phylip_file + "_phyml_tree.txt")
    stats_file = os.path.join(j.jobdir,
                              self.alg_phylip_file + "_phyml_stats.txt")

    # Context manager so the stats file handle is always released.
    with open(stats_file) as stats_handle:
        m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats_handle.read())
    lk = float(m.group(1))
    stats = {"lk": lk}
    tree = PhyloTree(tree_file)
    TreeTask.store_data(self, tree.write(), stats)
def finish(self):
    """Store the PhyML tree of the first job together with its likelihood.

    Reads the log-likelihood from the ``*_phyml_stats.txt`` file in the job
    directory of ``self.jobs[0]``, loads the ``*_phyml_tree.txt`` file as a
    ``PhyloTree`` and passes its serialization plus ``{"lk": <float>}`` to
    ``TreeTask.store_data``.

    Raises:
        AttributeError: if the stats file has no "Log-likelihood:" line.
    """
    j = self.jobs[0]
    tree_file = os.path.join(j.jobdir,
                             self.alg_phylip_file + "_phyml_tree.txt")
    stats_file = os.path.join(j.jobdir,
                              self.alg_phylip_file + "_phyml_stats.txt")

    # Context manager so the stats file handle is always released.
    with open(stats_file) as stats_handle:
        m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats_handle.read())
    lk = float(m.group(1))
    stats = {"lk": lk}
    tree = PhyloTree(tree_file)
    TreeTask.store_data(self, tree.write(), stats)
def finish(self):
    """Root the task tree and merge it into the main tree.

    The task tree is loaded from the database and rooted in one of three
    ways:

    * a main tree exists and no outgroup seqs were given: the partition of
      the task tree that best matches either child of the original target
      node is used as outgroup;
    * a main tree exists and outgroup seqs were given: the tree is rooted
      on those seqs, which are then detached;
    * no main tree yet (first iteration): the tree is rooted at midpoint or
      at the outgroup named by the "first_split_outgroup" global, and it
      becomes the main tree.

    In the first two cases the rooted subtree replaces the original target
    node in the main tree.  Finally every node of the main tree gets a
    ``cladeid`` feature and the result is kept in ``self.task_tree``.

    Raises:
        TaskError: if the target clade is not found in the main tree, if
            the requested outgroup is not monophyletic, or if the first
            split outgroup definition is invalid.
    """
    def euc_dist(x, y):
        # Size of the symmetric difference normalized by the summed sizes.
        return len(x.symmetric_difference(y)) / float((len(x) + len(y)))

    dataid = db.get_dataid(*self.task_tree_file.split("."))
    ttree = PhyloTree(db.get_data(dataid))
    mtree = self.main_tree
    ttree.dist = 0
    cladeid, target_seqs, out_seqs = db.get_node_info(self.threadid,
                                                      self.nodeid)
    self.out_seqs = out_seqs
    self.target_seqs = target_seqs

    ttree_content = ttree.get_cached_content()
    if mtree and not out_seqs:
        mtree_content = mtree.get_cached_content()
        log.log(24, "Finding best scoring outgroup from previous iteration.")
        orig_target = None
        for _n in mtree_content:
            if _n.cladeid == cladeid:
                orig_target = _n
        if orig_target is None:
            # Previously surfaced as an UnboundLocalError a few lines below.
            raise TaskError(
                self, "Target clade %s not found in the main tree" % cladeid)

        target_left = set(
            [_n.name for _n in mtree_content[orig_target.children[0]]])
        target_right = set(
            [_n.name for _n in mtree_content[orig_target.children[1]]])

        partition_pairs = []
        everything = set([_n.name for _n in ttree_content[ttree]])
        for n, content in six.iteritems(ttree_content):
            if n is ttree:
                continue
            left = set([_n.name for _n in content])
            right = everything - left
            d1 = euc_dist(left, target_left)
            d2 = euc_dist(left, target_right)
            best_match = min(d1, d2)
            partition_pairs.append([best_match, left, right, n])

        # Smallest distance first.
        partition_pairs.sort()

        self.outgroup_match_dist = partition_pairs[0][0]
        outgroup = partition_pairs[0][3]
        ttree.set_outgroup(outgroup)

        ttree.dist = orig_target.dist
        ttree.support = orig_target.support
        # Merge task and main trees
        parent = orig_target.up
        orig_target.detach()
        parent.add_child(ttree)

    elif mtree and out_seqs:
        log.log(26, "Rooting tree using %d custom seqs" % len(out_seqs))
        self.outgroup_match = '|'.join(out_seqs)
        if len(out_seqs) > 1:
            # First root to a single seq outside the outgroup (should never
            # fail and avoids random outgroup split problems in unrooted
            # trees).
            ttree.set_outgroup(ttree & list(target_seqs)[0])

            # Now tries to get the outgroup node as a monophyletic clade
            outgroup = ttree.get_common_ancestor(out_seqs)
            if set(outgroup.get_leaf_names()) ^ out_seqs:
                msg = "Monophyly of the selected outgroup could not be granted! Probably constrain tree failed."
                raise TaskError(self, msg)
        else:
            outgroup = ttree & list(out_seqs)[0]

        ttree.set_outgroup(outgroup)
        orig_target = self.main_tree.get_common_ancestor(target_seqs)
        # NOTE(review): the value is never used; the lookup is kept in case
        # callers rely on it failing when the outgroup has no sister.
        found_target = outgroup.get_sisters()[0]

        ttree = ttree.get_common_ancestor(target_seqs)
        outgroup.detach()
        self.pre_iter_support = orig_target.support
        # Use previous dist and support
        ttree.dist = orig_target.dist
        ttree.support = orig_target.support
        parent = orig_target.up
        orig_target.detach()
        parent.add_child(ttree)

    else:
        # ROOTS FIRST ITERATION
        log.log(24, "Getting outgroup for first NPR split")

        # if early split is provided in the command line, it
        # overrides config file
        mainout = GLOBALS.get("first_split_outgroup", "midpoint")

        if mainout.lower() == "midpoint":
            log.log(26, "Rooting to midpoint.")
            best_outgroup = ttree.get_midpoint_outgroup()
            if best_outgroup:
                ttree.set_outgroup(best_outgroup)
            else:
                log.warning("Midpoint outgroup could not be set!")
                ttree.set_outgroup(next(ttree.iter_leaves()))
        else:
            if mainout.startswith("~"):
                # Lazy defined outgroup. Will trust in the common
                # ancestor of two or more OTUs
                strict_common_ancestor = False
                outs = set(mainout[1:].split())
                if len(outs) < 2:
                    raise TaskError(
                        self, "First split outgroup error: common "
                        "ancestor calculation requires at least two OTU names")
            else:
                strict_common_ancestor = True
                outs = set(mainout.split())

            if outs - target_seqs:
                raise TaskError(
                    self,
                    "Unknown seqs cannot be used to set first split rooting:%s"
                    % (outs - target_seqs))

            if len(outs) > 1:
                # Root first on any seq outside the requested outgroup.
                anchor = list(set(target_seqs) - outs)[0]
                ttree.set_outgroup(ttree & anchor)
                common = ttree.get_common_ancestor(outs)
                out_seqs = common.get_leaf_names()
                if common is ttree:
                    msg = "First split outgroup could not be granted:%s" % out_seqs
                    raise TaskError(self, msg)
                if strict_common_ancestor and set(out_seqs) ^ outs:
                    msg = "Monophyly of first split outgroup could not be granted:%s" % out_seqs
                    raise TaskError(self, msg)
                log.log(26, "@@8:First split rooting to %d seqs@@1:: %s"
                        % (len(out_seqs), out_seqs))
                ttree.set_outgroup(common)
            else:
                single_out = outs.pop()
                ttree.set_outgroup(single_out)
                log.log(26, "@@8:First split rooting to 1 seq@@1:: %s"
                        % (single_out))

        self.main_tree = ttree
        orig_target = ttree

    tn = orig_target.copy()
    self.pre_iter_task_tree = tn
    self.rf = orig_target.robinson_foulds(ttree)
    self.pre_iter_support = orig_target.support

    # Reloads node2content of the rooted tree and generate cladeids
    ttree_content = self.main_tree.get_cached_content()
    for n, content in six.iteritems(ttree_content):
        cid = generate_id([_n.name for _n in content])
        n.add_feature("cladeid", cid)

    self.task_tree = ttree
def finish(self):
    """Root the task tree and merge it into the main tree.

    The task tree is loaded from the database and rooted in one of three
    ways:

    * a main tree exists and no outgroup seqs were given: the partition of
      the task tree that best matches either child of the original target
      node is used as outgroup;
    * a main tree exists and outgroup seqs were given: the tree is rooted
      on those seqs, which are then detached;
    * no main tree yet (first iteration): the tree is rooted at midpoint or
      at the outgroup named by the "first_split_outgroup" global, and it
      becomes the main tree.

    In the first two cases the rooted subtree replaces the original target
    node in the main tree.  Finally every node of the main tree gets a
    ``cladeid`` feature and the result is kept in ``self.task_tree``.

    Raises:
        TaskError: if the target clade is not found in the main tree, if
            the requested outgroup is not monophyletic, or if the first
            split outgroup definition is invalid.
    """
    def euc_dist(x, y):
        # Size of the symmetric difference normalized by the summed sizes.
        return len(x.symmetric_difference(y)) / float((len(x) + len(y)))

    dataid = db.get_dataid(*self.task_tree_file.split("."))
    ttree = PhyloTree(db.get_data(dataid))
    mtree = self.main_tree
    ttree.dist = 0
    cladeid, target_seqs, out_seqs = db.get_node_info(self.threadid,
                                                      self.nodeid)
    self.out_seqs = out_seqs
    self.target_seqs = target_seqs

    ttree_content = ttree.get_cached_content()
    if mtree and not out_seqs:
        mtree_content = mtree.get_cached_content()
        log.log(24, "Finding best scoring outgroup from previous iteration.")
        orig_target = None
        for _n in mtree_content:
            if _n.cladeid == cladeid:
                orig_target = _n
        if orig_target is None:
            # Previously surfaced as an UnboundLocalError a few lines below.
            raise TaskError(
                self, "Target clade %s not found in the main tree" % cladeid)

        target_left = set(
            [_n.name for _n in mtree_content[orig_target.children[0]]])
        target_right = set(
            [_n.name for _n in mtree_content[orig_target.children[1]]])

        partition_pairs = []
        everything = set([_n.name for _n in ttree_content[ttree]])
        for n, content in six.iteritems(ttree_content):
            if n is ttree:
                continue
            left = set([_n.name for _n in content])
            right = everything - left
            d1 = euc_dist(left, target_left)
            d2 = euc_dist(left, target_right)
            best_match = min(d1, d2)
            partition_pairs.append([best_match, left, right, n])

        # Smallest distance first.
        partition_pairs.sort()

        self.outgroup_match_dist = partition_pairs[0][0]
        outgroup = partition_pairs[0][3]
        ttree.set_outgroup(outgroup)

        ttree.dist = orig_target.dist
        ttree.support = orig_target.support
        # Merge task and main trees
        parent = orig_target.up
        orig_target.detach()
        parent.add_child(ttree)

    elif mtree and out_seqs:
        log.log(26, "Rooting tree using %d custom seqs" % len(out_seqs))
        self.outgroup_match = '|'.join(out_seqs)
        if len(out_seqs) > 1:
            # First root to a single seq outside the outgroup (should never
            # fail and avoids random outgroup split problems in unrooted
            # trees).
            ttree.set_outgroup(ttree & list(target_seqs)[0])

            # Now tries to get the outgroup node as a monophyletic clade
            outgroup = ttree.get_common_ancestor(out_seqs)
            if set(outgroup.get_leaf_names()) ^ out_seqs:
                msg = "Monophyly of the selected outgroup could not be granted! Probably constrain tree failed."
                raise TaskError(self, msg)
        else:
            outgroup = ttree & list(out_seqs)[0]

        ttree.set_outgroup(outgroup)
        orig_target = self.main_tree.get_common_ancestor(target_seqs)
        # NOTE(review): the value is never used; the lookup is kept in case
        # callers rely on it failing when the outgroup has no sister.
        found_target = outgroup.get_sisters()[0]

        ttree = ttree.get_common_ancestor(target_seqs)
        outgroup.detach()
        self.pre_iter_support = orig_target.support
        # Use previous dist and support
        ttree.dist = orig_target.dist
        ttree.support = orig_target.support
        parent = orig_target.up
        orig_target.detach()
        parent.add_child(ttree)

    else:
        # ROOTS FIRST ITERATION
        log.log(24, "Getting outgroup for first NPR split")

        # if early split is provided in the command line, it
        # overrides config file
        mainout = GLOBALS.get("first_split_outgroup", "midpoint")

        if mainout.lower() == "midpoint":
            log.log(26, "Rooting to midpoint.")
            best_outgroup = ttree.get_midpoint_outgroup()
            if best_outgroup:
                ttree.set_outgroup(best_outgroup)
            else:
                log.warning("Midpoint outgroup could not be set!")
                ttree.set_outgroup(next(ttree.iter_leaves()))
        else:
            if mainout.startswith("~"):
                # Lazy defined outgroup. Will trust in the common
                # ancestor of two or more OTUs
                strict_common_ancestor = False
                outs = set(mainout[1:].split())
                if len(outs) < 2:
                    raise TaskError(
                        self, "First split outgroup error: common "
                        "ancestor calculation requires at least two OTU names")
            else:
                strict_common_ancestor = True
                outs = set(mainout.split())

            if outs - target_seqs:
                raise TaskError(
                    self,
                    "Unknown seqs cannot be used to set first split rooting:%s"
                    % (outs - target_seqs))

            if len(outs) > 1:
                # Root first on any seq outside the requested outgroup.
                anchor = list(set(target_seqs) - outs)[0]
                ttree.set_outgroup(ttree & anchor)
                common = ttree.get_common_ancestor(outs)
                out_seqs = common.get_leaf_names()
                if common is ttree:
                    msg = "First split outgroup could not be granted:%s" % out_seqs
                    raise TaskError(self, msg)
                if strict_common_ancestor and set(out_seqs) ^ outs:
                    msg = "Monophyly of first split outgroup could not be granted:%s" % out_seqs
                    raise TaskError(self, msg)
                log.log(26, "@@8:First split rooting to %d seqs@@1:: %s"
                        % (len(out_seqs), out_seqs))
                ttree.set_outgroup(common)
            else:
                single_out = outs.pop()
                ttree.set_outgroup(single_out)
                log.log(26, "@@8:First split rooting to 1 seq@@1:: %s"
                        % (single_out))

        self.main_tree = ttree
        orig_target = ttree

    tn = orig_target.copy()
    self.pre_iter_task_tree = tn
    self.rf = orig_target.robinson_foulds(ttree)
    self.pre_iter_support = orig_target.support

    # Reloads node2content of the rooted tree and generate cladeids
    ttree_content = self.main_tree.get_cached_content()
    for n, content in six.iteritems(ttree_content):
        cid = generate_id([_n.name for _n in content])
        n.add_feature("cladeid", cid)

    self.task_tree = ttree
def finish(self):
    """Write out the model (and tree) with the highest likelihood.

    Depending on ``self.lk_mode`` ("phyml" or "raxml"), the log-likelihood
    of every job flagged accordingly is read and its tree annotated with
    ``lk`` and ``model`` features.  The model with the highest likelihood
    is written to ``self.best_model_file``; when ``self.tree_file`` is set,
    the matching tree is written there as well.

    Raises:
        AttributeError: if a PhyML stats file has no "Log-likelihood:" line.
        IndexError: if no job produced a likelihood value.
    """
    lks = []
    if self.lk_mode == "phyml":
        for job in [j for j in self.jobs if j.flag == "phyml"]:
            tree_file = os.path.join(job.jobdir,
                                     self.alg_basename + "_phyml_tree.txt")
            # Must come from the current job: the previous code used the
            # comprehension variable ``j``, which is undefined on Python 3
            # and points at the last matching job on Python 2.
            stats_file = os.path.join(job.jobdir,
                                      self.alg_basename + "_phyml_stats.txt")
            tree = PhyloTree(tree_file)
            with open(stats_file) as stats:
                m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats.read())
            lk = float(m.group(1))
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.args["--model"])
            lks.append([float(tree.lk), tree.model, tree])
    elif self.lk_mode == "raxml":
        for job in [j for j in self.jobs if j.flag == "raxml"]:
            log_path = os.path.join(job.jobdir,
                                    "RAxML_log.%s" % job.args["-n"])
            with open(log_path) as raxml_log:
                # Second whitespace-separated token of the first log line.
                lk = raxml_log.readline().split()[1]
            tree = PhyloTree(job.args["-t"])
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.model)
            # Rank numerically; the raw token is text and would otherwise
            # be ordered lexicographically.
            lks.append([float(tree.lk), tree.model, tree])

    # Ascending order on (likelihood, model), so the last entry is the one
    # with the highest likelihood and trees are never compared.
    lks.sort(key=lambda entry: (entry[0], entry[1]))
    best_model = lks[-1][1]
    best_tree = lks[-1][2]
    with open(self.best_model_file, "w") as out:
        out.write(best_model)
    if self.tree_file:
        # Write the winning tree (not the last one parsed), and pass the
        # path explicitly as ``outfile``.
        best_tree.write(outfile=self.tree_file)
    ModelTesterTask.finish(self)
def finish(self):
    """Pick the best model among the finished likelihood jobs.

    Depending on ``self.lk_mode`` ("phyml" or "raxml"), the log-likelihood
    reported by every relevant job is read, the job's tree is annotated
    with ``lk`` and ``model`` features, and a ``[lk, model, tree]`` entry is
    collected.  The entries are sorted in ascending likelihood order and the
    model of the last (highest) one is passed, together with the whole
    sorted list, to ``ModelTesterTask.store_data``.

    Raises:
        AttributeError: if a PhyML stats file has no "Log-likelihood:" line.
        IndexError: if no job produced a likelihood value.
    """
    lks = []
    if self.lk_mode == "phyml":
        for job in self.jobs:
            # NOTE(review): in phyml mode the jobs of interest are the ones
            # tagged "bionj" -- confirm against the job factory.
            if job.jobcat != "bionj":
                continue
            tree_file = pjoin(job.jobdir,
                              self.alg_phylip_file + "_phyml_tree.txt")
            stats_file = pjoin(job.jobdir,
                               self.alg_phylip_file + "_phyml_stats.txt")
            tree = PhyloTree(tree_file)
            # Context manager so the stats file handle is always released.
            with open(stats_file) as stats:
                m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats.read())
            lk = float(m.group(1))
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.args["--model"])
            lks.append([float(tree.lk), tree.model, tree])
    elif self.lk_mode == "raxml":
        for job in self.jobs:
            if job.jobcat != "raxml":
                continue
            log_path = pjoin(job.jobdir, "RAxML_log.%s" % job.args["-n"])
            with open(log_path) as raxml_log:
                # Second whitespace-separated token of the first log line.
                lk = raxml_log.readline().split()[1]
            tree = PhyloTree(job.args["-t"])
            # The feature keeps the raw text token; the numeric value is
            # only used for ranking below.
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.model)
            lks.append([float(tree.lk), tree.model, tree])

    # Sort in ASC order.  Only likelihood and model name take part in the
    # ordering so that ties never fall through to comparing tree objects.
    lks.sort(key=lambda entry: (entry[0], entry[1]))
    # The model with the highest likelihood is the last one in the list.
    best_model = lks[-1][1]
    log.log(22, "%s model selected from the following lk values:\n%s"
            % (best_model, '\n'.join(map(str, lks))))
    ModelTesterTask.store_data(self, best_model, lks)
def finish(self):
    """Write out the model (and tree) with the highest likelihood.

    Depending on ``self.lk_mode`` ("phyml" or "raxml"), the log-likelihood
    of every job flagged accordingly is read and its tree annotated with
    ``lk`` and ``model`` features.  The model with the highest likelihood
    is written to ``self.best_model_file``; when ``self.tree_file`` is set,
    the matching tree is written there as well.

    Raises:
        AttributeError: if a PhyML stats file has no "Log-likelihood:" line.
        IndexError: if no job produced a likelihood value.
    """
    lks = []
    if self.lk_mode == "phyml":
        for job in [j for j in self.jobs if j.flag == "phyml"]:
            tree_file = os.path.join(job.jobdir,
                                     self.alg_basename + "_phyml_tree.txt")
            # Must come from the current job: the previous code used the
            # comprehension variable ``j``, which is undefined on Python 3
            # and points at the last matching job on Python 2.
            stats_file = os.path.join(job.jobdir,
                                      self.alg_basename + "_phyml_stats.txt")
            tree = PhyloTree(tree_file)
            with open(stats_file) as stats:
                m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', stats.read())
            lk = float(m.group(1))
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.args["--model"])
            lks.append([float(tree.lk), tree.model, tree])
    elif self.lk_mode == "raxml":
        for job in [j for j in self.jobs if j.flag == "raxml"]:
            log_path = os.path.join(job.jobdir,
                                    "RAxML_log.%s" % job.args["-n"])
            with open(log_path) as raxml_log:
                # Second whitespace-separated token of the first log line.
                lk = raxml_log.readline().split()[1]
            tree = PhyloTree(job.args["-t"])
            tree.add_feature("lk", lk)
            tree.add_feature("model", job.model)
            # Rank numerically; the raw token is text and would otherwise
            # be ordered lexicographically.
            lks.append([float(tree.lk), tree.model, tree])

    # Ascending order on (likelihood, model), so the last entry is the one
    # with the highest likelihood and trees are never compared.
    lks.sort(key=lambda entry: (entry[0], entry[1]))
    best_model = lks[-1][1]
    best_tree = lks[-1][2]
    with open(self.best_model_file, "w") as out:
        out.write(best_model)
    if self.tree_file:
        # Write the winning tree (not the last one parsed), and pass the
        # path explicitly as ``outfile``.
        best_tree.write(outfile=self.tree_file)
    ModelTesterTask.finish(self)