def load_stored_data(self):
    self.cogs = db.get_task_data(self.taskid, DATATYPES.cogs)
    self.cog_analysis = db.get_task_data(self.taskid, DATATYPES.cog_analysis)

def load_stored_data(self):
    # self.tree_file = db.get_dataid(self.taskid, DATATYPES.tree)
    self.stats = db.get_task_data(self.taskid, DATATYPES.tree_stats)

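# Note (inferred from how the db layer is used elsewhere in this file,
# hedged): the commented-out line above uses db.get_dataid(), which returns
# only a data identifier, whereas db.get_task_data() returns the stored
# value directly. A dataid must be dereferenced explicitly, e.g.:
#
#   raw_tree = db.get_data(db.get_dataid(self.taskid, DATATYPES.tree))
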
def load_stored_data(self):
    self.best_model = db.get_task_data(self.taskid, DATATYPES.best_model)
    self.model_ranking = db.get_task_data(self.taskid, DATATYPES.model_ranking)

def load_stored_data(self):
    self.kept_columns[:] = []  # clear list
    self.kept_columns.append(db.get_task_data(self.taskid,
                                              DATATYPES.kept_alg_columns))

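# Hedged sketch, not part of the pipeline: the four loaders above all follow
# the same symmetric pattern, restoring with db.get_task_data() whatever the
# task previously persisted under the same taskid/datatype pair. A
# hypothetical task attribute `foo`, stored under a hypothetical tag
# DATATYPES.foo, would be handled the same way:
#
#   db.add_task_data(self.taskid, DATATYPES.foo, self.foo)    # persist
#   self.foo = db.get_task_data(self.taskid, DATATYPES.foo)   # restore
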
def split_tree(task_tree_node, task_outgroups, main_tree, alg_path, npr_conf,
               threadid, target_cladeids):
    """Browses a task tree from root to leaves and yields the next nodes
    suitable for NPR iterations. Each yielded node comes with its set of
    target and outgroup tips.
    """

    def processable_node(_n):
        """Internal function that returns True if a given node is suitable
        for an NPR iteration. It can be used as "is_leaf_fn" when traversing
        a tree.

        Note that this function uses several variables that change within
        the split_tree function, so it must be kept within its namespace.
        """
        is_leaf = False
        for wkname, wkfilter in npr_conf.npr_workflows:
            # If the node is not among the targets, or does not meet the
            # size filters, skip this workflow
            if _n is master_node or \
               (_TARGET_NODES and _n not in _TARGET_NODES) or \
               (target_cladeids and _n.cladeid not in target_cladeids) or \
               len(n2content[_n]) < max(wkfilter.get("min_size", 3), 3) or \
               ("max_size" in wkfilter and len(n2content[_n]) > wkfilter["max_size"]):
                continue

            # If a seq_sim filter is used, calculate the node stats
            if ALG and ("min_seq_sim" in wkfilter or "max_seq_sim" in wkfilter):
                if not hasattr(_n, "seqs_mean_ident"):
                    log.log(20, "Calculating node sequence stats...")
                    mx, mn, avg, std = get_seqs_identity(
                        ALG, [__n.name for __n in n2content[_n]])
                    _n.add_features(seqs_max_ident=mx, seqs_min_ident=mn,
                                    seqs_mean_ident=avg, seqs_std_ident=std)
                    log.log(20, "mx=%s, mn=%s, avg=%s, std=%s" % (mx, mn, avg, std))

                # Guard each bound separately, since only one of them may be
                # set in the workflow filter
                if "min_seq_sim" in wkfilter and \
                   _n.seqs_mean_ident < wkfilter["min_seq_sim"]:
                    continue
                if "max_seq_sim" in wkfilter and \
                   _n.seqs_mean_ident > wkfilter["max_seq_sim"]:
                    continue
            else:
                _n.add_features(seqs_max_ident=None, seqs_min_ident=None,
                                seqs_mean_ident=None, seqs_std_ident=None)

            if "min_support" in wkfilter:
                # If we are optimizing only lowly supported nodes, and nodes
                # are optimized without an outgroup, our target node is
                # actually the parent of the lowly supported nodes.
                # Therefore, check whether support is low in the children
                # nodes, and return this node if so.
                if not npr_conf.use_outgroup:
                    if not [_ch for _ch in _n.children
                            if _ch.support <= wkfilter["min_support"]]:
                        continue
                # Otherwise, just skip the node if its support is above the
                # minimum
                elif _n.support > wkfilter["min_support"]:
                    continue

            # At this point the node has passed all the filters of this
            # workflow, so it can be optimized
            is_leaf = True
            _n._target_wkname = wkname
            break

        return is_leaf

    log.log(20, "Loading tree content...")
    n2content = main_tree.get_cached_content()
    if alg_path:
        log.log(20, "Loading associated alignment to check seq. similarity")
        raw_alg = db.get_task_data(*alg_path.split("."))
        ALG = SeqGroup(raw_alg)
    else:
        ALG = None

    log.log(20, "Finding next NPR nodes...")
    # task_tree_node is actually a node in main_tree, since it has already
    # been merged
    trees_to_browse = [task_tree_node]
    npr_nodes = 0
    # Load the current tree content, so we can check that we are not
    # reconstructing exactly the same tree
    tasktree_content = set([leaf.name for leaf in n2content[task_tree_node]]) | \
        set(task_outgroups)
    while trees_to_browse:
        master_node = trees_to_browse.pop()

        # If custom taxa levels are defined as targets, find them in this
        # subtree
        _TARGET_NODES = defaultdict(list)  # used by the processable_node function
        opt_levels = GLOBALS[threadid].get('_optimized_levels', None)
        if opt_levels is not None:
            # Any descendant of the already processed node is suitable for
            # selection. If the ancestor of the level species sits above the
            # task_tree_node, it will be discarded
            avail_nodes = set(master_node.get_descendants())
            for lin in opt_levels:
                sp2lin, lin2sp = GLOBALS["lineages"]
                optimized, strict_monophyly = opt_levels[lin]
                if not optimized:
                    ancestor = main_tree.get_common_ancestor(*lin2sp[lin])
                    if ancestor in avail_nodes:
                        # Check that the node satisfies the level monophyly
                        # config
                        ancestor_content = set([x.name for x in n2content[ancestor]])
                        if not strict_monophyly or lin2sp[lin] == ancestor_content:
                            _TARGET_NODES[ancestor].append(lin)
                        elif strict_monophyly:
                            log.log(26, "Discarding non-monophyletic level @@11:%s@@1:" % lin)
                    else:
                        log.log(26, "Discarding upper clade @@11:%s@@1:" % lin)

        for node in master_node.iter_leaves(is_leaf_fn=processable_node):
            if opt_levels:
                log.log(28, "Trying to optimize custom tree level: @@11:%s@@1:" % _TARGET_NODES[node])
                for lin in _TARGET_NODES[node]:
                    # Mark the level as optimized, so it is not computed again
                    opt_levels[lin][0] = True

            log.log(28, "Found possible target node of size %s with branch support %f" %
                    (len(n2content[node]), node.support))
            log.log(28, "First suitable workflow: %s" % (node._target_wkname))

            # Find the best outgroup for the target node
            if npr_conf.use_outgroup:
                splitterconfname, _ = npr_conf.tree_splitter
                splitterconf = GLOBALS[threadid][splitterconfname]
                # seqs, outs = select_outgroups(node, n2content, splitterconf)
                # seqs, outs = select_closest_outgroup(node, n2content, splitterconf)
                seqs, outs = select_sister_outgroup(node, n2content, splitterconf)
            else:
                seqs = set([_i.name for _i in n2content[node]])
                outs = set()

            if seqs | outs == tasktree_content:
                log.log(26, "Discarding target node of size %s, due to identity with its parent node" %
                        len(n2content[node]))
                # print(tasktree_content)
                # print(seqs)
                # print(outs)
                trees_to_browse.append(node)
            else:
                npr_nodes += 1
                yield node, seqs, outs, node._target_wkname

    log.log(28, "%s nodes will be optimized", npr_nodes)

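# Hedged usage sketch (hypothetical driver code, not part of this module):
# split_tree() is a generator, so a scheduler would consume it lazily and
# turn every yielded tuple into a new NPR iteration. Assuming an Msf-like
# multi-sequence task class, as used elsewhere in the pipeline:
#
#   for node, seqs, outs, wkname in split_tree(task_tree_node, task_outgroups,
#                                              main_tree, alg_path, npr_conf,
#                                              threadid, target_cladeids=None):
#       # seqs/outs are tip-name sets; wkname selects the workflow config
#       new_iterations.append((Msf(seqs, outs, seqtype=source_seqtype), wkname))
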
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows dumping algs in workflows with no tree tasks
        treebuilderclass = DummyTree
    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  # node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping algs in workflows with no tree tasks, or when tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree
        mtesterclass = None

    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(task.out_seqs)),
                                          ','.join(sorted(task.target_seqs)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(task.out_seqs)])
        _tars = "\n".join([">%s\n1" % name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        # Since the creation of some Task objects may require this info,
        # commit right now
        db.dataconn.commit()

        # Register node
        db.add_node(task.threadid, task.nodeid, task.cladeid,
                    task.target_seqs, task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs

        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                               conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                    conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        # import time
        # t1 = time.time()
        # mx, mn, mean, std = get_identity(task.alg_fasta_file)
        # print(time.time() - t1)
        # log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #         mx, mn, mean, std)
        # t1 = time.time()
        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # Dump the phylip alignment to disk if necessary
                if not pexist(algfile):
                    open(algfile, "w").write(db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max": mx, "i_mean": mean, "i_min": mn, "i_std": std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "  max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    alg_stats)
        else:
            alg_stats = {"i_max": -1, "i_mean": -1, "i_min": -1, "i_std": -1}

        # print(time.time() - t1)
        # log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #         mx, mn, mean, std)

        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]

        next_task = None
        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file, conf, cleanerconf)
        else:
            # Convert the aa alignment into nt if necessary
            if seqtype == "aa" and \
               "nt" in GLOBALS["seqtypes"] and \
               task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")
                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # If the original alignment was trimmed, use it as the
                        # reference, but build the nt alignment only on the
                        # kept columns
                        kept_columns = db.get_task_data(taskid,
                                                        DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(db.get_task_data(taskid,
                                                                     DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip,
                                     nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file, constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id, None, seqtype,
                                             conf, treebuilderconf)

        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)
