def finish(self):
    tm_start = time.ctime()
    all_species = self.targets | self.outgroups
    cogs, cog_analysis = brh_cogs2(db, all_species,
                                   missing_factor=self.missing_factor,
                                   seed_sp=self.seed)
    self.raw_cogs = cogs
    self.cog_analysis = cog_analysis
    self.cogs = []
    for co in cogs:
        # self.cogs.append(map(encode_seqname, co))
        encoded_names = db.translate_names(co)
        if len(encoded_names) != len(co):
            print(set(co) - set(encoded_names.keys()))
            raise DataError("Some sequence ids could not be translated")
        self.cogs.append(list(encoded_names.values()))

    # Sort COGs by the md5 hash of their content: the order is arbitrary
    # but stable across runs.
    for co in self.cogs:
        co.sort()
    self.cogs.sort(key=lambda x: md5(','.join(x)))

    log.log(28, "%s COGs detected" % len(self.cogs))
    tm_end = time.ctime()
    #open(pjoin(self.taskdir, "__time__"), "w").write(
    #    '\n'.join([tm_start, tm_end]))
    CogSelectorTask.store_data(self, self.cogs, self.cog_analysis)
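# A minimal sketch (not part of the original code) of the md5() helper that
# the snippets in this file assume: a thin wrapper over hashlib that hashes a
# text string and returns its hex digest, so the result can be used directly
# as an identifier or as a sort key. The real helper may differ in details.
import hashlib

def md5(text):
    """Return the hexadecimal MD5 digest of a text string."""
    return hashlib.md5(text.encode("utf-8")).hexdigest()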
def load_task_info(self):
    ''' Initialize task information. It generates a unique taskID based on
    the sibling jobs and sets the task working directory. '''

    # Creates a task id based on its target node and job arguments. The same
    # task, run with the same parameters, always produces the same id, so it
    # is easy to check whether a task is already done in the working path.
    if not self.taskid:
        args_id = md5(','.join(sorted(["%s %s" % (str(pair[0]), str(pair[1]))
                                       for pair in six.iteritems(self.args)])))
        unique_id = md5(','.join([self.nodeid, self._config_id, args_id] +
                                 sorted([getattr(j, "jobid", "taskid")
                                         for j in self.jobs])))
        self.taskid = unique_id
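# Hedged illustration (hypothetical helper, not part of the original code) of
# what the deterministic taskid buys: since identical node, config and job ids
# always hash to the same taskid, checking whether its working directory
# already exists is enough to detect a previously started task.
import os

def task_already_started(tasks_dir, taskid):
    """Return True if a working directory for this taskid already exists."""
    return os.path.isdir(os.path.join(tasks_dir, taskid))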
def __init__(self, cogs, seqtype, conf, confname, workflow_checksum):
    self.confname = confname
    self.conf = conf
    #self.cogs_hard_limit = int(conf[confname]["_max_cogs"])
    #used_cogs = cogs[:self.cogs_hard_limit]
    used_cogs = cogs

    cog_string = '#'.join([','.join(sorted(c)) for c in used_cogs])
    cog_keyid = md5(cog_string)  # This will be the nodeid
    base_args = {}
    ConcatAlgTask.__init__(self, cog_keyid, "concat_alg", "ConcatAlg",
                           workflow_checksum=workflow_checksum,
                           base_args=base_args, extra_args=conf[confname])
    self.avail_cogs = len(cogs)
    self.used_cogs = len(used_cogs)
    self.cogs = used_cogs
    self.seqtype = seqtype
    self.cog_ids = set()

    self.job2alg = {}
    self.job2model = {}

    if seqtype == "aa":
        self.default_model = conf[confname]["_default_aa_model"]
    elif seqtype == "nt":
        self.default_model = conf[confname]["_default_nt_model"]

    self.genetree_workflow = conf[confname]["_workflow"][1:]
    self.init()
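# Worked example (hypothetical sequence ids) of the cog_string built above:
# members of each COG are sorted before joining, so the same COG selection
# always produces the same string, and therefore the same cog_keyid/nodeid.
example_cogs = [["sp2_seq9", "sp1_seq3"], ["sp3_seq2", "sp1_seq7"]]
example_string = '#'.join([','.join(sorted(c)) for c in example_cogs])
# example_string == "sp1_seq3,sp2_seq9#sp1_seq7,sp3_seq2"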
def __init__(self, nodeid, task_type, task_name, base_args=None,
             extra_args=None):
    if not base_args:
        base_args = {}
    if not extra_args:
        extra_args = {}

    self.taskid = None

    # This defines which task processor should be used
    # (i.e. genetree, sptree).
    self.task_processor = None

    # Nodeid is used to identify the tree node associated with the task. It
    # is calculated as a hash string based on the list of sequence IDs
    # grouped by the node.
    self.nodeid = nodeid

    # Task type: "alg|tree|acleaner|mchooser|etc."
    self.ttype = task_type

    # Used only to name directories and to identify the task in log messages
    self.tname = task_name

    # Path to the file containing task status: (D)one, (R)unning,
    # (W)aiting or (Un)Finished
    #self.status_file = None
    #self.inkey_file = None
    #self.info_file = None
    self.status = "W"
    self.all_status = None

    # Keeps a counter of how many cores are being used by running jobs
    self.cores_used = 0

    self.job_status = {}

    # Set arguments that could be sent to jobs
    self.args = merge_arg_dicts(extra_args, base_args, parent=self)

    # Extract all internal config values associated to this task and
    # generate its unique id (later used to generate the taskid)
    self._config_id = md5(','.join(sorted(["%s %s" % (str(pair[0]), str(pair[1]))
                                           for pair in six.iteritems(extra_args)
                                           if pair[0].startswith("_")])))
    self.dependencies = set()
def add_task_data(taskid, datatype, data, duplicates="OR IGNORE"):
    data_id = md5(str(data))

    cmd = """INSERT %s INTO task (taskid, status)
             VALUES ("%s", "D")""" % (duplicates, taskid)
    datacursor.execute(cmd)

    cmd = """INSERT %s INTO task2data (taskid, datatype, md5)
             VALUES ("%s", "%s", "%s")""" % (duplicates, taskid, datatype, data_id)
    datacursor.execute(cmd)

    cmd = """INSERT %s INTO data (md5, data)
             VALUES ("%s", "%s")""" % (duplicates, data_id, zencode(data, data_id))
    datacursor.execute(cmd)

    autocommit()
    return data_id
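# A minimal SQLite schema consistent with the INSERT statements above (the
# real schema may carry extra columns or constraints), shown only to make the
# three-table layout explicit: task status, a task-to-data mapping, and the
# md5-addressed payloads themselves, which "OR IGNORE" deduplicates.
SCHEMA_SKETCH = """
CREATE TABLE IF NOT EXISTS task      (taskid TEXT PRIMARY KEY, status TEXT);
CREATE TABLE IF NOT EXISTS task2data (taskid TEXT, datatype TEXT, md5 TEXT,
                                      PRIMARY KEY (taskid, datatype, md5));
CREATE TABLE IF NOT EXISTS data      (md5 TEXT PRIMARY KEY, data BLOB);
"""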
def __init__(self, bin, args, jobname=None, parent_ids=None):
    # Used at execution time
    self.status = None

    # How to run the app
    self.bin = bin
    # Command line arguments
    self.args = args
    # Default number of cores used by the job. If more than 1,
    # this attribute should be changed.
    self.cores = 1
    self.exec_type = "insitu"
    self.jobname = jobname

    # Generates the unique job identifier based on the params of the app.
    # Some params include path names that could prevent recycling the job,
    # so we clean them here.
    clean = lambda x: basename(x) if GLOBALS["basedir"] in x or GLOBALS["tasks_dir"] in x else x
    parsed_id_string = ["%s %s" % (clean(str(pair[0])), clean(str(pair[1])))
                        for pair in six.iteritems(self.args)]
    #print '\n'.join(map(str, self.args.items()))
    self.jobid = md5(','.join(sorted([md5(e) for e in parsed_id_string])))
    # self.jobid = md5(','.join(sorted([md5(str(pair)) for pair in
    #                                   self.args.iteritems()])))
    if parent_ids:
        self.jobid = md5(','.join(sorted(parent_ids + [self.jobid])))

    if not self.jobname:
        self.jobname = re.sub("[^0-9a-zA-Z]", "-", basename(self.bin))

    self.ifdone_cmd = ""
    self.iffail_cmd = ""
    self.set_jobdir(pjoin(GLOBALS["tasks_dir"], self.jobid))
    self.input_files = {}
    self.dependencies = set()
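# Hypothetical illustration (not from the original code) of why path-like
# arguments are reduced to their basename before hashing: without it, running
# the same job from a different base directory would change every jobid and
# defeat job recycling.
from os.path import basename

path_run1 = "/scratch/run1/tasks/abc123/alg.fasta"
path_run2 = "/home/user/run2/tasks/abc123/alg.fasta"
assert basename(path_run1) == basename(path_run2)  # same contribution to the jobid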
def process_task(task, wkname, npr_conf, nodeid2info):
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows dumping algs in workflows with no tree tasks, or when tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree

    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []

    if ttype == "cog_selector":
        # Generates an md5 id based on the genetree workflow configuration
        # used for the concat alg task. If anything changes, the concat alg
        # will change and the associated tree will be rebuilt.
        config_blocks = set([wkname])
        for key, value in six.iteritems(conf[wkname]):
            if isinstance(value, (list, tuple, set)):
                for elem in value:
                    if isinstance(elem, str) and elem.startswith("@"):
                        config_blocks.add(elem[1:])
            elif isinstance(value, str):
                if value.startswith("@"):
                    config_blocks.add(value[1:])
        config_checksum = md5(''.join(["[%s]\n%s" % (x, dict_string(conf[x]))
                                       for x in sorted(config_blocks)]))

        # THIS PART HAS BEEN MOVED TO COG_SELECTOR TASK
        # Check that the current selection of cogs covers all target and
        # outgroup species
        #cog_hard_limit = int(conf[concatconf]["_max_cogs"])
        #sp_repr = defaultdict(int)
        #for co in task.raw_cogs[:cog_hard_limit]:
        #    for sp, seq in co:
        #        sp_repr[sp] += 1
        #missing_sp = (targets | outgroups) - set(sp_repr.keys())
        #if missing_sp:
        #    raise TaskError("missing species under current cog selection: %s" %missing_sp)
        #else:
        #    log.log(28, "Analysis of current COG selection:")
        #    for sp, ncogs in sorted(sp_repr.items(), key=lambda x: x[1]):
        #        log.log(28, "  % 30s species present in % 6d COGs" %(sp, ncogs))

        # Register the concat alignment task. The nodeid associated to
        # concat_alg tasks and all their children jobs should take into
        # account cog information, not only the species and outgroups
        # included.
        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid, concat_job.nodeid, cladeid, targets, outgroups)

        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                          ','.join(sorted(targets)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(outgroups)])
        _tars = "\n".join([">%s\n1" % name for name in sorted(targets)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree,
                         constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg,
                         constrain_alg)
        # Since the creation of some Task objects may require this info,
        # we need to commit right now.
        db.dataconn.commit()
        concat_job.size = task.size
        new_tasks.append(concat_job)

    elif ttype == "concat_alg":
        # Register the tree task for the concat alignment, using a
        # constraint tree if necessary.
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None
        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups
        tree_task = treebuilderclass(nodeid, alg_id, constrain_id, None,
                                     seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)

    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                    splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Let's merge with the main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        if not isinstance(treebuilderclass, DummyTree) and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, None)
                    #tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, "fake")  # for testing sptree example
                    n2content = tcopy.get_cached_content()
                    broken_branches, broken_clades, broken_clade_sizes, tax2name = \
                        ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: ' +
                            colorify(', '.join(["%s" % tax2name[x]
                                                for x in broken_clades]), "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print(branch.get_ascii(attributes=['spname', 'taxid'],
                                               compact=True))
                        print(["%s" % tax2name[x] for x in broken_branches[branch]])
                        target_cladeids.add(branch.cladeid)

                for node, seqs, outs, wkname in get_next_npr_node(
                        task.configid, ttree, task.out_seqs, mtree, None,
                        npr_conf, target_cladeids):  # None is to avoid alg checks
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs, source_seqtype,
                                             conf, cogconf)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid, new_task_node.nodeid,
                                new_task_node.cladeid, new_task_node.targets,
                                new_task_node.outgroups)
    return new_tasks
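# Worked example (hypothetical species names) of the constraint data
# registered in the "cog_selector" branch above, for outgroups {outA, outB}
# and targets {tarX, tarY}:
#
#   constrain_tree -> "(outA,outB, (tarX,tarY));"
#   constrain_alg  -> ">outA\n0\n>outB\n0\n>tarX\n1\n>tarY\n1"
#
# i.e. a FASTA-like binary matrix in which outgroups are coded 0 and targets
# 1, usable by the tree builder as a topological constraint separating the
# two groups.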