def run(starting_from_here=False):
    """Dump the OrthoMCL pair tables to files, then empty those tables.

    Invokes ``orthomclDumpPairsFiles.pl`` with the final config path, the
    MCL input path, the intermediate directory and ``suffix`` (passing
    ``stderr='log'`` to ``cmdline``). Afterwards the in-paralog, ortholog
    and co-ortholog tables (each name with ``suffix`` appended) are emptied.
    Cleaning is best-effort: an error on one table is logged and the loop
    moves on to the next table.

    ``starting_from_here`` is accepted but not used by this step.

    Returns the value produced by invoking the dump command (presumably its
    exit status), so the caller can see whether the dump failed. Previously
    that value was computed and then dropped.
    """
    res = cmdline(
        join(orthomcl_bin_dir, 'orthomclDumpPairsFiles.pl'),
        parameters=[
            realpath(orthomcl_config_final_path),
            realpath(config.mcl_input),
            realpath(config.intermediate_dir),
            suffix,
        ],
        stderr='log')()

    with DbCursor() as cursor:
        for tbl in [
            in_paralog_table + suffix,
            ortholog_table + suffix,
            coortholog_table + suffix,
        ]:
            probe_sql = 'select 1 from %s limit 1;' % tbl
            try:
                log.debug(' Cleaning the %s table.' % tbl)
                # Log one row before and after the delete so the debug log
                # shows whether the table actually held data.
                cursor.execute(probe_sql)
                log.debug(' ' + str(cursor.fetchone()))
                cursor.execute('delete from %s;' % tbl)
                cursor.execute(probe_sql)
                log.debug(' ' + str(cursor.fetchone()))
                log.debug('')
            except Exception as e:
                log.exception(e)

    return res
def load_blast_results(suffix):
    """Define the step body that loads parsed BLAST hits into the database.

    ``suffix`` is appended to the similar-sequences table name and passed on
    to ``orthomclLoadBlast.pl``.

    NOTE(review): only the nested ``run`` definition is visible in this
    block; how ``run`` is handed back to the caller is not shown here.
    """
    def run(starting_from_here=False):
        """Empty the similar-sequences table, then run orthomclLoadBlast.pl.

        Cleaning is best-effort: SQL errors are logged and do not stop the
        step. ``starting_from_here`` is accepted but not used.

        Returns the value produced by invoking the load command.
        """
        with DbCursor() as cursor:
            for tbl in [similar_sequeces_table + suffix]:
                probe_sql = "select 1 from %s limit 1;" % tbl
                try:
                    log.debug(" Cleaning the %s table." % tbl)
                    # Catch Exception rather than everything, so that
                    # Ctrl-C / SystemExit still propagate, and record why
                    # the statement failed instead of dropping it silently.
                    try:
                        cursor.execute(probe_sql)
                    except Exception as err:
                        log.debug(" %s failed: %s" % (probe_sql, err))
                    log.debug(" select 1 from " + tbl + " limit 1; "
                              "result=" + str(cursor.fetchone()))
                    try:
                        cursor.execute("delete from %s;" % tbl)
                        cursor.execute(probe_sql)
                    except Exception as err:
                        log.debug(" cleaning %s failed: %s" % (tbl, err))
                    log.debug(" select 1 from " + tbl + " limit 1; "
                              "result=" + str(cursor.fetchone()))
                    log.debug("")
                except Exception as e:
                    log.exception(e)

        return cmdline(
            join(orthomcl_bin_dir, "orthomclLoadBlast.pl"),
            parameters=[
                realpath(orthomcl_config_final_path),
                realpath(config.similar_sequences),
                suffix,
            ],
        )()
def load_blast_results(suffix):
    """Define the step body that loads parsed BLAST hits into the database.

    ``suffix`` is appended to the similar-sequences table name and is also
    the last argument given to ``orthomclLoadBlast.pl``.

    NOTE(review): this block only shows the nested ``run``; what is done
    with it afterwards is outside the visible code.
    """
    def run(starting_from_here=False):
        """Clear the similar-sequences table and invoke orthomclLoadBlast.pl.

        Table cleaning never aborts the step: failures are logged and the
        load command is run regardless. ``starting_from_here`` is unused.

        Returns whatever invoking the load command yields.
        """
        with DbCursor() as cursor:
            for tbl in [
                similar_sequeces_table + suffix,
            ]:
                select_one = 'select 1 from %s limit 1;' % tbl
                try:
                    log.debug(' Cleaning the %s table.' % tbl)
                    # ``except Exception`` (not a bare ``except``) keeps the
                    # best-effort behaviour without trapping
                    # KeyboardInterrupt/SystemExit; the error is logged.
                    try:
                        cursor.execute(select_one)
                    except Exception as err:
                        log.debug(' %s failed: %s' % (select_one, err))
                    log.debug(' select 1 from ' + tbl + ' limit 1; '
                              'result=' + str(cursor.fetchone()))
                    try:
                        cursor.execute('delete from %s;' % tbl)
                        cursor.execute(select_one)
                    except Exception as err:
                        log.debug(' cleaning %s failed: %s' % (tbl, err))
                    log.debug(' select 1 from ' + tbl + ' limit 1; '
                              'result=' + str(cursor.fetchone()))
                    log.debug('')
                except Exception as e:
                    log.exception(e)

        return cmdline(join(orthomcl_bin_dir, 'orthomclLoadBlast.pl'),
                       parameters=[
                           realpath(orthomcl_config_final_path),
                           realpath(config.similar_sequences),
                           suffix,
                       ])()
def run(starting_from_here=False):
    """Run ``orthomclDumpPairsFiles.pl`` and then clear the pair tables.

    The script receives the final config path, the MCL input path, the
    intermediate directory and ``suffix``; ``stderr="log"`` is passed to
    ``cmdline``. The in-paralog, ortholog and co-ortholog tables (with
    ``suffix`` appended to each name) are then emptied. An error while
    cleaning a table is logged and does not stop the remaining tables.

    ``starting_from_here`` is part of the step signature but is not read.

    Returns the result of invoking the dump command (presumably an exit
    status). The original body assigned it to ``res`` and never returned it,
    so a failed dump could not be seen by the caller.
    """
    dump_args = [
        realpath(orthomcl_config_final_path),
        realpath(config.mcl_input),
        realpath(config.intermediate_dir),
        suffix,
    ]
    res = cmdline(
        join(orthomcl_bin_dir, "orthomclDumpPairsFiles.pl"),
        parameters=dump_args,
        stderr="log",
    )()

    with DbCursor() as cursor:
        for tbl in [in_paralog_table + suffix, ortholog_table + suffix, coortholog_table + suffix]:
            try:
                log.debug(" Cleaning the %s table." % tbl)
                # One-row probe before and after the delete, for the debug log.
                cursor.execute("select 1 from %s limit 1;" % tbl)
                log.debug(" " + str(cursor.fetchone()))
                cursor.execute("delete from %s;" % tbl)
                cursor.execute("select 1 from %s limit 1;" % tbl)
                log.debug(" " + str(cursor.fetchone()))
                log.debug("")
            except Exception as e:
                log.exception(e)

    return res
def run(starting_from_here=False):
    """Build the ``orthomclMclToGroups.pl`` command.

    The script gets ``prefix + "_"`` and ``start_id`` as parameters, reads
    ``config.mcl_output`` on stdin and writes ``config.groups_file`` on
    stdout. ``starting_from_here`` is not used.

    NOTE(review): the value returned by ``cmdline(...)`` is handed back
    without being invoked here - confirm the caller is the one running it.
    """
    groups_script = join(orthomcl_bin_dir, "orthomclMclToGroups.pl")
    group_params = [prefix + "_", start_id]
    return cmdline(
        groups_script,
        parameters=group_params,
        stdin=config.mcl_output,
        stdout=config.groups_file,
    )
def exec_cmdline(command):
    """Run ``command`` (split on whitespace) and report a failure.

    ``debug`` and ``only_warn`` are read from the enclosing scope. With
    ``debug`` set the command is run with ``cmdline``'s default output
    handling; otherwise ``stdout=None`` and ``stderr=None`` are passed.

    On a non-zero result the mcl install message is logged (as a warning
    when ``only_warn`` is set, as an error otherwise) and ``(None, res)`` is
    returned. On a zero result the function falls off the end and returns
    ``None``.

    NOTE(review): the nesting of the final ``return`` was reconstructed from
    unindented text - confirm it belongs inside the ``res != 0`` branch.
    """
    log.info(' ' + command)
    if debug:
        res = cmdline(command.split())()
    else:
        res = cmdline(command.split(), stdout=None, stderr=None)()
    if res != 0:
        log.debug('Running ' + command)
        # The message always mentions mcl, whichever command failed.
        if only_warn:
            log.warning('WARNING: Cannot find or install mcl. '
                        'It required to perform some steps. '
                        'Try to install it manually: http://micans.org/mcl/src')
        else:
            log.error('ERROR: Cannot find or install mcl. '
                      'Try to install it manually: http://micans.org/mcl/src')
        return None, res
def run(starting_from_here=False):
    """Invoke ``orthomclInstallSchema.pl`` and return its result.

    Parameters passed to the script: the final OrthoMCL config path, the
    SQL log path and ``suffix``. ``stderr='log'`` is passed to ``cmdline``.
    ``starting_from_here`` is not used.
    """
    schema_script = join(orthomcl_bin_dir, 'orthomclInstallSchema.pl')
    schema_args = [
        realpath(orthomcl_config_final_path),
        realpath(config.sql_log),
        suffix,
    ]
    install_schema = cmdline(schema_script, parameters=schema_args, stderr='log')
    return install_schema()
def exec_cmdline(command):
    """Execute ``command`` (whitespace-split) and log if it fails.

    Reads ``debug`` and ``only_warn`` from the enclosing scope. When
    ``debug`` is false the command is run with ``stdout=None`` and
    ``stderr=None``.

    Returns ``(None, res)`` when the result is non-zero, after logging the
    mcl install message as a warning (``only_warn``) or an error. When the
    result is zero nothing is returned explicitly, i.e. ``None``.

    NOTE(review): block nesting here was reconstructed from unindented
    text; verify that the ``return`` sits inside the failure branch.
    """
    log.info(' ' + command)
    if debug:
        res = cmdline(command.split())()
    else:
        res = cmdline(command.split(), stdout=None, stderr=None)()
    if res != 0:
        log.debug('Running ' + command)
        # Same mcl-specific text regardless of which command was run.
        if only_warn:
            log.warning(
                'WARNING: Cannot find or install mcl. '
                'It required to perform some steps. '
                'Try to install it manually: http://micans.org/mcl/src')
        else:
            log.error(
                'ERROR: Cannot find or install mcl. '
                'Try to install it manually: http://micans.org/mcl/src')
        return None, res
def submit(self):
    """Submit this job's runner script with ``qsub``.

    Uses ``self.job_name`` as the job name (``-N``), ``self.log_fpath`` as
    the output file (``-o``) and ``self.runner_fpath`` as the script to
    run; ``self.i`` is only used in the debug message.

    Returns the value produced by invoking the qsub command.
    """
    # Build the argument list item by item. The previous approach formatted
    # one string and split it on whitespace, which broke a job name or path
    # containing a space into several arguments.
    qsub_args = [
        '-pe', 'pe_smp', '1',
        '-S', '/bin/bash',
        '-cwd',
        '-j', 'y',
        '-q', 'batch.q',
        '-N', self.job_name,
        '-o', realpath(self.log_fpath),
        realpath(self.runner_fpath),
    ]
    log.debug('submitting job ' + str(self.i))
    res = cmdline('qsub', parameters=qsub_args)()
    log.debug('submitted, res = ' + str(res))
    log.info('')
    return res
def _run(starting_from_here=False):
    """Run ``makeblastdb`` on the good-proteins FASTA and return its result.

    Input is ``config.good_proteins`` (``-input_type fasta``), output is
    ``config.blast_db`` and the database type is ``prot``. Both stdout and
    stderr are passed to ``cmdline`` as ``'log'``. ``starting_from_here``
    is not used.
    """
    makeblastdb_args = [
        '-in', realpath(config.good_proteins),
        '-input_type', 'fasta',
        '-out', realpath(config.blast_db),
        '-dbtype', 'prot',
    ]
    make_db = cmdline(
        'makeblastdb',
        parameters=makeblastdb_args,
        stdout='log',
        stderr='log')
    return make_db()
def submit(self):
    """Hand this job's runner script to ``qsub`` and return the result.

    ``self.job_name`` becomes the ``-N`` value, ``self.log_fpath`` the
    ``-o`` value and ``self.runner_fpath`` the submitted script.
    ``self.i`` only appears in the debug log line.
    """
    log_path = realpath(self.log_fpath)
    runner_path = realpath(self.runner_fpath)
    # Each argument is listed explicitly instead of formatting a single
    # string and calling ``.split()`` on it: splitting would cut a job name
    # or path that contains whitespace into separate arguments.
    qsub_params = ["-pe", "pe_smp", "1", "-S", "/bin/bash", "-cwd", "-j", "y", "-q", "batch.q"]
    qsub_params += ["-N", self.job_name, "-o", log_path, runner_path]

    log.debug("submitting job " + str(self.i))
    res = cmdline("qsub", parameters=qsub_params)()
    log.debug("submitted, res = " + str(res))
    log.info("")
    return res
def run(starting_from_here=False):
    """Locate (or install) mcl and run it on the MCL input file.

    If ``check_install_mcl`` yields no binary path, its second return value
    is returned unchanged. Otherwise mcl is invoked with ``--abc``, the
    inflation value (``-I``) and ``config.mcl_output`` as the output file,
    and the result of that invocation is returned.
    ``starting_from_here`` is not used.
    """
    mcl_bin_path, res = check_install_mcl(debug, only_warn=False)
    if mcl_bin_path is None:
        return res

    mcl_args = [realpath(config.mcl_input), "--abc", "-I", str(inflation)]
    mcl_args.extend(["-o", realpath(config.mcl_output)])
    run_mcl = cmdline(
        mcl_bin_path,
        parameters=mcl_args,
        start_ignoring_from=r"Please cite:.*",
        stderr="log",
        stdout="log",
    )
    return run_mcl()
def find_pairs(suffix):
    """Define the step body that runs ``orthomclPairs.pl``.

    ``suffix`` is appended to the pair table names and passed to the script
    as ``suffix=<suffix>`` (``suffix=*`` when it is empty).

    NOTE(review): only the nested ``run`` definition is visible in this
    block; what happens to ``run`` afterwards is not shown here.
    """
    def run(starting_from_here=False):
        """Empty the pair tables, then invoke ``orthomclPairs.pl``.

        Cleaning the in-paralog, ortholog and co-ortholog tables is
        best-effort: SQL errors are logged and never stop the step.

        When ``starting_from_here`` is true, ``startAfter=useLog`` is added
        to the script parameters.

        Returns the value produced by invoking the script.
        """
        with DbCursor() as cursor:
            for tbl in [
                in_paralog_table + suffix,
                ortholog_table + suffix,
                coortholog_table + suffix,
            ]:
                probe_sql = 'select 1 from %s limit 1;' % tbl
                try:
                    log.debug(' Cleaning the %s table.' % tbl)
                    # ``except Exception`` instead of a bare ``except`` lets
                    # KeyboardInterrupt/SystemExit through; the failure is
                    # logged rather than silently discarded.
                    try:
                        cursor.execute(probe_sql)
                    except Exception as err:
                        log.debug(' %s failed: %s' % (probe_sql, err))
                    log.debug(' select 1 from ' + tbl + ' limit 1; '
                              'result=' + str(cursor.fetchone()))
                    try:
                        cursor.execute('delete from %s;' % tbl)
                        cursor.execute(probe_sql)
                    except Exception as err:
                        log.debug(' cleaning %s failed: %s' % (tbl, err))
                    log.debug(' select 1 from ' + tbl + ' limit 1; '
                              'result=' + str(cursor.fetchone()))
                    log.debug('')
                except Exception as e:
                    log.exception(e)

        # Single-argument call form prints the same text on Python 2 and 3.
        print('starting_from_here: ' + str(starting_from_here))

        params = [
            realpath(orthomcl_config_final_path),
            realpath(config.pairs_log),
            'cleanup=no',
            'suffix=' + (suffix if suffix else '*')]

        if starting_from_here:
            params += ['startAfter=useLog']

        return cmdline(
            join(orthomcl_bin_dir, 'orthomclPairs.pl'),
            parameters=params)()
def run(starting_from_here=False):
    """Run mcl on ``config.mcl_input`` and write ``config.mcl_output``.

    ``check_install_mcl`` supplies the binary path; when that path is
    ``None`` the accompanying result value is returned right away. The mcl
    invocation uses ``--abc`` and ``-I <inflation>``; its result is
    returned. ``starting_from_here`` is not used.
    """
    mcl_bin_path, res = check_install_mcl(debug, only_warn=False)
    if mcl_bin_path is not None:
        input_path = realpath(config.mcl_input)
        inflation_text = str(inflation)
        output_path = realpath(config.mcl_output)
        return cmdline(
            mcl_bin_path,
            parameters=[input_path, '--abc', '-I', inflation_text, '-o', output_path],
            start_ignoring_from=r'Please cite:.*',
            stderr='log',
            stdout='log')()
    return res
def _run(starting_from_here=False):
    """Build a protein BLAST database with ``makeblastdb``.

    Reads ``config.good_proteins`` as FASTA and writes the database to
    ``config.blast_db``; stdout and stderr are both given to ``cmdline`` as
    ``"log"``. Returns the result of invoking the command.
    ``starting_from_here`` is not used.
    """
    source_fasta = realpath(config.good_proteins)
    db_path = realpath(config.blast_db)
    params = ["-in", source_fasta, "-input_type", "fasta"]
    params += ["-out", db_path, "-dbtype", "prot"]
    return cmdline("makeblastdb", parameters=params, stdout="log", stderr="log")()
def run(starting_from_here=False):
    """Filter the proteomes with ``orthomclFilterFasta.pl``.

    The script is launched through ``perl`` with the proteomes directory,
    ``min_length``, ``max_percent_stop`` and the good/poor protein output
    paths.

    Returns the script's non-zero result if it fails, ``1`` if the good
    proteins file holds no FASTA records, and ``0`` otherwise.
    ``starting_from_here`` is not used.
    """
    filter_cmd = 'perl ' + join(orthomcl_bin_dir, 'orthomclFilterFasta.pl')
    filter_args = [
        realpath(config.proteomes_dir),
        min_length,
        max_percent_stop,
        realpath(config.good_proteins),
        realpath(config.poor_proteins),
    ]
    exit_code = cmdline(filter_cmd, parameters=filter_args)()
    if exit_code != 0:
        return exit_code

    # Count every record that survived the filter.
    kept = 0
    for _ in SeqIO.parse(config.good_proteins, 'fasta'):
        kept += 1

    if kept == 0:
        log.error('No good protein sequences found.')
        return 1
    return 0
def find_pairs(suffix):
    """Define the step body that clears the pair tables and finds pairs.

    ``suffix`` is appended to each pair table name and forwarded to
    ``orthomclPairs.pl`` as ``suffix=<suffix>``, or ``suffix=*`` when empty.

    NOTE(review): this block shows only the nested ``run``; how it is used
    afterwards lies outside the visible code.
    """
    def run(starting_from_here=False):
        """Clean the pair tables and invoke ``orthomclPairs.pl``.

        Table cleaning is best-effort; any SQL error is logged and the step
        continues. ``startAfter=useLog`` is appended to the parameters when
        ``starting_from_here`` is true.

        Returns whatever invoking the script yields.
        """
        with DbCursor() as cursor:
            for tbl in [in_paralog_table + suffix, ortholog_table + suffix, coortholog_table + suffix]:
                select_one = "select 1 from %s limit 1;" % tbl
                try:
                    log.debug(" Cleaning the %s table." % tbl)
                    # Narrowed from a bare ``except`` so interpreter-exit
                    # signals are not swallowed; the cause is now logged.
                    try:
                        cursor.execute(select_one)
                    except Exception as err:
                        log.debug(" %s failed: %s" % (select_one, err))
                    log.debug(" select 1 from " + tbl + " limit 1; " "result=" + str(cursor.fetchone()))
                    try:
                        cursor.execute("delete from %s;" % tbl)
                        cursor.execute(select_one)
                    except Exception as err:
                        log.debug(" cleaning %s failed: %s" % (tbl, err))
                    log.debug(" select 1 from " + tbl + " limit 1; " "result=" + str(cursor.fetchone()))
                    log.debug("")
                except Exception as e:
                    log.exception(e)

        # Call form with one argument: identical output on Python 2 and 3.
        print("starting_from_here: " + str(starting_from_here))

        params = [
            realpath(orthomcl_config_final_path),
            realpath(config.pairs_log),
            "cleanup=no",
            "suffix=" + (suffix if suffix else "*"),
        ]

        if starting_from_here:
            params += ["startAfter=useLog"]

        return cmdline(join(orthomcl_bin_dir, "orthomclPairs.pl"), parameters=params)()
def run(starting_from_here=False):
    """Run ``orthomclFilterFasta.pl`` and check that something passed.

    The script (started via ``perl``) receives the proteomes directory,
    ``min_length``, ``max_percent_stop`` and the paths for good and poor
    proteins. A non-zero result is returned as is. Otherwise the good
    proteins file is parsed as FASTA: ``1`` is returned (after logging an
    error) when it has no records, ``0`` when it has at least one.
    ``starting_from_here`` is not used.
    """
    script_path = join(orthomcl_bin_dir, "orthomclFilterFasta.pl")
    res = cmdline(
        "perl " + script_path,
        parameters=[
            realpath(config.proteomes_dir),
            min_length,
            max_percent_stop,
            realpath(config.good_proteins),
            realpath(config.poor_proteins),
        ],
    )()
    if res != 0:
        return res

    good_records = SeqIO.parse(config.good_proteins, "fasta")
    total_seqs = len([None for _ in good_records])
    if total_seqs:
        return 0

    log.error("No good protein sequences found.")
    return 1
def _callback(ps):
    """Return a ``blastp`` command built from the parameters ``ps``.

    Output lines matching the "at position ... replaced by ..." pattern are
    handed to ``cmdline`` as lines to ignore. The command is not invoked
    here.
    """
    replaced_residue_pattern = r'.* at position .* replaced by .*'
    return cmdline(
        'blastp',
        ps,
        ignore_output_lines_by_pattern=replaced_residue_pattern)
def run(starting_from_here=False):
    """Run ``orthomclInstallSchema.pl`` and return the result.

    The script is given the final config path, the SQL log path and
    ``suffix``, with ``stderr="log"`` passed to ``cmdline``.
    ``starting_from_here`` is not used.
    """
    config_path = realpath(orthomcl_config_final_path)
    sql_log_path = realpath(config.sql_log)
    script = join(orthomcl_bin_dir, "orthomclInstallSchema.pl")
    return cmdline(script, parameters=[config_path, sql_log_path, suffix], stderr="log")()
def _run(starting_from_here=False):
    """Parse the BLAST output with ``orthomclBlastParser.pl``.

    The script receives the BLAST output path and the proteomes directory;
    its stdout is directed to ``config.similar_sequences``. Returns the
    result of invoking the command. ``starting_from_here`` is not used.
    """
    parser_script = join(orthomcl_bin_dir, "orthomclBlastParser.pl")
    parser_args = [realpath(config.blast_out), realpath(config.proteomes_dir)]
    parse_blast = cmdline(
        parser_script,
        parameters=parser_args,
        stdout=realpath(config.similar_sequences),
    )
    return parse_blast()
def _run(starting_from_here=False):
    """Run blastp locally or as a set of cluster jobs, and merge the output.

    Input is ``new_good_proteomes`` when set, otherwise
    ``config.good_proteins``. With ``new_good_proteomes`` the output goes to
    ``config.blast_out + '_2'`` and is appended to ``config.blast_out`` at
    the end.

    Execution mode:
      * not ``on_cluster``, or no ``qsub`` on the system: one ``_blast``
        call with ``threads=max_jobs``;
      * on the cluster with a computed job count of 1: one ``_blast`` call
        with ``threads=1``;
      * otherwise: the input is split into FASTA parts, one ``qsub`` job is
        submitted per part, and the part outputs are concatenated.

    Returns 0-like ``res`` on success; a non-zero ``res`` from ``_blast`` or
    ``qsub``; 3 when a part output fails ``verify_file``; 4 when the merged
    output fails ``verify_file``. ``starting_from_here`` is not used.

    NOTE(review): block nesting was reconstructed from unindented text;
    confirm in particular that the ``return 4`` check belongs to the
    cluster-jobs branch.
    """
    fasta_to_blast = new_good_proteomes or config.good_proteins
    blast_out = config.blast_out + '_2' if new_good_proteomes else config.blast_out

    # Placeholder; every branch below overwrites it before it is read.
    res = 10
    if not on_cluster:  # local, multithreaded
        res = _blast(fasta_to_blast, blast_out, threads=max_jobs)
    else:
        qsub = which('qsub')
        if not qsub:
            # No scheduler available: fall back to the local run.
            log.warn('No qsub in system: running multuthreaded')
            res = _blast(fasta_to_blast, blast_out, threads=max_jobs)
        else:
            total_seqs = sum(1 for _ in SeqIO.parse(fasta_to_blast, 'fasta'))
            # At least 500 sequences per job; at least one job.
            num_seqs_for_one_job = max(500, total_seqs/max_jobs)
            num_jobs = total_seqs/num_seqs_for_one_job or 1
            # num_seqs_for_one_job = total_seqs/2  # DEBUG
            # num_jobs = 2  # DEBUG

            if num_jobs == 1:  # one single-threaded run
                res = _blast(fasta_to_blast, blast_out, threads=1)

            else:  # several cluster jobs
                timestamp = str(datetime.now()).replace('-', '_').replace(':', '_').replace(' ', '_')
                log.info('Splitting data for ' + str(num_jobs) + ' cluster jobs.')

                class BlastJob:
                    # One cluster job: its input part, output, log and
                    # runner script all live in config.intermediate_dir.
                    def __init__(self, i):
                        self.i = i
                        self.job_name = workflow_id + '_' + timestamp + '_' + str(i)
                        self.prot_fpath = join(config.intermediate_dir, 'proteins_' + str(i) + '.fasta')
                        self.out_fpath = join(config.intermediate_dir, 'blasted_part_' + str(i) + '.tsv')
                        self.log_fpath = join(config.intermediate_dir, 'run_blast_' + str(i) + '.log')
                        self.runner_fpath = join(config.intermediate_dir, 'run_blast_' + str(i) + '.sh')
                        cmd = ('blastp ' + ' '.join(map(str, _blast_basic_params)) +
                               ' -query ' + realpath(self.prot_fpath) +
                               ' -out ' + realpath(self.out_fpath))
                        # The runner script is written as a side effect of
                        # constructing the job.
                        with open(self.runner_fpath, 'w') as f:
                            f.write('#!/bin/bash\n')
                            f.write('. /etc/profile.d/modules.sh\n')
                            f.write('module load blast\n')
                            f.write(cmd + '\n')
                            f.write('date\n')

                    def submit(self):
                        # The argument string is split on whitespace, so
                        # the job name and paths must not contain spaces.
                        cmdl = '-pe pe_smp 1 -S /bin/bash -cwd -j y -q batch.q -N {0} -o {1} ' \
                               '{2}'.format(self.job_name, realpath(self.log_fpath), realpath(self.runner_fpath))
                        # cmdl = '-pe pe_smp 1 -S /bin/bash -cwd -j y -o {0} -q batch.q ' \
                        #        '{1}'.format(self.log_fpath, self.runner_fpath)
                        log.debug('submitting job ' + str(self.i))
                        res = cmdline('qsub', parameters=cmdl.split())()
                        log.debug('submitted, res = ' + str(res))
                        log.info('')
                        return res

                blast_jobs = []
                i, i_recs = 1, []
                for rec in SeqIO.parse(fasta_to_blast, 'fasta'):
                    i_recs.append(rec)
                    # Strict '>': a part is flushed once it holds
                    # num_seqs_for_one_job + 1 records.
                    if len(i_recs) > num_seqs_for_one_job:
                        blast_job = BlastJob(i)
                        blast_jobs.append(blast_job)
                        SeqIO.write(i_recs, blast_job.prot_fpath, 'fasta')
                        i, i_recs = i + 1, []
                # Remaining records form the last part.
                if i_recs:
                    blast_job = BlastJob(i)
                    blast_jobs.append(blast_job)
                    SeqIO.write(i_recs, blast_job.prot_fpath, 'fasta')

                for bj in blast_jobs:
                    res = bj.submit()
                    if res != 0:
                        log.info('qsub returned exit code ' + str(res))
                        return res

                # A final job, held until all blast jobs are done
                # (-hold_jid), touches a marker file; polling for that
                # file is how completion is detected.
                results_script_fpath = join(config.intermediate_dir, 'collect_blasted' + '.sh')
                collect_log_fpath = join(config.intermediate_dir, 'collect_blasted.log')
                if isfile(collect_log_fpath):
                    os.remove(collect_log_fpath)
                with open(results_script_fpath, 'w') as f:
                    f.write('#!/bin/bash\n')
                    f.write('touch ' + collect_log_fpath + '\n')

                cmdl = '-hold_jid {0} -S /bin/bash -cwd -j y -q batch.q {1}'.format(
                    ','.join(j.job_name for j in blast_jobs),
                    realpath(results_script_fpath))
                log.debug('wating for jobs...')
                res = cmdline('qsub', parameters=cmdl.split())()
                if res != 0:
                    return res

                log.info('Waiting for blast jobs to finish...')
                # Polls every 3 seconds with no timeout.
                while not isfile(collect_log_fpath):
                    sleep(3)
                log.info('All blast finished, proceeding.')

                # cat_params = ''
                ok = True
                for bj in blast_jobs:
                    if not verify_file(bj.out_fpath):
                        ok = False
                    else:
                        log.debug(bj.out_fpath + ' exists, ok')
                    # cat_params += ' ' + bj.prot_fpath
                if not ok:
                    return 3

                # res = cmdline('cat',
                #     parameters=cat_params,
                #     stdout=blast_out)
                # Concatenate the part outputs in job order; each part is
                # read into memory in full.
                with open(blast_out, 'w') as out:
                    for bj in blast_jobs:
                        with open(bj.out_fpath) as bjout:
                            out.write(bjout.read())

                if not verify_file(blast_out):
                    log.debug(blast_out + ' not exist, return 4')
                    return 4

    print res
    if res != 0:
        return res

    if new_good_proteomes:
        # Fold the newly blasted hits into the main output file.
        log.info(' Appending ' + config.blast_out + '_2 to ' + config.blast_out)
        with open(config.blast_out, 'a') as b_out:
            with open(config.blast_out + '_2') as b_out_2:
                b_out.write(b_out_2.read())
    return res
def _run(starting_from_here=False):
    """Blast the protein FASTA, locally or through cluster jobs.

    The query file is ``new_good_proteomes`` if set, else
    ``config.good_proteins``. When ``new_good_proteomes`` is set, results
    are written to ``config.blast_out + "_2"`` and appended to
    ``config.blast_out`` before returning.

    Modes:
      * ``on_cluster`` false, or ``qsub`` not found: a single ``_blast``
        call using ``max_jobs`` threads;
      * ``on_cluster`` with a job count of 1: a single ``_blast`` call
        using one thread;
      * otherwise the input is split into parts, each part is submitted
        with ``qsub``, and the part outputs are concatenated.

    Return values: ``res`` from ``_blast``/``qsub`` (non-zero values are
    returned as soon as they are seen), 3 if any part output fails
    ``verify_file``, 4 if the merged output fails ``verify_file``.
    ``starting_from_here`` is not used.

    NOTE(review): nesting was reconstructed from unindented text; verify
    that the ``return 4`` check sits inside the cluster-jobs branch.
    """
    fasta_to_blast = new_good_proteomes or config.good_proteins
    blast_out = config.blast_out + "_2" if new_good_proteomes else config.blast_out

    # Initial value only; each branch below assigns res before it is used.
    res = 10
    if not on_cluster:  # local run with threads
        res = _blast(fasta_to_blast, blast_out, threads=max_jobs)
    else:
        qsub = which("qsub")
        if not qsub:
            # Scheduler missing: run locally instead.
            log.warn("No qsub in system: running multuthreaded")
            res = _blast(fasta_to_blast, blast_out, threads=max_jobs)
        else:
            total_seqs = sum(1 for _ in SeqIO.parse(fasta_to_blast, "fasta"))
            # Never fewer than 500 sequences per job, never zero jobs.
            num_seqs_for_one_job = max(500, total_seqs / max_jobs)
            num_jobs = total_seqs / num_seqs_for_one_job or 1
            # num_seqs_for_one_job = total_seqs/2  # DEBUG
            # num_jobs = 2  # DEBUG

            if num_jobs == 1:  # one single-threaded run
                res = _blast(fasta_to_blast, blast_out, threads=1)

            else:  # split into cluster jobs
                timestamp = str(datetime.now()).replace("-", "_").replace(":", "_").replace(" ", "_")
                log.info("Splitting data for " + str(num_jobs) + " cluster jobs.")

                class BlastJob:
                    # Describes one cluster job; all of its files are
                    # placed in config.intermediate_dir.
                    def __init__(self, i):
                        self.i = i
                        self.job_name = workflow_id + "_" + timestamp + "_" + str(i)
                        self.prot_fpath = join(config.intermediate_dir, "proteins_" + str(i) + ".fasta")
                        self.out_fpath = join(config.intermediate_dir, "blasted_part_" + str(i) + ".tsv")
                        self.log_fpath = join(config.intermediate_dir, "run_blast_" + str(i) + ".log")
                        self.runner_fpath = join(config.intermediate_dir, "run_blast_" + str(i) + ".sh")
                        cmd = (
                            "blastp "
                            + " ".join(map(str, _blast_basic_params))
                            + " -query "
                            + realpath(self.prot_fpath)
                            + " -out "
                            + realpath(self.out_fpath)
                        )
                        # Constructing a job writes its runner script.
                        with open(self.runner_fpath, "w") as f:
                            f.write("#!/bin/bash\n")
                            f.write(". /etc/profile.d/modules.sh\n")
                            f.write("module load blast\n")
                            f.write(cmd + "\n")
                            f.write("date\n")

                    def submit(self):
                        # Arguments come from splitting one string on
                        # whitespace, so names/paths must be space-free.
                        cmdl = "-pe pe_smp 1 -S /bin/bash -cwd -j y -q batch.q -N {0} -o {1} " "{2}".format(
                            self.job_name, realpath(self.log_fpath), realpath(self.runner_fpath)
                        )
                        # cmdl = '-pe pe_smp 1 -S /bin/bash -cwd -j y -o {0} -q batch.q ' \
                        #        '{1}'.format(self.log_fpath, self.runner_fpath)
                        log.debug("submitting job " + str(self.i))
                        res = cmdline("qsub", parameters=cmdl.split())()
                        log.debug("submitted, res = " + str(res))
                        log.info("")
                        return res

                blast_jobs = []
                i, i_recs = 1, []
                for rec in SeqIO.parse(fasta_to_blast, "fasta"):
                    i_recs.append(rec)
                    # Because of the strict '>', a flushed part contains
                    # num_seqs_for_one_job + 1 records.
                    if len(i_recs) > num_seqs_for_one_job:
                        blast_job = BlastJob(i)
                        blast_jobs.append(blast_job)
                        SeqIO.write(i_recs, blast_job.prot_fpath, "fasta")
                        i, i_recs = i + 1, []
                # Whatever is left over becomes the final part.
                if i_recs:
                    blast_job = BlastJob(i)
                    blast_jobs.append(blast_job)
                    SeqIO.write(i_recs, blast_job.prot_fpath, "fasta")

                for bj in blast_jobs:
                    res = bj.submit()
                    if res != 0:
                        log.info("qsub returned exit code " + str(res))
                        return res

                # Completion is signalled by a marker file: an extra job,
                # held on all blast jobs via -hold_jid, touches it.
                results_script_fpath = join(config.intermediate_dir, "collect_blasted" + ".sh")
                collect_log_fpath = join(config.intermediate_dir, "collect_blasted.log")
                if isfile(collect_log_fpath):
                    os.remove(collect_log_fpath)
                with open(results_script_fpath, "w") as f:
                    f.write("#!/bin/bash\n")
                    f.write("touch " + collect_log_fpath + "\n")

                cmdl = "-hold_jid {0} -S /bin/bash -cwd -j y -q batch.q {1}".format(
                    ",".join(j.job_name for j in blast_jobs), realpath(results_script_fpath)
                )
                log.debug("wating for jobs...")
                res = cmdline("qsub", parameters=cmdl.split())()
                if res != 0:
                    return res

                log.info("Waiting for blast jobs to finish...")
                # Checked every 3 seconds; there is no timeout.
                while not isfile(collect_log_fpath):
                    sleep(3)
                log.info("All blast finished, proceeding.")

                # cat_params = ''
                ok = True
                for bj in blast_jobs:
                    if not verify_file(bj.out_fpath):
                        ok = False
                    else:
                        log.debug(bj.out_fpath + " exists, ok")
                    # cat_params += ' ' + bj.prot_fpath
                if not ok:
                    return 3

                # res = cmdline('cat',
                #     parameters=cat_params,
                #     stdout=blast_out)
                # Part outputs are joined in job order, each read whole.
                with open(blast_out, "w") as out:
                    for bj in blast_jobs:
                        with open(bj.out_fpath) as bjout:
                            out.write(bjout.read())

                if not verify_file(blast_out):
                    log.debug(blast_out + " not exist, return 4")
                    return 4

    print res
    if res != 0:
        return res

    if new_good_proteomes:
        # Append the new hits to the main blast output.
        log.info(" Appending " + config.blast_out + "_2 to " + config.blast_out)
        with open(config.blast_out, "a") as b_out:
            with open(config.blast_out + "_2") as b_out_2:
                b_out.write(b_out_2.read())
    return res
def run(starting_from_here=False):
    """Set up the ``orthomclMclToGroups.pl`` command.

    Parameters given to the script are ``prefix + '_'`` and ``start_id``;
    ``config.mcl_output`` is used as stdin and ``config.groups_file`` as
    stdout. ``starting_from_here`` is not used.

    NOTE(review): what ``cmdline(...)`` returns is passed back without
    being invoked in this function - check that the caller invokes it.
    """
    to_groups = cmdline(
        join(orthomcl_bin_dir, 'orthomclMclToGroups.pl'),
        parameters=[prefix + '_', start_id],
        stdin=config.mcl_output,
        stdout=config.groups_file)
    return to_groups
def _callback(ps):
    """Build (without invoking) the ``blastp`` command for parameters ``ps``.

    The pattern for "... at position ... replaced by ..." output lines is
    passed to ``cmdline`` as ``ignore_output_lines_by_pattern``.
    """
    blastp_cmd = cmdline(
        "blastp",
        ps,
        ignore_output_lines_by_pattern=r".* at position .* replaced by .*",
    )
    return blastp_cmd
def _run(starting_from_here=False):
    """Invoke ``orthomclBlastParser.pl`` on the BLAST output.

    Arguments are the BLAST output path and the proteomes directory; the
    script's stdout is sent to ``config.similar_sequences``. The result of
    the invocation is returned. ``starting_from_here`` is not used.
    """
    blast_out_path = realpath(config.blast_out)
    proteomes_path = realpath(config.proteomes_dir)
    return cmdline(
        join(orthomcl_bin_dir, 'orthomclBlastParser.pl'),
        parameters=[blast_out_path, proteomes_path],
        stdout=realpath(config.similar_sequences))()