def _startPhmmers(self, chunked_reads_fns, chunked_dom_fns, out_dom_fn,
                  primer_fn, pbmatrix_fn):
    """Run phmmer on every chunked reads file and merge the dom outputs.

    One child process running self._phmmer is launched per (reads, dom)
    pair taken from 'chunked_reads_fns' and 'chunked_dom_fns'. As each
    process finishes (in launch order) its chunked dom file is appended
    to 'out_dom_fn'. Afterwards the chunked reads and dom file lists are
    handed to self._cleanup.

    Raises ClassifierException if a phmmer child process exits with a
    non-zero exit code or if appending a dom file fails.
    """
    logging.info("Start to launch phmmer on chunked reads.")

    jobs = []
    for reads_fn, domFN in zip(chunked_reads_fns, chunked_dom_fns):
        p = multiprocessing.Process(
            target=self._phmmer,
            args=(reads_fn, domFN, primer_fn, pbmatrix_fn))
        jobs.append((p, domFN))
        p.start()

    for p, domFN in jobs:
        p.join()
        # An exception raised in the child is not propagated to this
        # process; its exit code is the only evidence, so check it before
        # trusting the chunked dom file.
        if p.exitcode != 0:
            raise ClassifierException(
                "Error calling phmmer for {f}: exit code {c}".
                format(f=domFN, c=p.exitcode))
        cmd = "cat {0} >> {1}".format(real_upath(domFN),
                                      real_upath(out_dom_fn))
        _output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException(
                "Error concatenating dom files: {e}".
                format(e=str(errMsg)))

    self._cleanup(chunked_reads_fns)
    self._cleanup(chunked_dom_fns)
def _startPhmmers(self, chunked_reads_fns, chunked_dom_fns, out_dom_fn,
                  primer_fn, pbmatrix_fn):
    """Run phmmer on every chunked reads file and merge the dom outputs.

    One child process running self._phmmer is launched per (reads, dom)
    pair taken from 'chunked_reads_fns' and 'chunked_dom_fns'. As each
    process finishes (in launch order) its chunked dom file is appended
    to 'out_dom_fn'. Afterwards the chunked reads and dom file lists are
    handed to self._cleanup.

    Raises ClassifierException if a phmmer child process exits with a
    non-zero exit code or if appending a dom file fails.
    """
    logging.info("Start to launch phmmer on chunked reads.")

    jobs = []
    for reads_fn, domFN in zip(chunked_reads_fns, chunked_dom_fns):
        p = multiprocessing.Process(
            target=self._phmmer,
            args=(reads_fn, domFN, primer_fn, pbmatrix_fn))
        jobs.append((p, domFN))
        p.start()

    for p, domFN in jobs:
        p.join()
        # An exception raised in the child is not propagated to this
        # process; its exit code is the only evidence, so check it before
        # trusting the chunked dom file.
        if p.exitcode != 0:
            raise ClassifierException(
                "Error calling phmmer for {f}: exit code {c}".
                format(f=domFN, c=p.exitcode))
        cmd = "cat {0} >> {1}".format(real_upath(domFN),
                                      real_upath(out_dom_fn))
        _output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException(
                "Error concatenating dom files: {e}".
                format(e=str(errMsg)))

    self._cleanup(chunked_reads_fns)
    self._cleanup(chunked_dom_fns)
def _phmmer(self, reads_fn, domFN, primer_fn, pbmaxtrixFN):
    """Run one single-CPU phmmer command and fail loudly on error.

    The command searches 'reads_fn' against 'primer_fn' using the scoring
    matrix file 'pbmaxtrixFN', writes the domain table to 'domFN' and
    discards standard output.

    Raises ClassifierException if the command returns a non-zero code.
    """
    dom_path = real_upath(domFN)
    matrix_path = real_upath(pbmaxtrixFN)
    reads_path = real_upath(reads_fn)
    primer_path = real_upath(primer_fn)

    cmd = ("phmmer --cpu 1 --domtblout {d} --noali --domE 1 "
           "--mxfile {m} "
           "--popen 0.07 --pextend 0.07 {r} {p} > /dev/null").format(
               d=dom_path, m=matrix_path, r=reads_path, p=primer_path)

    logging.debug("Calling phmmer: {cmd}".format(cmd=cmd))
    _output, errCode, errMsg = backticks(cmd)
    if errCode == 0:
        return
    raise ClassifierException(
        "Error calling phmmer: {e}.".format(e=str(errMsg)))
def _phmmer(self, reads_fn, domFN, primer_fn, pbmaxtrixFN):
    """Run one single-CPU phmmer command and fail loudly on error.

    The command searches 'reads_fn' against 'primer_fn' using the scoring
    matrix file 'pbmaxtrixFN', writes the domain table to 'domFN' and
    discards standard output.

    Raises ClassifierException if the command returns a non-zero code.
    """
    dom_path = real_upath(domFN)
    matrix_path = real_upath(pbmaxtrixFN)
    reads_path = real_upath(reads_fn)
    primer_path = real_upath(primer_fn)

    cmd = ("phmmer --cpu 1 --domtblout {d} --noali --domE 1 "
           "--mxfile {m} "
           "--popen 0.07 --pextend 0.07 {r} {p} > /dev/null").format(
               d=dom_path, m=matrix_path, r=reads_path, p=primer_path)

    logging.debug("Calling phmmer: {cmd}".format(cmd=cmd))
    _output, errCode, errMsg = backticks(cmd)
    if errCode == 0:
        return
    raise ClassifierException(
        "Error calling phmmer: {e}.".format(e=str(errMsg)))
def build_sa(input_fasta, out_sa):
    """Generate a suffix array of input_fasta with sawriter.

    input_fasta --- fasta file to index; must exist
    out_sa --- path of the suffix array to create

    Returns True if sawriter exits with code 0. Otherwise a warning is
    logged and False is returned.

    Raises IOError if input_fasta does not exist.
    """
    if not op.exists(input_fasta):
        raise IOError("Unable to find fasta file {f}.".format(f=input_fasta))

    cmd = "sawriter {o} {i} -blt 8 -welter ".\
        format(o=real_upath(out_sa), i=real_upath(input_fasta))
    _out, _code, _msg = backticks(cmd)
    if _code == 0:
        return True

    # Failing to build the suffix array is reported as a warning plus a
    # False return value rather than an exception.
    logging.warning("Unable to create suffix array for {f}.".format(f=input_fasta))
    return False
def build_sa(input_fasta, out_sa):
    """Generate a suffix array of input_fasta with sawriter.

    input_fasta --- fasta file to index; must exist
    out_sa --- path of the suffix array to create

    Returns True if sawriter exits with code 0. Otherwise a warning is
    logged and False is returned.

    Raises IOError if input_fasta does not exist.
    """
    if not op.exists(input_fasta):
        raise IOError("Unable to find fasta file {f}.".format(f=input_fasta))

    cmd = "sawriter {o} {i} -blt 8 -welter ".\
        format(o=real_upath(out_sa), i=real_upath(input_fasta))
    _out, _code, _msg = backticks(cmd)
    if _code == 0:
        return True

    # Failing to build the suffix array is reported as a warning plus a
    # False return value rather than an exception.
    logging.warning("Unable to create suffix array for {f}.".format(f=input_fasta))
    return False
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work.

    A small test script is written to <scriptDir>/test.sh; it runs gcon_py
    on a copy of GCON_IN_FA placed in <scriptDir>/<testDirName>. The
    script is submitted with a blocking 'qsub -sync y' and the resulting
    g_consensus.fasta is compared against GCON_OUT_FA.

    sge_opts --- provides sge_queue (may be None) and sge_env_name
    scriptDir --- directory for test.sh; created if missing
    testDirName --- name of the scratch directory under scriptDir

    Returns True (and removes the scratch directory) when the output
    agrees with GCON_OUT_FA. Returns False after logging an error when the
    output is missing or differs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fa")

    # Always start from a fresh copy of the reference input.
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert(op.exists(testInFa))

    with open(testSh, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("{gcon}".format(gcon=gcon_py) +
                " {inFa} ".format(inFa=real_upath(testInFa)) +
                " {testDir}/g_consensus".format(testDir=real_upath(testDir)) +
                " c1\n")
    assert(op.exists(testSh))

    cmd = "qsub"
    if sge_opts.sge_queue is not None:
        cmd += " -q " + sge_opts.sge_queue
    cmd += " -sync y -pe {env} 1 -cwd -S /bin/bash -V -e /dev/null -o /dev/null {t}".\
        format(t=real_upath(testSh), env=sge_opts.sge_env_name)
    logging.info("Submitting cmd: " + cmd)
    _out, _code, _msg = backticks(cmd)

    # filecmp.cmp raises OSError on a missing file, so a job that produced
    # no output must be caught by the existence check first; otherwise the
    # "Trouble running qsub" path below would never be reached.
    if not op.exists(consensusFa) or \
            not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False

    shutil.rmtree(testDir)
    logging.info("sge and gcon check passed.")
    return True
def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work.

    A small test script is written to <scriptDir>/test.sh; it runs gcon_py
    on a copy of GCON_IN_FA placed in <scriptDir>/<testDirName>. The
    script is submitted with a blocking 'qsub -sync y' and the resulting
    g_consensus.fasta is compared against GCON_OUT_FA.

    sge_opts --- provides sge_queue (may be None) and sge_env_name
    scriptDir --- directory for test.sh; created if missing
    testDirName --- name of the scratch directory under scriptDir

    Returns True (and removes the scratch directory) when the output
    agrees with GCON_OUT_FA. Returns False after logging an error when the
    output is missing or differs.
    """
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fa")

    # Always start from a fresh copy of the reference input.
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert(op.exists(testInFa))

    with open(testSh, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("{gcon}".format(gcon=gcon_py) +
                " {inFa} ".format(inFa=real_upath(testInFa)) +
                " {testDir}/g_consensus".format(testDir=real_upath(testDir)) +
                " c1\n")
    assert(op.exists(testSh))

    cmd = "qsub"
    if sge_opts.sge_queue is not None:
        cmd += " -q " + sge_opts.sge_queue
    cmd += " -sync y -pe {env} 1 -cwd -S /bin/bash -V -e /dev/null -o /dev/null {t}".\
        format(t=real_upath(testSh), env=sge_opts.sge_env_name)
    logging.info("Submitting cmd: " + cmd)
    _out, _code, _msg = backticks(cmd)

    # filecmp.cmp raises OSError on a missing file, so a job that produced
    # no output must be caught by the existence check first; otherwise the
    # "Trouble running qsub" path below would never be reached.
    if not op.exists(consensusFa) or \
            not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False

    shutil.rmtree(testDir)
    logging.info("sge and gcon check passed.")
    return True
def numReads(self):
    """Count the reads in self.reads_fn.

    The count is obtained by running "grep -c '>'" on the file and
    converting the first element of the command output to int.

    Raises ClassifierException if the command returns a non-zero code.
    """
    # NOTE(review): grep exits non-zero when no line matches, so a file
    # without any '>' presumably lands in the error branch -- confirm
    # this is intended.
    grep_cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn))
    output, errCode, errMsg = backticks(grep_cmd)
    if errCode == 0:
        return int(output[0])
    raise ClassifierException(
        "Error reading file {r}:{e}".format(r=self.reads_fn,
                                            e=str(errMsg)))
def numReads(self):
    """Count the reads in self.reads_fn.

    The count is obtained by running "grep -c '>'" on the file and
    converting the first element of the command output to int.

    Raises ClassifierException if the command returns a non-zero code.
    """
    # NOTE(review): grep exits non-zero when no line matches, so a file
    # without any '>' presumably lands in the error branch -- confirm
    # this is intended.
    grep_cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn))
    output, errCode, errMsg = backticks(grep_cmd)
    if errCode == 0:
        return int(output[0])
    raise ClassifierException(
        "Error reading file {r}:{e}".format(r=self.reads_fn,
                                            e=str(errMsg)))
def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
    """Align 'queryFa' against 'targetFa' with BLASR, writing 'outFN'.

    Nothing is run when 'outFN' already exists. The values for -nproc,
    -maxScore and -bestn/-nCandidates come from sge_opts.blasr_nproc,
    ice_opts.maxScore and ice_opts.bestn respectively.

    Raises RuntimeError (after logging the message) if BLASR returns a
    non-zero code.
    """
    if os.path.exists(outFN):
        logging.info(
            "{0} already exists. No need to run BLASR.".format(outFN))
        return

    pieces = [
        "blasr {q} ".format(q=real_upath(queryFa)),
        "{t} ".format(t=real_upath(targetFa)),
        "-m 5 -maxLCPLength 15 ",
        "-nproc {cpu} ".format(cpu=sge_opts.blasr_nproc),
        "-maxScore {score} ".format(score=ice_opts.maxScore),
        "-bestn {n} -nCandidates {n} ".format(n=ice_opts.bestn),
        "-out {o}".format(o=real_upath(outFN)),
    ]
    cmd = "".join(pieces)

    logging.info("Calling {cmd}".format(cmd=cmd))
    _output, code, msg = backticks(cmd)
    if code == 0:
        return

    errMsg = "{cmd} exited with {code}: {msg}".format(
        cmd=cmd, code=code, msg=msg)
    logging.error(errMsg)
    raise RuntimeError(errMsg)
def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
    """Align 'queryFa' against 'targetFa' with BLASR, writing 'outFN'.

    Nothing is run when 'outFN' already exists. The values for -nproc,
    -maxScore and -bestn/-nCandidates come from sge_opts.blasr_nproc,
    ice_opts.maxScore and ice_opts.bestn respectively.

    Raises RuntimeError (after logging the message) if BLASR returns a
    non-zero code.
    """
    if os.path.exists(outFN):
        logging.info(
            "{0} already exists. No need to run BLASR.".format(outFN))
        return

    pieces = [
        "blasr {q} ".format(q=real_upath(queryFa)),
        "{t} ".format(t=real_upath(targetFa)),
        "-m 5 -maxLCPLength 15 ",
        "-nproc {cpu} ".format(cpu=sge_opts.blasr_nproc),
        "-maxScore {score} ".format(score=ice_opts.maxScore),
        "-bestn {n} -nCandidates {n} ".format(n=ice_opts.bestn),
        "-out {o}".format(o=real_upath(outFN)),
    ]
    cmd = "".join(pieces)

    logging.info("Calling {cmd}".format(cmd=cmd))
    _output, code, msg = backticks(cmd)
    if code == 0:
        return

    errMsg = "{cmd} exited with {code}: {msg}".format(
        cmd=cmd, code=code, msg=msg)
    logging.error(errMsg)
    raise RuntimeError(errMsg)
def blasr_sam_for_quiver(input_fasta, ref_fasta, out_sam_filename,
                         run_cmd=True, blasr_nproc=12):
    """
    Build (and optionally run) the BLASR command that aligns input_fasta
    to ref_fasta with soft clipping and SAM output.

    input_fasta --- should be in.raw.fa
    ref_fasta --- reference fasta (ex: g_consensus.fa) to align to
    out_sam_filename --- sam output aligning in_fasta to ref_fasta
    run_cmd --- if True the command is executed, otherwise only built
    blasr_nproc --- value passed to blasr -nproc

    Returns the command string. Raises RuntimeError if the command is
    executed and returns a non-zero code.
    """
    pieces = [
        "blasr {i} ".format(i=real_upath(input_fasta)),
        "{r} ".format(r=real_upath(ref_fasta)),
        "-nproc {n} ".format(n=blasr_nproc),
        "-bestn 5 -nCandidates 10 -sam -clipping soft ",
        "-out {o}".format(o=real_upath(out_sam_filename)),
    ]
    cmd = "".join(pieces)
    logging.debug("CMD: " + cmd)

    if not run_cmd:
        return cmd

    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        raise RuntimeError(
            "CMD failed: {cmd}\n{e}".format(cmd=cmd, e=_msg))
    return cmd
def blasr_sam_for_quiver(input_fasta, ref_fasta, out_sam_filename,
                         run_cmd=True, blasr_nproc=12):
    """
    Build (and optionally run) the BLASR command that aligns input_fasta
    to ref_fasta with soft clipping and SAM output.

    input_fasta --- should be in.raw.fa
    ref_fasta --- reference fasta (ex: g_consensus.fa) to align to
    out_sam_filename --- sam output aligning in_fasta to ref_fasta
    run_cmd --- if True the command is executed, otherwise only built
    blasr_nproc --- value passed to blasr -nproc

    Returns the command string. Raises RuntimeError if the command is
    executed and returns a non-zero code.
    """
    pieces = [
        "blasr {i} ".format(i=real_upath(input_fasta)),
        "{r} ".format(r=real_upath(ref_fasta)),
        "-nproc {n} ".format(n=blasr_nproc),
        "-bestn 5 -nCandidates 10 -sam -clipping soft ",
        "-out {o}".format(o=real_upath(out_sam_filename)),
    ]
    cmd = "".join(pieces)
    logging.debug("CMD: " + cmd)

    if not run_cmd:
        return cmd

    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        raise RuntimeError(
            "CMD failed: {cmd}\n{e}".format(cmd=cmd, e=_msg))
    return cmd
def submit_todo_quiver_jobs(self, todo, submitted, sge_opts):
    """
    Run or submit every quiver script in 'todo'.

    todo --- a list of sh scripts to run; emptied in place once the
             scripts have been run (local) or submitted (SGE)
    submitted --- a list of sh scripts which have been submitted; a
                  (job_id, script) tuple is appended per script, where
                  job_id is "local" for scripts run without SGE
    sge_opts --- SGE options, including
                 use_sge, whether or not to use sge
                 max_sge_jobs, maximum number sge jobs to submit
                 quiver_nproc, number of nproc per job
                 unique_id, unique id to name qsub jobs
                 sge_queue and sge_env_name are read as well
    """
    time0 = datetime.now()
    if sge_opts.use_sge is not True or \
       sge_opts.max_sge_jobs == 0:  # don't use SGE
        for job in todo:
            elog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".elog")
            olog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".olog")
            cmd = "bash " + real_upath(job) + " 1>{olog} 2>{elog}".\
                format(olog=real_upath(olog), elog=real_upath(elog))
            self.run_cmd_and_log(cmd, olog=olog, elog=elog,
                                 description="Failed to run Quiver")
            submitted.append(("local", job))
        # Empty the caller's list in place (rebinding the local name would
        # leave it untouched), matching what the SGE branch does.
        del todo[:]
    else:
        # Iterate over a snapshot because entries are removed from 'todo'
        # as they are submitted. Every script is submitted in one pass.
        for job in list(todo):
            # ex: Your job 8613116 ("c20to70.sh") has been submitted
            elog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".elog")
            olog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".olog")
            # Take unique_id and sge_queue from the 'sge_opts' argument,
            # like every other option used here.
            jid = "ice_quiver_{unique_id}_{name}".format(
                unique_id=sge_opts.unique_id,
                name=op.basename(job))
            qsub_cmd = "qsub"
            if sge_opts.sge_queue is not None:
                qsub_cmd += " -q " + sge_opts.sge_queue
            qsub_cmd += " -pe {env} {n} ".format(
                n=sge_opts.quiver_nproc, env=sge_opts.sge_env_name) + \
                "-cwd -S /bin/bash -V " + \
                "-e {elog} ".format(elog=real_upath(elog)) + \
                "-o {olog} ".format(olog=real_upath(olog)) + \
                "-N {jid} ".format(jid=jid) + \
                "{job}".format(job=real_upath(job))
            job_id = self.qsub_cmd_and_log(qsub_cmd)

            submitted.append((job_id, job))
            todo.remove(job)

    self.add_log("Total time submitting todo quiver jobs: {0}".format(
        datetime.now() - time0))
def submit_todo_quiver_jobs(self, todo, submitted, sge_opts):
    """
    Run or submit every quiver script in 'todo'.

    todo --- a list of sh scripts to run; emptied in place once the
             scripts have been run (local) or submitted (SGE)
    submitted --- a list of sh scripts which have been submitted; a
                  (job_id, script) tuple is appended per script, where
                  job_id is "local" for scripts run without SGE
    sge_opts --- SGE options, including
                 use_sge, whether or not to use sge
                 max_sge_jobs, maximum number sge jobs to submit
                 quiver_nproc, number of nproc per job
                 unique_id, unique id to name qsub jobs
                 sge_queue and sge_env_name are read as well
    """
    time0 = datetime.now()
    if sge_opts.use_sge is not True or \
       sge_opts.max_sge_jobs == 0:  # don't use SGE
        for job in todo:
            elog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".elog")
            olog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".olog")
            cmd = "bash " + real_upath(job) + " 1>{olog} 2>{elog}".\
                format(olog=real_upath(olog), elog=real_upath(elog))
            self.run_cmd_and_log(cmd, olog=olog, elog=elog,
                                 description="Failed to run Quiver")
            submitted.append(("local", job))
        # Empty the caller's list in place (rebinding the local name would
        # leave it untouched), matching what the SGE branch does.
        del todo[:]
    else:
        # Iterate over a snapshot because entries are removed from 'todo'
        # as they are submitted. Every script is submitted in one pass.
        for job in list(todo):
            # ex: Your job 8613116 ("c20to70.sh") has been submitted
            elog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".elog")
            olog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".olog")
            # Take unique_id and sge_queue from the 'sge_opts' argument,
            # like every other option used here.
            jid = "ice_quiver_{unique_id}_{name}".format(
                unique_id=sge_opts.unique_id,
                name=op.basename(job))
            qsub_cmd = "qsub"
            if sge_opts.sge_queue is not None:
                qsub_cmd += " -q " + sge_opts.sge_queue
            qsub_cmd += " -pe {env} {n} ".format(
                n=sge_opts.quiver_nproc, env=sge_opts.sge_env_name) + \
                "-cwd -S /bin/bash -V " + \
                "-e {elog} ".format(elog=real_upath(elog)) + \
                "-o {olog} ".format(olog=real_upath(olog)) + \
                "-N {jid} ".format(jid=jid) + \
                "{job}".format(job=real_upath(job))
            job_id = self.qsub_cmd_and_log(qsub_cmd)

            submitted.append((job_id, job))
            todo.remove(job)

    self.add_log("Total time submitting todo quiver jobs: {0}".format(
        datetime.now() - time0))
def quiver_cmds_for_bin(self, cids, quiver_nproc=2):
    """Build the ordered shell commands that polish one bin of clusters.

    The bin is identified by the first and last entries of 'cids'. The
    returned list converts the bin's sam + reference to cmp.h5 and calls
    quiver, in this order: samtoh5, loadPulses, cmph5tools.py sort,
    samtools faidx, loadChemistry.py, quiver.

    cids --- cluster ids of the bin; only cids[0] and cids[-1] are used
    quiver_nproc --- value passed to quiver -j
    """
    first, last = cids[0], cids[-1]
    self.add_log("Creating quiver cmds for c{first} to c{last}".
                 format(first=first, last=last))

    sam_path = real_upath(self.sam_of_quivered_bin(first, last))
    ref_path = real_upath(self.ref_fa_of_quivered_bin(first, last))
    cmph5_path = real_upath(self.cmph5_of_quivered_bin(first, last))
    fq_path = real_upath(self.fq_of_quivered_bin(first, last))

    # (Liz) the sam file is deliberately not gzipped after conversion.
    metrics = ",".join(["QualityValue", "InsertionQV", "MergeQV",
                        "DeletionQV", "DeletionTag", "SubstitutionTag",
                        "SubstitutionQV"])

    samtoh5_cmd = "samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
        sam=sam_path, ref=ref_path, cmph5=cmph5_path)
    load_pulses_cmd = \
        "loadPulses {bas_fofn} ".format(
            bas_fofn=real_upath(self.bas_fofn)) + \
        "{cmph5} ".format(cmph5=cmph5_path) + \
        "-byread -metrics " + metrics
    sort_cmd = "cmph5tools.py sort {cmph5}".format(cmph5=cmph5_path)
    faidx_cmd = "samtools faidx {ref}".format(ref=ref_path)
    load_chemistry_cmd = "loadChemistry.py {bas_fofn} {cmph5}".format(
        bas_fofn=real_upath(self.bas_fofn), cmph5=cmph5_path)
    quiver_cmd = \
        "quiver {cmph5} ".format(cmph5=cmph5_path) + \
        "-v -j{n} ".format(n=quiver_nproc) + \
        "-r {ref} ".format(ref=ref_path) + \
        "-o {fq}".format(fq=fq_path)

    return [samtoh5_cmd, load_pulses_cmd, sort_cmd, faidx_cmd,
            load_chemistry_cmd, quiver_cmd]
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    fofn_filename --- fofn listing bax.h5/bas.h5 files
    out_filename --- fofn to which the paths of all fasta files are written
    fasta_out_dir --- directory in which the fasta files are created
    force_overwrite --- if False, an input whose fasta file already exists
                        is not converted again
    cpus --- maximum number of worker processes; capped at the number of
             files which need converting

    Raises ValueError if the fofn lists a file which is neither bax.h5
    nor bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {}  # expected out file --> (cmd, tmp)
    pool = []
    out_fns = []

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file

    cpus = min(cpus, in_queue_count)  # cap max CPU if there's fewer files to convert
    for i in xrange(cpus):
        p = Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
        pool.append(p)

    # starting & joining pool workers
    for p in pool:
        p.start()

    for p in pool:
        # Wait up to 1200 seconds per worker, then give up on it.
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()

    # check that all files exist
    # if one does not, force it to run locally
    # NOTE(review): with force_overwrite=True an output that existed before
    # this call also passes the check -- confirm that is acceptable.
    for out_file, (cmd, tmp_out_file) in outfile_track.iteritems():
        if not op.exists(out_file):
            in_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(in_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12,
                          use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to an output pickle file.

    sa_file --- suffix array passed to blasr -sa if it exists; when None,
                input_fasta + ".sa" is used if present
    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    done_filename --- file touched at the end; defaults to
                      out_pickle + '.DONE'
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    use_finer_qv --- if True load QVs with ProbFromQV, otherwise convert
                     to fastq and load with ProbFromFastq

    Raises RuntimeError if the BLASR command returns a non-zero code.
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            # Report the elapsed time only after the QVs are loaded.
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            # Report the elapsed time only after the QVs are loaded.
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    # Sets were only needed to drop duplicates; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # Pickle data belongs in a binary-mode file.
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
def concat_sam(samfiles, outsam_filename):
    """
    Concatenate the sam files in 'samfiles' into 'outsam_filename'.

    Each input is expected to start with exactly these header lines:
    @HD VN:1.3.1
    @SQ SN:c31 LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG ID:2caa54eef6 PU:in.raw_with_partial.fa SM:NO_CHIP_ID
    @PG ID:BLASR VN:1.3.1.126469 CL:blasr in.raw_with_partial.fa
        g_consensus.fa -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    The output header is the @HD line of the first file, the @SQ line of
    every file, then the @RG and @PG lines of the first file. All
    remaining lines of every file follow.

    NOTE: check for M5 conflicts; manipulate them if it conflicts

    Raises ValueError if 'samfiles' is empty or an M5 conflict cannot be
    resolved, and IOError if the final concatenation fails.
    """
    # Validate before creating any file so nothing is left behind.
    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    sq_filename = outsam_filename + '.sq'
    bd_filename = outsam_filename + '.bd'
    md5_seen = set()  # full "M5:..." fields already written

    # @HD, @RG and @PG are taken from the first file only.
    with open(samfiles[0]) as h:
        hd_line = h.readline()
        assert hd_line.startswith('@HD')
        assert h.readline().startswith('@SQ')
        rg_line = h.readline()
        assert rg_line.startswith('@RG')
        pg_line = h.readline()
        assert pg_line.startswith('@PG')

    with open(sq_filename, 'w') as f_sq, open(bd_filename, 'w') as f_bd:
        f_sq.write(hd_line)
        for f in samfiles:
            with open(f) as h:
                assert h.readline().startswith('@HD')
                line = h.readline()
                assert line.startswith('@SQ')
                # ------- check for MD5 conflicts ----------- #
                m5 = line.strip().split()[-1]
                assert m5.startswith("M5:")
                if m5 in md5_seen:
                    # Replace the digest by a permutation of itself which
                    # has not been used yet. The attempts are bounded
                    # because a digest made of a single repeated character
                    # has no other permutation.
                    chars = list(m5[3:])
                    for _attempt in range(1000):
                        random.shuffle(chars)
                        m5 = "M5:" + "".join(chars)
                        if m5 not in md5_seen:
                            break
                    else:
                        raise ValueError(
                            "Unable to resolve MD5 conflict in {0}".format(f))
                    line = line[:line.find('M5:')] + m5 + '\n'
                    logging.debug("MD5 conflict: change to {0}".format(m5[3:]))
                md5_seen.add(m5)
                f_sq.write(line)
                # ----- end MD5 checking and writing --------- #
                assert h.readline().startswith('@RG')
                assert h.readline().startswith('@PG')
                for line in h:
                    f_bd.write(line)
        f_sq.write(rg_line)
        f_sq.write(pg_line)

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        raise IOError("Failed to concat sam files! Abort." + str(_msg))

    os.remove(sq_filename)
    os.remove(bd_filename)
def createPickles(self):
    """Create one partial-uc pickle per entry of self.fastq_filenames.

    For every input an 'ice_partial.py one' command is composed, logged
    and written to the matching entry of self.script_filenames. The
    script is then either submitted through qsub (when
    self.sge_opts.use_sge is True) or the command is run locally with its
    output redirected to IcePartial.<idx>.olog/.elog in self.log_dir.
    """
    self.add_log("Mapping non-full-length reads to consensus isoforms.")
    self.add_log("Creating pickles...", level=logging.INFO)

    # using --blasr_nproc=4 because DALIGNER uses only 4 cores
    for idx, fq in enumerate(self.fastq_filenames):
        # one partial_uc.pickle per split non-full-length reads file
        cmd_parts = [
            "ice_partial.py one {i} ".format(i=real_upath(fq)),
            "{r} ".format(r=real_upath(self.ref_fasta)),
            "{o} ".format(o=real_upath(self.pickle_filenames[idx])),
            "--blasr_nproc={n} ".format(n=4),
            "--done={d} ".format(d=real_upath(self.done_filenames[idx])),
        ]
        if self.ccs_fofn is not None:
            cmd_parts.append(
                "--ccs_fofn={f} ".format(f=real_upath(self.ccs_fofn)))
        if self.sa_file is not None:
            cmd_parts.append(
                "--sa={sa} ".format(sa=real_upath(self.sa_file)))
        cmd = "".join(cmd_parts)

        script_fn = self.script_filenames[idx]
        self.add_log("Writing command to script {fsh}".
                     format(fsh=script_fn))
        self.add_log("CMD: {0}".format(cmd))
        with open(script_fn, 'w') as fsh:
            fsh.write(cmd + "\n")

        # determine elog & olog
        log_prefix = op.join(self.log_dir,
                             'IcePartial.{idx}'.format(idx=idx))
        elog = log_prefix + ".elog"
        olog = log_prefix + ".olog"
        jid = "ice_partial_{unique_id}_{name}".format(
            unique_id=self.sge_opts.unique_id,
            name=op.basename(fq))

        qsub_parts = ["qsub"]
        if self.sge_opts.sge_queue is not None:
            qsub_parts.append(" -q " + self.sge_opts.sge_queue)
        qsub_parts.extend([
            " -pe {env} {n} ".format(env=self.sge_opts.sge_env_name, n=4),
            "-cwd -S /bin/bash -V ",
            "-e {elog} ".format(elog=real_upath(elog)),
            "-o {olog} ".format(olog=real_upath(olog)),
            "-N {jid} ".format(jid=jid),
            "{sh}".format(sh=real_upath(self.script_filenames[idx])),
        ])
        qsub_cmd = "".join(qsub_parts)

        self.add_log("Creating a pickle for {f}".format(f=fq))

        if self.sge_opts.use_sge is True:
            self.qsub_cmd_and_log(qsub_cmd)
            continue

        # Local run: redirect stdout/stderr to the log files.
        cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                           elog=real_upath(elog))
        self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    Convert every bax.h5/bas.h5 file listed in fofn_filename to
    <basename>.fasta inside fasta_out_dir, and write the fasta paths to
    out_filename, which should usually be 'input.fasta.fofn'.

    Conversions are queued as (cmd, tmp_out_file, out_file) tuples and
    consumed by 'cpus' convert_fofn_to_fasta_worker processes, which are
    started only after the whole fofn has been queued. An input whose
    fasta file already exists is not queued unless force_overwrite is
    set.

    Raises ValueError if the fofn lists a file which is neither bax.h5
    nor bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    h5_files = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # shared state for the worker processes
    manager = Manager()
    out_fns = manager.list()
    in_queue = manager.Queue(99999)
    pool = [Process(target=convert_fofn_to_fasta_worker,
                    args=(in_queue, out_fns))
            for _ in xrange(cpus)]

    for h5_file in h5_files:
        logging.debug("converting h5 file: {f}.".format(f=h5_file))
        if not h5_file.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        base = op.basename(h5_file)
        tmp_out_file = op.join(fasta_out_dir, base + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, base + '.fasta')

        if force_overwrite or not op.exists(out_file):
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(h5_file)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            print >> sys.stderr, "DEBUG: putting in queue:", cmd, tmp_out_file, out_file
            in_queue.put((cmd, tmp_out_file, out_file))
            print >> sys.stderr, "DEBUG: put in queue:", cmd, tmp_out_file, out_file
            continue

        # Output is already there: record it and drop any stale tmp file.
        logging.debug("File {0} already exists. skipping.".format(out_file))
        out_fns.append(out_file)
        if op.exists(tmp_out_file):
            os.remove(tmp_out_file)

    # start all workers, then wait for every one of them
    for worker in pool:
        worker.start()
    for worker in pool:
        worker.join()

    write_files_to_fofn(out_fns, out_filename)
def createPickles(self):
    """Create one partial-uc pickle per entry of self.fasta_filenames.

    For every input an 'ice_partial.py one' command is composed, logged
    and written to the matching entry of self.script_filenames. The
    script is then either submitted through qsub (when
    self.sge_opts.use_sge is True) or the command is run locally with its
    output redirected to IcePartial.<idx>.olog/.elog in self.log_dir.
    """
    self.add_log("Mapping non-full-length reads to consensus isoforms.")
    self.add_log("Creating pickles...", level=logging.INFO)

    # using --blasr_nproc=4 because DALIGNER uses only 4 cores
    for idx, fa in enumerate(self.fasta_filenames):
        # one partial_uc.pickle per split non-full-length reads file
        cmd_parts = [
            "ice_partial.py one {i} ".format(i=real_upath(fa)),
            "{r} ".format(r=real_upath(self.ref_fasta)),
            "{o} ".format(o=real_upath(self.pickle_filenames[idx])),
            "--blasr_nproc={n} ".format(n=4),
            "--done={d} ".format(d=real_upath(self.done_filenames[idx])),
        ]
        if self.ccs_fofn is not None:
            cmd_parts.append(
                "--ccs_fofn={f} ".format(f=real_upath(self.ccs_fofn)))
        if self.sa_file is not None:
            cmd_parts.append(
                "--sa={sa} ".format(sa=real_upath(self.sa_file)))
        cmd = "".join(cmd_parts)

        script_fn = self.script_filenames[idx]
        self.add_log("Writing command to script {fsh}".
                     format(fsh=script_fn))
        self.add_log("CMD: {0}".format(cmd))
        with open(script_fn, 'w') as fsh:
            fsh.write(cmd + "\n")

        # determine elog & olog
        log_prefix = op.join(self.log_dir,
                             'IcePartial.{idx}'.format(idx=idx))
        elog = log_prefix + ".elog"
        olog = log_prefix + ".olog"
        jid = "ice_partial_{unique_id}_{name}".format(
            unique_id=self.sge_opts.unique_id,
            name=op.basename(fa))

        qsub_parts = ["qsub"]
        if self.sge_opts.sge_queue is not None:
            qsub_parts.append(" -q " + self.sge_opts.sge_queue)
        qsub_parts.extend([
            " -pe {env} {n} ".format(env=self.sge_opts.sge_env_name, n=4),
            "-cwd -S /bin/bash -V ",
            "-e {elog} ".format(elog=real_upath(elog)),
            "-o {olog} ".format(olog=real_upath(olog)),
            "-N {jid} ".format(jid=jid),
            "{sh}".format(sh=real_upath(self.script_filenames[idx])),
        ])
        qsub_cmd = "".join(qsub_parts)

        self.add_log("Creating a pickle for {f}".format(f=fa))

        if self.sge_opts.use_sge is True:
            self.qsub_cmd_and_log(qsub_cmd)
            continue

        # Local run: redirect stdout/stderr to the log files.
        cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                           elog=real_upath(elog))
        self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          sa_file=None, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12,
                          use_finer_qv=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {'partial_uc': {isoform_id: [read_ids]},
         'nohit': set(no_hit_read_ids)}
    to an output pickle file.

    sa_file --- suffix array passed to blasr -sa if it exists; when None,
                input_fasta + ".sa" is used if present
    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    done_filename --- file touched at the end; defaults to
                      out_pickle + '.DONE'
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    use_finer_qv --- if True load QVs with ProbFromQV, otherwise convert
                     to fastq and load with ProbFromFastq

    Raises RuntimeError if the BLASR command returns a non-zero code.
    """
    input_fasta = realpath(input_fasta)
    m5_file = input_fasta + ".blasr"
    out_pickle = realpath(out_pickle)
    if sa_file is None:
        if op.exists(input_fasta + ".sa"):
            sa_file = input_fasta + ".sa"

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} -bestn 5 ".format(r=real_upath(ref_fasta)) + \
          "-nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "-maxScore -1000 -minPctIdentity 85 " + \
          "-out {o} ".format(o=real_upath(m5_file))
    if sa_file is not None and op.exists(sa_file):
        cmd += "-sa {sa}".format(sa=real_upath(sa_file))

    logging.info("CMD: {cmd}".format(cmd=cmd))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Command failed: {cmd}\n{e}".format(cmd=cmd, e=_msg)
        logging.error(errMsg)
        raise RuntimeError(errMsg)

    if ccs_fofn is None:
        logging.info("Loading probability from model (0.01,0.07,0.06)")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        start_t = time.time()
        if use_finer_qv:
            probqv = ProbFromQV(input_fofn=ccs_fofn,
                                fasta_filename=input_fasta)
            # Report the elapsed time only after the QVs are loaded.
            logging.info("Loading QVs from {i} + {f} took {s} secs".format(
                f=ccs_fofn, i=input_fasta, s=time.time() - start_t))
        else:
            input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
            logging.info("Converting {i} + {f} --> {fq}".format(
                i=input_fasta, f=ccs_fofn, fq=input_fastq))
            ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
            probqv = ProbFromFastq(input_fastq)
            # Report the elapsed time only after the QVs are loaded.
            logging.info("Loading QVs from {fq} took {s} secs".format(
                fq=input_fastq, s=time.time() - start_t))

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qvmean_get_func=probqv.get_mean,
                                 qver_get_func=probqv.get_smoothed,
                                 ece_penalty=1,
                                 ece_min_len=20,
                                 same_strand_only=False,
                                 max_missed_start=200,
                                 max_missed_end=50)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    # Sets were only needed to drop duplicates; store plain lists.
    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in FastaReader(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: {f}.".format(f=out_pickle))
    # Pickle data belongs in a binary-mode file.
    with open(out_pickle, 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating {f}.".format(f=done_filename))
    touch(done_filename)
def concat_sam(samfiles, outsam_filename):
    """
    Concatenate the SAM files in samfiles into outsam_filename.

    Every input is expected to start with exactly four header lines, in
    this order: @HD, @SQ, @RG, @PG. The header looks like:

    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fa       SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fa g_consensus.fa -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    The output header is the @HD/@RG/@PG lines of the first input plus one
    @SQ line per input; all remaining lines of every input are appended
    after the header.

    NOTE: check for M5 conflicts; manipulate them if it conflicts. When an
    @SQ line's M5 value was already written, the value is replaced by a
    random shuffle of its characters that has not been used yet.

    Raises ValueError if samfiles is empty or if no unused M5 value can be
    generated, AssertionError if an input header does not have the expected
    layout, and IOError if the final concatenation command fails.
    """
    # Validate before creating any intermediate file, so a bad call does
    # not leave stray '.sq'/'.bd' files or open handles behind.
    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    sq_filename = outsam_filename + '.sq'  # collects the merged header
    bd_filename = outsam_filename + '.bd'  # collects the alignment lines

    # The @HD, @RG and @PG lines of the first file are reused for the
    # merged output.
    with open(samfiles[0]) as h:
        hd_line = h.readline()
        assert hd_line.startswith('@HD')
        assert h.readline().startswith('@SQ')
        rg_line = h.readline()
        assert rg_line.startswith('@RG')
        pg_line = h.readline()
        assert pg_line.startswith('@PG')

    # Every entry of md5_seen carries the 'M5:' prefix so that original
    # and regenerated checksums are compared in the same form.
    md5_seen = set()
    max_shuffles = 1000

    with open(sq_filename, 'w') as f_sq, open(bd_filename, 'w') as f_bd:
        f_sq.write(hd_line)
        for f in samfiles:
            with open(f) as h:
                assert h.readline().startswith('@HD')
                line = h.readline()
                assert line.startswith('@SQ')
                # ------- check for MD5 conflicts ----------- #
                m5 = line.strip().split()[-1]
                assert m5.startswith("M5:")
                if m5 in md5_seen:
                    chars = list(m5[3:])
                    for _attempt in range(max_shuffles):
                        # create a random m5 string.
                        random.shuffle(chars)
                        candidate = 'M5:' + "".join(chars)
                        if candidate not in md5_seen:
                            break
                    else:
                        raise ValueError(
                            "Unable to create a unique M5 for " + str(f))
                    # m5 is the last token, so the last 'M5:' starts it.
                    line = line[:line.rfind('M5:')] + candidate + '\n'
                    logging.debug(
                        "MD5 conflict: change to {0}".format(candidate[3:]))
                    m5 = candidate
                md5_seen.add(m5)
                f_sq.write(line)
                # ----- end MD5 checking and writing --------- #
                assert h.readline().startswith('@RG')
                assert h.readline().startswith('@PG')
                # Everything after the four header lines is copied as-is.
                f_bd.writelines(h)
        # @RG and @PG go last in the merged header.
        f_sq.write(rg_line)
        f_sq.write(pg_line)

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        raise IOError("Failed to concat sam files! Abort." + str(_msg))

    os.remove(sq_filename)
    os.remove(bd_filename)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False, cpus=1):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'

    fofn_filename --- fofn listing the input bax.h5/bas.h5 files
    out_filename --- fofn to write, listing the fasta files
    fasta_out_dir --- directory in which the fasta files are created
    force_overwrite --- if False, an input whose fasta file already exists
                        is not converted again
    cpus --- maximum number of worker processes to start

    Conversion commands are queued and consumed by up to 'cpus' processes
    running convert_fofn_to_fasta_worker. Each process is joined with a
    1200 second timeout and terminated if it is still alive afterwards.
    Any expected fasta file that is still missing after that is converted
    in the current process.

    Raises ValueError if the fofn lists a file that is neither *.bax.h5
    nor *.bas.h5.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_fofn(fofn_filename)
    mkdir(fasta_out_dir)

    # multiprocessing worker stuff
    manager = Manager()
    in_queue = manager.Queue(len(in_fns))
    in_queue_count = 0
    outfile_track = {}  # expected out file --> (cmd, tmp)
    out_fns = []

    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not in_fn.endswith(('.bax.h5', '.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        tmp_out_file = op.join(fasta_out_dir, in_basename + '.fasta.tmp')
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')

        if op.exists(out_file) and not force_overwrite:
            logging.debug(
                "File {0} already exists. skipping.".format(out_file))
            out_fns.append(out_file)
            # A leftover tmp file is stale once the final fasta exists.
            if op.exists(tmp_out_file):
                os.remove(tmp_out_file)
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(tmp_out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            logging.debug("putting in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            in_queue_count += 1
            outfile_track[out_file] = (cmd, tmp_out_file)
            logging.debug("put in queue: {0} {1} {2}".format(
                cmd, tmp_out_file, out_file))

    # cap max CPU if there's fewer files to convert
    cpus = min(cpus, in_queue_count)
    pool = [Process(target=convert_fofn_to_fasta_worker, args=(in_queue,))
            for _ in range(cpus)]

    # starting & joining pool workers
    for p in pool:
        p.start()
    for p in pool:
        p.join(timeout=1200)
        if p.is_alive():
            p.terminate()

    # check that all files exist;
    # if one does not, force to run locally
    for out_file, (cmd, tmp_out_file) in outfile_track.items():
        if not op.exists(out_file):
            logging.warning(
                "File {0} is missing, converting locally.".format(out_file))
            in_queue.put((cmd, tmp_out_file, out_file))
            convert_fofn_to_fasta_worker(in_queue)
        out_fns.append(out_file)

    write_files_to_fofn(out_fns, out_filename)
def quiver_cmds_for_bin(self, cids, quiver_nproc=2):
    """Build the list of shell commands that run quiver on one bin.

    The bin is identified by the first and last entries of 'cids'. The
    returned commands are, in order: samtoh5 (sam & ref --> cmp.h5), gzip
    of the sam file, loadPulses, cmph5tools.py sort, samtools faidx,
    loadChemistry.py and finally quiver, which is given 'quiver_nproc'
    through its -j option. Nothing is executed here; the commands are
    only returned as strings.
    """
    head_cid = cids[0]
    tail_cid = cids[-1]
    self.add_log("Creating quiver cmds for c{first} to c{last}".format(
        first=head_cid, last=tail_cid))

    # Files belonging to this bin of clusters.
    sam_fn = self.sam_of_quivered_bin(head_cid, tail_cid)
    ref_fn = self.ref_fa_of_quivered_bin(head_cid, tail_cid)
    cmph5_fn = self.cmph5_of_quivered_bin(head_cid, tail_cid)
    fq_fn = self.fq_of_quivered_bin(head_cid, tail_cid)

    # Metrics handed to loadPulses as one comma-separated argument.
    pulse_metrics = ",".join((
        "QualityValue",
        "InsertionQV",
        "MergeQV",
        "DeletionQV",
        "DeletionTag",
        "SubstitutionTag",
        "SubstitutionQV",
    ))

    return [
        "samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=real_upath(sam_fn),
            ref=real_upath(ref_fn),
            cmph5=real_upath(cmph5_fn)),
        "gzip {sam}".format(sam=real_upath(sam_fn)),
        "loadPulses {bas_fofn} ".format(bas_fofn=real_upath(self.bas_fofn)) +
        "{cmph5} ".format(cmph5=real_upath(cmph5_fn)) +
        "-byread -metrics " + pulse_metrics,
        "cmph5tools.py sort {cmph5}".format(cmph5=real_upath(cmph5_fn)),
        "samtools faidx {ref}".format(ref=real_upath(ref_fn)),
        "loadChemistry.py {bas_fofn} {cmph5}".format(
            bas_fofn=real_upath(self.bas_fofn),
            cmph5=real_upath(cmph5_fn)),
        "quiver {cmph5} ".format(cmph5=real_upath(cmph5_fn)) +
        "-v -j{n} ".format(n=quiver_nproc) +
        "-r {ref} ".format(ref=real_upath(ref_fn)) +
        "-o {fq}".format(fq=real_upath(fq_fn)),
    ]