def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'
    Modified: 09/14/2015, both ends of subreads in fasta files will be
    trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==>
        #      tmp_out_file = m11xxxx.1.bax.h5.fasta.tmp
        #      out_file = m11xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists. skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  " {out} ".format(out=real_upath(out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)

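# Usage sketch (illustrative only; 'movies.fofn' and the output paths below are
# hypothetical, and the helper functions used above are assumed to be importable
# from this module):
#
#   convert_fofn_to_fasta(fofn_filename="movies.fofn",
#                         out_filename="input.fasta.fofn",
#                         fasta_out_dir="fasta_out",
#                         force_overwrite=False)
#
# Each m*.bax.h5 listed in movies.fofn yields fasta_out/m*.bax.h5.fasta, and the
# paths of all generated FASTA files are written to input.fasta.fofn.
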
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])

    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % real_upath(tmp_dir))

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))

def _startPhmmers(self, chunked_reads_fns, chunked_dom_fns,
                  out_dom_fn, primer_fn, pbmatrix_fn):
    """Run phmmers on chunked reads files in 'chunked_reads_fns' and
    generate chunked dom files as listed in 'chunked_dom_fns', finally
    concatenate dom files to 'out_dom_fn'."""
    logging.info("Start to launch phmmer on chunked reads.")
    jobs = []
    for reads_fn, domFN in zip(chunked_reads_fns, chunked_dom_fns):
        p = multiprocessing.Process(
            target=self._phmmer,
            args=(reads_fn, domFN, primer_fn, pbmatrix_fn))
        jobs.append((p, domFN))
        p.start()

    for p, domFN in jobs:
        p.join()
        cmd = "cat {0} >> {1}".format(real_upath(domFN),
                                      real_upath(out_dom_fn))
        _output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException(
                "Error concatenating dom files: {e}".format(e=str(errMsg)))

    self._cleanup(chunked_reads_fns)
    self._cleanup(chunked_dom_fns)

def blasr_for_quiver(query_fn, ref_fasta, out_fn, bam=False,
                     run_cmd=True, blasr_nproc=12):
    """
    query_fn  --- should be in.raw.fasta|bam
    ref_fasta --- reference fasta (ex: g_consensus.fasta) to align to
    out_fn    --- sam|bam output aligning query_fn to ref_fasta

    blasr query_fn ref_fasta -out out_fn -sam -clipping soft
    blasr query_fn ref_fasta -out out_fn -bam
    """
    cmd = "blasr {i} ".format(i=real_upath(query_fn)) + \
          "{r} ".format(r=real_upath(ref_fasta)) + \
          "--nproc {n} ".format(n=blasr_nproc) + \
          "--bestn 5 --nCandidates 10 " + \
          ("--sam --clipping soft " if not bam else "--bam ") + \
          "--out {o} ".format(o=real_upath(out_fn)) + \
          "1>/dev/null 2>/dev/null"
    if run_cmd:
        execute(cmd)
    else:
        logging.debug("CMD: " + cmd)
    return cmd

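# Usage sketch (file names are illustrative, not from the source): build the
# BAM alignment command used for quiver/arrow polishing without running it,
# e.g. to inspect it or submit it elsewhere.
#
#   cmd = blasr_for_quiver(query_fn="in.raw_with_partial.bam",
#                          ref_fasta="g_consensus.fasta",
#                          out_fn="out.bam", bam=True,
#                          run_cmd=False, blasr_nproc=8)
#   logging.info("Would run: %s", cmd)
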
def map_isoforms_to_reference_transcripts(self):
    """Map isoforms to reference transcripts."""
    m5out = self.output_analysis_fn + ".blasr.out.m5"
    cmd = 'blasr %s %s --bestn 1 -m 5 --out %s' % \
          (real_upath(self.isoseq_output_fa),
           real_upath(self.reference_transcripts_fn),
           real_upath(m5out))
    execute(cmd)
    return [r for r in BLASRM5Reader(m5out)]

def _phmmer(self, reads_fn, domFN, primer_fn, pbmaxtrixFN):
    """Invoke phmmer once."""
    cmd = "phmmer --cpu 1 --domtblout {d} --noali --domE 1 ".\
          format(d=real_upath(domFN)) + \
          "--mxfile {m} ".format(m=real_upath(pbmaxtrixFN)) + \
          "--popen 0.07 --pextend 0.07 {r} {p} > /dev/null".\
          format(r=real_upath(reads_fn), p=real_upath(primer_fn))
    logging.debug("Calling phmmer: {cmd}".format(cmd=cmd))
    _output, errCode, errMsg = backticks(cmd)
    if errCode != 0:
        raise ClassifierException(
            "Error calling phmmer: {e}.".format(e=str(errMsg)))

def createPickles(self):
    """For each file in fasta_filenames, call 'ICE_PARTIAL_PY one' to
    build clusters and to save results to a pickle file. When all pickles
    are done, union all pickles.
    """
    self.add_log("Mapping non-full-length reads to consensus isoforms.")
    self.add_log("Creating pickles...", level=logging.INFO)

    for idx, fa in enumerate(self.fasta_filenames):
        # for each splitted non-full-length reads fasta file, build
        # partial_uc.pickle
        cmd = ICE_PARTIAL_PY + " " + \
              "one {i} ".format(i=real_upath(fa)) + \
              "{r} ".format(r=real_upath(self.ref_fasta)) + \
              "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
              "--blasr_nproc={n} ".format(n=self.sge_opts.blasr_nproc) + \
              "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
        if self.ccs_fofn is not None:
            cmd += "--ccs_fofn={f} ".format(f=real_upath(self.ccs_fofn))
        if self.tmp_dir is not None:
            cmd += "--tmp_dir={t}".format(t=self.tmp_dir)

        self.add_log("Writing command to script {fsh}".
                     format(fsh=self.script_filenames[idx]))
        with open(self.script_filenames[idx], 'w') as fsh:
            fsh.write(cmd + "\n")

        # determine elog & olog
        partial_log_fn = op.join(self.log_dir,
                                 'IcePartial.{idx}'.format(idx=idx))
        elog = partial_log_fn + ".elog"
        olog = partial_log_fn + ".olog"
        jid = "ice_partial_{unique_id}_{name}".format(
            unique_id=self.sge_opts.unique_id,
            name=op.basename(fa))

        qsub_cmd = "qsub " + \
                   "-pe smp {n} ".format(n=self.sge_opts.blasr_nproc) + \
                   "-cwd -S /bin/bash -V " + \
                   "-e {elog} ".format(elog=real_upath(elog)) + \
                   "-o {olog} ".format(olog=real_upath(olog)) + \
                   "-N {jid} ".format(jid=jid) + \
                   "{sh}".format(sh=real_upath(self.script_filenames[idx]))

        self.add_log("Creating a pickle for {f}".format(f=fa))
        if self.sge_opts.use_sge is True:
            self.qsub_cmd_and_log(qsub_cmd)
        else:
            cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                               elog=real_upath(elog))
            self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)

def build_sa(input_fasta, out_sa):
    """Generate suffix array of input_fasta"""
    if op.exists(input_fasta):
        cmd = "sawriter {o} {i} -blt 8 -welter ".\
              format(o=real_upath(out_sa), i=real_upath(input_fasta))
        dummy_out, code, dummy_msg = backticks(cmd)
        if code == 0:
            return True
        else:
            # If failed to generate suffix array, warning.
            logging.warn("Unable to create suffix array for {f}.".
                         format(f=input_fasta))
            return False
    else:
        raise IOError("Unable to find fasta file {f}.".format(f=input_fasta))

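# Usage sketch (illustrative file names; sawriter from the BLASR suite is
# assumed to be on PATH): the return value only reports whether the suffix
# array was built, so callers can proceed without it on failure.
#
#   if not build_sa("isoseq_flnc.fasta", "isoseq_flnc.fasta.sa"):
#       logging.warning("Proceeding without a suffix array.")
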
def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
    """Align input reads against itself using BLASR."""
    if op.exists(outFN):
        logging.info("{0} already exists. No need to run BLASR.".format(outFN))
    else:
        cmd = "blasr {q} ".format(q=real_upath(queryFa)) + \
              "{t} ".format(t=real_upath(targetFa)) + \
              "-m 5 --maxLCPLength 15 " + \
              "--nproc {cpu} ".format(cpu=sge_opts.blasr_nproc) + \
              "--maxScore {score} ".format(score=ice_opts.maxScore) + \
              "--bestn {n} --nCandidates {n} ".format(n=ice_opts.bestn) + \
              "--out {o} ".format(o=real_upath(outFN)) + \
              "1>/dev/null 2>/dev/null"
        logging.info("Calling {cmd}".format(cmd=cmd))
        execute(cmd)

def generate_batch_cmds(csv_filename, dirname, cmd_filename, cpus):  # , nfl_filename, tucked_filename, subread_xml, cpus
    cmd_f = open(cmd_filename, 'w')
    for r in DictReader(open(csv_filename), delimiter=','):
        cid = r['cluster']
        d2 = os.path.join(dirname, cid)
        if not os.path.exists(d2):
            print("Directory {0} does not exist! Abort!".format(d2),
                  file=sys.stderr)
            sys.exit(-1)
        cmd_f.write("cd {0}\n".format(real_upath(d2)))
        fa_files, fq_files = preprocess_flnc_split_if_necessary(
            d2, int(r['size']), flnc_split=20000)
        ice2_aligner = 'blasr' if int(r['size']) <= 20000 else 'daligner'
        cmd_f.write("run_IceInit2.py {fa} init.uc.pickle --aligner_choice=blasr --cpus={c}\n".format(c=cpus, fa=fa_files[0]))
        cmd_f.write("run_IceIterative2.py {fas} {fqs} isoseq_flnc.fasta . ".format(fas=",".join(fa_files), fqs=",".join(fq_files)) + \
                    "--init_uc_pickle=init.uc.pickle --aligner_choice={aln} ".format(aln=ice2_aligner) + \
                    "--blasr_nproc {c} --gcon_nproc {c2}\n".format(c=cpus, c2=min(cpus, 4)))
        # cmd_f.write("run_IcePartial2.py all {nfl},{tucked} ".format(nfl=nfl_filename, tucked=tucked_filename) + \
        #             "output/final.consensus.fasta nfl.pickle " + \
        #             "--root_dir . --aligner_choice=blasr --cpus={c}\n".format(c=cpus))
        # cmd_f.write("run_IceArrow2.py all --subread_xml {s} ".format(s=subread_xml) + \
        #             "--blasr_nproc {c} --arrow_nproc {c} .\n".format(c=cpus))
    cmd_f.close()

def sort_sam(in_sam, out_sam):
    """
    Sort input sam file and write to output sam file.
    """
    # Copy SAM headers
    copy_sam_header(in_sam=in_sam, out_sam=out_sam)

    # Call sort to sort gmap output sam file
    cmd_args = ['sort', '-k 3,3', '-k 4,4n', real_upath(in_sam),
                '| grep -v \'^@\' ', ' >> ', real_upath(out_sam)]

    if os.stat(in_sam).st_size == 0:  # overwrite cmds if file is empty
        cmd_args = ['touch', out_sam]

    execute(' '.join(cmd_args))

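# Usage sketch (illustrative names): sort a GMAP SAM by target name and
# position while keeping the copied header at the top of the output.
#
#   sort_sam(in_sam="hq_isoforms.fastq.sam.tmp",
#            out_sam="hq_isoforms.fastq.sam")
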
def arrow_cmds_for_bin(self, cids):
    """
    Return a list of quiver related cmds. Input format must be BAM.
    """
    first, last = cids[0], cids[-1]
    self.add_log("Creating arrow cmds for c{first} to c{last}".format(
        first=first, last=last))

    bin_ref_fa = self.ref_fa_of_arrowed_bin(first, last)
    bin_fq = self.fq_of_arrowed_bin(first, last)
    bin_unsorted_bam_file = self.bam_of_arrowed_bin(first, last,
                                                    is_sorted=False)
    bin_bam_file = self.bam_of_arrowed_bin(first, last, is_sorted=True)
    bin_bam_prefix = self._arrowed_bin_prefix(first, last)

    cmds = []
    if not self.use_samtools_v_1_3_1:
        # SA2.*, SA3.0, SA3.1 and SA3.2 use v0.1.19
        cmds.append("samtools sort {f} {d}".format(
            f=real_upath(bin_unsorted_bam_file),
            d=real_upath(bin_bam_prefix)))
    else:
        # SA3.3 and up use v1.3.1
        cmds.append("samtools sort {f} -o {d}.bam".format(
            f=real_upath(bin_unsorted_bam_file),
            d=real_upath(bin_bam_prefix)))
    cmds.append("samtools index {f}".format(f=real_upath(bin_bam_file)))
    cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
    cmds.append("pbindex {f}".format(f=real_upath(bin_bam_file)))
    # cmds.append("variantCaller --maskRadius 3 -x 1 --minAccuracy 0 --algorithm=best " +
    #             "{f} ".format(f=real_upath(bin_bam_file)) +
    #             "--verbose -j{n} ".format(n=self.sge_opts.arrow_nproc) +
    #             "--referenceFilename={ref} ".format(ref=real_upath(bin_ref_fa)) +
    #             "-o {fq}".format(fq=real_upath(bin_fq)))
    cmds.append("variantCaller --algorithm=best " +
                "{f} ".format(f=real_upath(bin_bam_file)) +
                "--verbose -j{n} ".format(n=self.sge_opts.arrow_nproc) +
                "--referenceFilename={ref} ".format(ref=real_upath(bin_ref_fa)) +
                "-o {fq}".format(fq=real_upath(bin_fq)))
    return cmds

def sanity_check_sge(sge_opts, scriptDir, testDirName="gcon_test_dir"):
    """Sanity check if sge can work."""
    scriptDir = realpath(scriptDir)
    testDir = op.join(scriptDir, testDirName)

    if not op.exists(scriptDir):
        os.makedirs(scriptDir)
    if not op.exists(testDir):
        os.makedirs(testDir)

    testSh = op.join(scriptDir, 'test.sh')
    consensusFa = op.join(testDir, "g_consensus.fasta")
    testInFa = op.join(testDir, "gcon_in.fasta")
    if op.exists(testInFa):
        os.remove(testInFa)
    shutil.copy(GCON_IN_FA, testInFa)
    assert op.exists(testInFa)

    cmd = " ".join([gcon_py, real_upath(testInFa),
                    "{testDir}/g_consensus".format(testDir=real_upath(testDir)),
                    "c1"])

    write_cmd_to_script(cmd=cmd, script=testSh)

    assert op.exists(testSh)
    cmd = sge_opts.qsub_cmd(script=real_upath(testSh),
                            num_threads=1, wait_before_exit=True)
    logging.debug("Submitting cmd: " + cmd)
    backticks(cmd)

    if not filecmp.cmp(consensusFa, GCON_OUT_FA):
        errMsg = "Trouble running qsub or output is not as " + \
                 "expected ({0} and {1} must agree). Abort!".format(
                     consensusFa, GCON_OUT_FA)
        logging.error(errMsg)
        return False
    else:
        shutil.rmtree(testDir)
        logging.info("sge and gcon check passed.")
        return True

def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to references by gmap, generate a sam output and sort sam.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" % gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir} and ls ${gmap_db_name}.* files
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
                'ls *.iit *meta', 'sleep 3', 'cd %s' % real_upath(cwd)]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap', '-D {d}'.format(d=real_upath(gmap_db_dir)),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0',
                '-z sense_force',
                '--cross-species',
                '-f samse',
                '--max-intronlength-ends 200000',  # for long genes
                real_upath(gmap_input_filename),
                '>', real_upath(unsorted_sam_filename),
                '2>{log}'.format(log=real_upath(log_filename))]

    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)

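# Usage sketch (illustrative paths; a prebuilt GMAP database named 'hg38'
# under gmap_db/ is an assumption, not from the source):
#
#   map_isoforms_and_sort(input_filename="hq_isoforms.fastq",
#                         sam_filename="hq_isoforms.fastq.sam",
#                         gmap_db_dir="gmap_db",
#                         gmap_db_name="hg38",
#                         gmap_nproc=12)
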
def make_db(self):
    """Make dazz database for input file.
    1. fasta2DB
    2. DBsplit
    3. get & store number of blocks
    *.dazz.fasta.db will be created.
    """
    log.debug("Making DAZZ database for %s.", self.dazz_filename)
    if not op.exists(self.dazz_filename):
        raise RuntimeError(
            "%s hasn't been converted to daligner-compatible format." %
            self.input_filename)
    if op.exists(self.db_filename):
        cmd = "DBrm %s" % real_upath(self.dazz_filename)
        execute(cmd=cmd)

    cmd = "fasta2DB %s %s " % (real_upath(self.dazz_filename),
                               real_upath(self.dazz_filename))
    execute(cmd=cmd)
    cmd = "DBsplit -s200 %s" % real_upath(self.dazz_filename)
    execute(cmd)

def generate_batch_cmds_for_polishing(chunk_prefix, nfl_filename, subread_xml,
                                      cpus, cmd_filename, walltime, queue):
    subread_xml = real_upath(subread_xml)
    nfl_filename = real_upath(nfl_filename)

    fastas = glob.glob(chunk_prefix + '*.consensus.fasta')
    # verify that the pickles exists as well
    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        print "looking for", pickle
        assert os.path.exists(pickle)
        dirname = fasta[:-len('.consensus.fasta')]
        if os.path.exists(dirname):
            print >> sys.stderr, "Directory {0} already exist! Abort!".format(dirname)
            sys.exit(-1)

    cmd_f = open(cmd_filename, 'w')
    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        dirname = fasta[:-len('.consensus.fasta')]
        full_fasta = real_upath(fasta)
        full_pickle = real_upath(pickle)
        os.makedirs(dirname)
        os.chdir(dirname)
        os.symlink(full_fasta, os.path.basename(full_fasta))
        os.makedirs('output')
        os.chdir('output')
        os.symlink(full_pickle, 'final.pickle')
        os.symlink(full_fasta, 'final.consensus.fasta')
        os.chdir('../../')
        f = open(os.path.join(dirname, dirname + '.sh'), 'w')
        f.write("#!/bin/bash\n")
        f.write("source /projects/banchereau-lab/ISO-seq/annotation_processing/pitchfork_ToFU2_dev/setup-env.sh\n")
        f.write("module load gcc/4.9.2\n")
        f.write("module load graphviz\n")
        f.write("PATH=$PATH:/projects/banchereau-lab/ISO-seq/annotation_processing/cDNA_Cupcake/sequence\n")
        f.write("cd $PBS_O_WORKDIR\n")
        f.write("run_IcePartial2.py all {nfl} {p}.consensus.fasta {p}.nfl.pickle "
                "--root_dir {d} --aligner_choice=daligner --cpus={c}\n".format(
                    p=dirname, nfl=nfl_filename, d=real_upath(dirname), c=cpus))
        f.write("run_IceArrow2.py all {d} --subread_xml {s} --blasr_nproc {c} "
                "--arrow_nproc {c} --hq_min_full_length_reads=1\n".format(
                    d=real_upath(dirname), s=subread_xml, c=cpus))
        f.close()
        cmd_f.write("qsub -q {q} -l walltime={w} -l nodes=1:ppn={c} {sh}\n".format(
            sh=real_upath(f.name), c=cpus, w=walltime, q=queue))

def submit_jobs_local_or_remote(self, files_to_run):
    """
    Run jobs either locally or through SGE.
    Return a list of [(sge_job_id, filename)], which is also written to
    log/submitted_arrow_jobs.txt
    """
    flag_run_locally = (self.sge_opts.use_sge is not True) or \
                       (self.sge_opts.max_sge_jobs == 0)
    if flag_run_locally:
        self.add_log("Files to submit locally: {0}\n".format(
            ",".join(files_to_run)))
    else:
        self.add_log("Files to submit through SGE: {0}\n".format(
            ",".join(files_to_run)))

    submit_f = open(self.arrow_submission_run_file, 'w')
    submitted = []
    for file in files_to_run:
        elog = op.join(self.arrowed_log_dir, op.basename(file) + ".elog")
        olog = op.join(self.arrowed_log_dir, op.basename(file) + ".olog")
        if flag_run_locally:
            cmd = "bash {f}".format(f=real_upath(file))
            self.run_cmd_and_log(cmd, olog=olog, elog=elog,
                                 description="Failed to run Arrow")
            submitted.append(("local", file))
            submit_f.write("{0}\t{1}\n".format("local", file))
        else:
            jid = "ice_arrow_{unique_id}_{name}".format(
                unique_id=self.sge_opts.unique_id,
                name=op.basename(file))
            qsub_cmd = self.sge_opts.qsub_cmd(
                script=file,
                num_threads=self.sge_opts.arrow_nproc,
                wait_before_exit=False,
                depend_on_jobs=None,
                elog=elog, olog=olog,
                is_script=True, jobid=jid)
            job_id = self.qsub_cmd_and_log(qsub_cmd)
            submitted.append((job_id, file))
            submit_f.write("{0}\t{1}\n".format(job_id, file))
    submit_f.close()
    return submitted

def init_cluster_by_clique(self):
    """
    Only called once and in the very beginning, when (probably a subset)
    of sequences are given to generate the initial cluster.

    readsFa --- initial fasta filename, probably called *_split00.fasta
    qver_get_func --- function that returns QVs on reads
    qvmean_get_func --- function that returns the mean QV on reads
    bestn --- parameter in BLASR, higher helps in finding perfect cliques
        but bigger output
    nproc, maxScore --- parameter in BLASR, set maxScore appropriate to
        input transcript length
    ece_penalty, ece_min_len --- parameter in isoform hit calling

    Self-blasr input then iteratively find all mutually exclusive
    cliques (in decreasing size)
    Returns dict of cluster_index --> list of seqids
    which is the 'uc' dict that can be used by IceIterative
    """
    alignGraph = None

    if self.ice_opts.aligner_choice == 'blasr':
        outFN = self.readsFa + '.self.blasr'
        self._align_withBLASR(queryFa=self.readsFa, targetFa=self.readsFa,
                              outFN=outFN)
        alignGraph = self._makeGraphFromM5(m5FN=outFN)
    elif self.ice_opts.aligner_choice == 'daligner':
        try:
            runner = self._align_withDALIGNER(
                queryFa=self.readsFa,
                output_dir=op.dirname(real_upath(self.readsFa)))
            alignGraph = self._makeGraphFromLA4Ice(runner=runner)
            runner.clean_run()
        except RuntimeError:  # daligner probably crashed, fall back to blasr
            outFN = self.readsFa + '.self.blasr'
            self._align_withBLASR(queryFa=self.readsFa, targetFa=self.readsFa,
                                  outFN=outFN)
            alignGraph = self._makeGraphFromM5(m5FN=outFN)
    else:
        raise Exception, "Unrecognized aligner_choice {0}!".format(
            self.ice_opts.aligner_choice)

    uc = IceInit2._findCliques(alignGraph=alignGraph, readsFa=self.readsFa)
    return uc

def generate_batch_cmds(csv_filename, dirname, cmd_filename, cpus):  # , nfl_filename, tucked_filename, subread_xml, cpus
    cmd_f = open(cmd_filename, 'w')
    for r in DictReader(open(csv_filename), delimiter=','):
        cid = r['cluster']
        d2 = os.path.join(dirname, cid)
        if not os.path.exists(d2):
            print >> sys.stderr, "Directory {0} does not exist! Abort!".format(d2)
            sys.exit(-1)
        cmd_f.write("#!/bin/bash\n")
        cmd_f.write("source /projects/banchereau-lab/ISO-seq/annotation_processing/pitchfork_ToFU2_dev/setup-env.sh\n")
        cmd_f.write("module load gcc/4.9.2\n")
        cmd_f.write("PATH=$PATH:/projects/banchereau-lab/ISO-seq/annotation_processing/cDNA_Cupcake/sequence\n")
        cmd_f.write("cd $PBS_O_WORKDIR\n")
        cmd_f.write("cd {0}\n".format(real_upath(d2)))
        fa_files, fq_files = preprocess_flnc_split_if_necessary(
            d2, int(r['size']), flnc_split=20000)
        ice2_aligner = 'blasr' if int(r['size']) <= 20000 else 'daligner'
        cmd_f.write("run_IceInit2.py {fa} init.uc.pickle --aligner_choice=blasr --cpus={c}\n".format(c=cpus, fa=fa_files[0]))
        cmd_f.write("run_IceIterative2.py {fas} {fqs} isoseq_flnc.fasta . ".format(fas=",".join(fa_files), fqs=",".join(fq_files)) + \
                    "--init_uc_pickle=init.uc.pickle --aligner_choice={aln} ".format(aln=ice2_aligner) + \
                    "--blasr_nproc {c} --gcon_nproc {c2}\n".format(c=cpus, c2=min(cpus, 4)))
        # cmd_f.write("run_IcePartial2.py all {nfl},{tucked} ".format(nfl=nfl_filename, tucked=tucked_filename) + \
        #             "output/final.consensus.fasta nfl.pickle " + \
        #             "--root_dir . --aligner_choice=blasr --cpus={c}\n".format(c=cpus))
        # cmd_f.write("run_IceArrow2.py all --subread_xml {s} ".format(s=subread_xml) + \
        #             "--blasr_nproc {c} --arrow_nproc {c} .\n".format(c=cpus))
    cmd_f.close()

def submit_todo_quiver_jobs(self, todo, submitted, sge_opts):
    """
    todo --- a list of sh scripts to run
    submitted --- a list of sh scripts which have been submitted
    sge_opts --- SGE options, including
        use_sge, whether or not to use sge
        max_sge_jobs, maximum number sge jobs to submit
        quiver_nproc, number of nproc per job
        unique_id, unique id to name qsub jobs
    """
    self.add_log("Submitting todo quiver jobs.")
    if sge_opts.use_sge is not True or \
            sge_opts.max_sge_jobs == 0:  # don't use SGE
        for job in todo:
            elog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".elog")
            olog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".olog")
            cmd = "bash " + real_upath(job) + " 1>{olog} 2>{elog}".\
                  format(olog=real_upath(olog), elog=real_upath(elog))
            self.run_cmd_and_log(cmd, olog=olog, elog=elog,
                                 description="Failed to run Quiver")
            submitted.append(("local", job))
        todo = []
    else:
        while len(todo) > 0:
            n = min(sge_opts.max_sge_jobs, len(todo))
            for job in todo[:n]:
                # ex: Your job 8613116 ("c20to70.sh") has been submitted
                elog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".elog")
                olog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".olog")
                jid = "ice_quiver_{unique_id}_{name}".format(
                    unique_id=self.sge_opts.unique_id,
                    name=op.basename(job))
                qsub_cmd = "qsub " + \
                           "-pe smp {n} ".format(n=sge_opts.quiver_nproc) + \
                           "-cwd -S /bin/bash -V " + \
                           "-e {elog} ".format(elog=real_upath(elog)) + \
                           "-o {olog} ".format(olog=real_upath(olog)) + \
                           "-N {jid} ".format(jid=jid) + \
                           "{job}".format(job=real_upath(job))
                job_id = self.qsub_cmd_and_log(qsub_cmd)
                submitted.append((job_id, job))
                todo.remove(job)

def generate_batch_cmds_for_polishing(chunk_prefix, nfl_filename,
                                      subread_xml, cpus, cmd_filename):
    subread_xml = real_upath(subread_xml)
    nfl_filename = real_upath(nfl_filename)

    fastas = glob.glob(chunk_prefix + '*.consensus.fasta')
    # verify that the pickles exists as well
    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        print("looking for", pickle)
        assert os.path.exists(pickle)
        dirname = fasta[:-len('.consensus.fasta')]
        if os.path.exists(dirname):
            print("Directory {0} already exist! Abort!".format(dirname),
                  file=sys.stderr)
            sys.exit(-1)

    cmd_f = open(cmd_filename, 'w')
    for fasta in fastas:
        pickle = fasta[:-len('.consensus.fasta')] + '.pickle'
        dirname = fasta[:-len('.consensus.fasta')]
        full_fasta = real_upath(fasta)
        full_pickle = real_upath(pickle)
        os.makedirs(dirname)
        os.chdir(dirname)
        os.symlink(full_fasta, os.path.basename(full_fasta))
        os.makedirs('output')
        os.chdir('output')
        os.symlink(full_pickle, 'final.pickle')
        os.symlink(full_fasta, 'final.consensus.fasta')
        os.chdir('../../')
        f = open(os.path.join(dirname, dirname + '.sh'), 'w')
        f.write("run_IcePartial2.py all {nfl} {p}.consensus.fasta {p}.nfl.pickle "
                "--root_dir {d} --aligner_choice=daligner --cpus={c}\n".format(
                    p=dirname, nfl=nfl_filename, d=real_upath(dirname), c=cpus))
        f.write("run_IceArrow2.py all {d} --subread_xml {s} --blasr_nproc {c} "
                "--arrow_nproc {c} --hq_min_full_length_reads=2\n".format(
                    d=real_upath(dirname), s=subread_xml, c=cpus))
        f.close()
        cmd_f.write("qsub -cwd -S /bin/bash -pe smp 12 -V {sh}\n".format(
            sh=real_upath(f.name)))

def quiver_cmds_for_bin(self, cids, quiver_nproc=2, bam=False):
    """
    Return a list of quiver related cmds. Input format can be FASTA or BAM.
    If inputs are in FASTA format, call samtoh5, loadPulses, comph5tools.py,
    samtools, loadChemistry, quiver...
    If inputs are in BAM format, call quiver directly.
    """
    first, last = cids[0], cids[-1]
    self.add_log("Creating quiver cmds for c{first} to c{last}".format(
        first=first, last=last))

    bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
    bin_sam_file = self.sam_of_quivered_bin(first, last)
    bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
    bin_fq = self.fq_of_quivered_bin(first, last)
    bin_unsorted_bam_file = self.bam_of_quivered_bin(first, last,
                                                     is_sorted=False)
    bin_bam_file = self.bam_of_quivered_bin(first, last, is_sorted=True)
    bin_bam_prefix = self._quivered_bin_prefix(first, last)

    quiver_input = bin_cmph5 if not bam else bin_bam_file

    cmds = []
    if not bam:
        raise IOError("conversion to cmp.h5 no longer supported")
        cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=real_upath(bin_sam_file),
            ref=real_upath(bin_ref_fa),
            cmph5=real_upath(bin_cmph5)))
        cmds.append("gzip {sam}".format(sam=real_upath(bin_sam_file)))
        metrics = ["QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
                   "DeletionTag", "SubstitutionTag", "SubstitutionQV"]
        cmds.append("loadPulses {bas_fofn} ".format(
                        bas_fofn=real_upath(self.bas_fofn)) +
                    "{cmph5} ".format(cmph5=real_upath(bin_cmph5)) +
                    "-byread -metrics " + ",".join(metrics))
        cmds.append("cmph5tools.py sort {cmph5}".format(
            cmph5=real_upath(bin_cmph5)))
        cmds.append("loadChemistry.py {bas_fofn} {cmph5}".format(
            bas_fofn=real_upath(self.bas_fofn),
            cmph5=real_upath(bin_cmph5)))
    else:
        cmds.append("samtools sort {f} {d}".format(
            f=real_upath(bin_unsorted_bam_file),
            d=real_upath(bin_bam_prefix)))
        cmds.append("samtools index {f}".format(f=real_upath(bin_bam_file)))
        cmds.append("samtools faidx {ref}".format(ref=real_upath(bin_ref_fa)))
        cmds.append("pbindex {f}".format(f=real_upath(bin_bam_file)))

    cmds.append("variantCaller --algorithm=best " +
                "{f} ".format(f=real_upath(quiver_input)) +
                "--verbose -j{n} ".format(n=quiver_nproc) +
                "--referenceFilename={ref} ".format(ref=real_upath(bin_ref_fa)) +
                "-o {fq}".format(fq=real_upath(bin_fq)))
    return cmds

def concat_sam(samfiles, outsam_filename):
    """
    Header looks like:
    @HD  VN:1.3.1
    @SQ  SN:c31  LN:3104  M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG  ID:2caa54eef6  PU:in.raw_with_partial.fasta  SM:NO_CHIP_ID
    @PG  ID:BLASR  VN:1.3.1.126469  CL:blasr in.raw_with_partial.fasta g_consensus.fasta -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    NOTE: check for M5 conflicts; manipulate them if it conflicts
    """
    f_sq = open(outsam_filename + '.sq', 'w')
    f_bd = open(outsam_filename + '.bd', 'w')

    rg_line = None
    pg_line = None

    md5_seen = set()

    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    h = open(samfiles[0])
    line = h.readline()
    assert line.startswith('@HD')
    f_sq.write(line)
    line = h.readline()
    assert line.startswith('@SQ')
    line = h.readline()
    assert line.startswith('@RG')
    rg_line = line  # write at the end
    line = h.readline()
    assert line.startswith('@PG')
    pg_line = line  # write at the end
    h.close()

    for f in samfiles:
        with open(f) as h:
            assert h.readline().startswith('@HD')
            line = h.readline()
            assert line.startswith('@SQ')
            # ------- check for MD5 conflicts ----------- #
            m5 = line.strip().split()[-1]
            assert m5.startswith("M5:")
            if m5 not in md5_seen:
                f_sq.write(line)
                md5_seen.add(m5)
            else:
                s = list(m5[3:])
                while True:  # create a random m5 string.
                    random.shuffle(s)
                    s = "".join(s)
                    if s not in md5_seen:
                        break
                line = line[:line.find('M5:')] + 'M5:' + s + '\n'
                logging.debug("MD5 conflict: change to {0}".format(s))
                md5_seen.add(s)
                f_sq.write(line)
            # ----- end MD5 checking and writing --------- #
            assert h.readline().startswith('@RG')
            assert h.readline().startswith('@PG')
            for line in h:
                f_bd.write(line)
    f_bd.close()
    f_sq.write(rg_line)
    f_sq.write(pg_line)
    f_sq.close()

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    execute(cmd=cmd,
            errmsg="Failed to concat sam files! Abort.",
            errcls=IOError)

    os.remove(f_sq.name)
    os.remove(f_bd.name)

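# Usage sketch (illustrative file names): merge per-chunk BLASR SAM outputs
# into one SAM, de-duplicating @SQ header lines by their M5 checksum.
#
#   concat_sam(samfiles=["chunk0.sam", "chunk1.sam", "chunk2.sam"],
#              outsam_filename="combined.sam")
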
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename, ice_opts, probqv,
                                qv_prob_threshold=0.3, cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None, sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    logging.info("Calling blasr_against_ref ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")

    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

def createPickles(self):
    """For each file in fasta_filenames, call 'ICE_PARTIAL_PY one' to
    build clusters and to save results to a pickle file. When all pickles
    are done, union all pickles.
    """
    self.add_log("Mapping non-full-length reads to consensus isoforms.")
    self.add_log("Creating pickles...", level=logging.INFO)

    for idx, fa in enumerate(self.fasta_filenames):
        # for each splitted non-full-length reads fasta file, build
        # partial_uc.pickle
        # ex:
        # python run_IcePartial2.py one isoseq_nfl.fasta isoseq_nfl.fastq \
        #     output/final.consensus.fasta isoseq_nfl.fasta.pickle \
        #     --aligner_choice=blasr --cpus=12
        if self.fastq_filenames is not None:
            fq = self.fastq_filenames[idx]
        else:
            fq = None
        cmd = ICE_PARTIAL_PY + " "
        cmd += "one {fa} ".format(fa=real_upath(fa))
        if fq is not None:
            cmd += "--fq {fq} ".format(fq=real_upath(fq))
        cmd += "{r} ".format(r=real_upath(self.ref_fasta)) + \
               "{o} ".format(o=real_upath(self.pickle_filenames[idx])) + \
               "--aligner_choice={c} ".format(c=self.ice_opts.aligner_choice) + \
               "--cpus={n} ".format(n=self.cpus) + \
               "--max_missed_start={0} ".format(self.ice_opts.max_missed_start) + \
               "--max_missed_end={0} ".format(self.ice_opts.max_missed_end) + \
               "--ece_penalty={0} ".format(self.ice_opts.ece_penalty) + \
               "--ece_min_len={0} ".format(self.ice_opts.ece_min_len) + \
               "--done={d} ".format(d=real_upath(self.done_filenames[idx]))
        if self.tmp_dir is not None:
            cmd += "--tmp_dir={t}".format(t=self.tmp_dir)

        self.add_log("Writing command to script {fsh}".format(
            fsh=self.script_filenames[idx]))
        with open(self.script_filenames[idx], 'w') as fsh:
            fsh.write(cmd + "\n")

        # determine elog & olog
        partial_log_fn = op.join(self.log_dir,
                                 'IcePartial.{idx}'.format(idx=idx))
        elog = partial_log_fn + ".elog"
        olog = partial_log_fn + ".olog"
        jid = "ice_partial_{unique_id}_{name}".format(
            unique_id=self.sge_opts.unique_id,
            name=op.basename(fa))

        self.add_log("Creating a pickle for {f}".format(f=fa))

        if self.sge_opts.use_sge is True:
            qsub_cmd = self.sge_opts.qsub_cmd(
                script=real_upath(self.script_filenames[idx]),
                num_threads=self.cpus,
                wait_before_exit=False,
                depend_on_jobs=None,
                elog=real_upath(elog),
                olog=real_upath(olog),
                is_script=True,
                jobid=jid)
            # qsub_cmd = "qsub " + \
            #            "-pe smp {n} ".format(n=self.sge_opts.blasr_nproc) + \
            #            "-cwd -S /bin/bash -V " + \
            #            "-e {elog} ".format(elog=real_upath(elog)) + \
            #            "-o {olog} ".format(olog=real_upath(olog)) + \
            #            "-N {jid} ".format(jid=jid) + \
            #            "{sh}".format(sh=real_upath(self.script_filenames[idx]))
            self.qsub_cmd_and_log(qsub_cmd)
        else:
            cmd += " 1>{olog} 2>{elog}".format(olog=real_upath(olog),
                                               elog=real_upath(elog))
            self.run_cmd_and_log(cmd=cmd, olog=olog, elog=elog)

# "output/final.consensus.fasta nfl.pickle " + \ # "--root_dir . --aligner_choice=blasr --cpus={c}\n".format(c=cpus)) # cmd_f.write("run_IceArrow2.py all --subread_xml {s} ".format(s=subread_xml) + \ # "--blasr_nproc {c} --arrow_nproc {c} .\n".format(c=cpus)) cmd_f.close() if __name__ == "__main__": from argparse import ArgumentParser parser = ArgumentParser("Generate batch commands for running IceInit2->IceIterative2 for each preCluster output bin") parser.add_argument("precluster_csv", help="Cluster CSV file (ex: preCluster.cluster_info.csv)") parser.add_argument("precluster_dir", help="preCluster out directory (ex: preCluster_out/)") #parser.add_argument("nfl_filename", help="nFL filename (ex: isoseq_nfl.fasta)") #parser.add_argument("tucked_filename", help="tucked filename (ex: preCluster_out.tucked.fasta)") #parser.add_argument("subread_xml", help="Subread XML") parser.add_argument("--cpus", default=20, type=int, help="Number of CPUs (default: 20)") parser.add_argument("--cmd_filename", default='cmds', help="Output command filename (default: cmds)") args = parser.parse_args() generate_batch_cmds(args.precluster_csv, real_upath(args.precluster_dir), args.cmd_filename, #real_upath(args.nfl_filename), #real_upath(args.tucked_filename), #real_upath(args.subread_xml), args.cpus)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle,
                          ccs_fofn=None, done_filename=None,
                          blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")

    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

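# Usage sketch (illustrative paths; assumes the helpers above are importable
# and blasr is on PATH): assign non-full-length reads to unpolished consensus
# isoforms and write the resulting cluster map to a pickle.
#
#   build_uc_from_partial(input_fasta="isoseq_nfl.fasta",
#                         ref_fasta="output/final.consensus.fasta",
#                         out_pickle="nfl.partial_uc.pickle",
#                         ccs_fofn=None,
#                         blasr_nproc=8)
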