def run(self):
    """Merge per-chunk submitted-quiver-job logs into a single log.

    Builds an IceQuiver helper rooted at ``self.root_dir``, verifies that
    the submitted-jobs log of every one of the ``self.N`` chunks is
    visible on NFS, then concatenates them into the combined
    ``submitted_quiver_jobs_log``.

    Raises:
        IOError: if any chunk's submitted-jobs log does not exist.
    """
    quiver = IceQuiver(root_dir=self.root_dir, bas_fofn=None,
                       fasta_fofn=None, sge_opts=None,
                       prog_name="ice_quiver_merge")
    quiver.add_log(self.cmd_str())
    quiver.add_log("root_dir: {d}.".format(d=self.root_dir))
    quiver.add_log("Total number of chunks: N = {N}.".format(N=self.N))

    # One submitted-jobs log per chunk, in chunk order.
    chunk_logs = []
    for index in range(self.N):
        chunk_logs.append(
            quiver.submitted_quiver_jobs_log_of_chunk_i(
                i=index, num_chunks=self.N))

    # Fail on the first chunk whose log is missing, preserving order.
    for log_fn in chunk_logs:
        if not nfs_exists(log_fn):
            raise IOError("Log {f} ".format(f=log_fn) +
                          "of submitted quiver jobs does not exist.")

    merged_log = quiver.submitted_quiver_jobs_log
    quiver.add_log(
        "Collecting submitted quiver jobs from:\n{src}\nto {dst}.".format(
            src="\n".join(chunk_logs), dst=merged_log))
    cat_files(src=chunk_logs, dst=merged_log)
    quiver.close_log()
def run(self):
    """Collect submitted quiver job logs from all N chunks into one file.

    Instantiates an IceQuiver rooted at ``self.root_dir``, checks every
    per-chunk log exists on NFS (raising IOError otherwise), and
    concatenates them into ``submitted_quiver_jobs_log``.
    """
    helper = IceQuiver(root_dir=self.root_dir, bas_fofn=None,
                       fasta_fofn=None, sge_opts=None,
                       prog_name="ice_quiver_merge")
    helper.add_log(self.cmd_str())
    helper.add_log("root_dir: {d}.".format(d=self.root_dir))
    helper.add_log("Total number of chunks: N = {N}.".format(N=self.N))

    job_logs = [helper.submitted_quiver_jobs_log_of_chunk_i(
        i=chunk, num_chunks=self.N) for chunk in range(0, self.N)]

    # Fail fast on the first chunk log that is not visible on NFS.
    for candidate in job_logs:
        if nfs_exists(candidate):
            continue
        raise IOError(
            "Log {f} of submitted quiver jobs does not exist.".format(
                f=candidate))

    target = helper.submitted_quiver_jobs_log
    helper.add_log(
        "Collecting submitted quiver jobs from:\n{src}\nto {dst}.".format(
            src="\n".join(job_logs), dst=target))
    cat_files(src=job_logs, dst=target)
    helper.close_log()
def test_cat_files(self):
    """cat_files should concatenate one or more sources into dst."""
    primers_fa = op.join(self.data_dir, "primers.fa")
    phmmer_fa = op.join(self.data_dir, "test_phmmer.fa")
    single_out = op.join(self.out_dir, "test_cat_1")
    double_out = op.join(self.out_dir, "test_cat_2")
    expected_double = op.join(self.stdout_dir, "test_cat_2")

    # A single source must be reproduced verbatim.
    cat_files(src=[primers_fa], dst=single_out)
    # Two sources must be concatenated in the given order.
    cat_files(src=[primers_fa, phmmer_fa], dst=double_out)

    self.assertTrue(filecmp.cmp(single_out, primers_fa))
    self.assertTrue(filecmp.cmp(double_out, expected_double))
def runChimeraDetector(self):
    """Call chimera detection on full-length reads, and non-full-length
    reads if required.

    Splits trimmed full-length (fl) reads into non-chimeric (flnc) and
    chimeric (flc) sets via self._detect_chimera, updating the counters
    on self.summary.  When chimera_detection_opts.detect_chimera_nfl is
    True, the same split is applied to trimmed non-full-length (nfl)
    reads; otherwise the trimmed nfl reads are soft-linked unchanged.
    Finally writes the combined reads file (out_all_reads_fn) and the
    full primer report (primer_report_fn), then removes the per-class
    primer report intermediates.
    """
    # Create forward/reverse primers for chimera detection.
    self._processPrimers(
        primer_fn=self.primer_fn,
        window_size=self.chimera_detection_opts.primer_search_window,
        primer_out_fn=self.primer_chimera_fn,
        revcmp_primers=True)

    # Detect chimeras among full-length reads, separate flnc reads and
    # flc reads.
    logging.info("Detect chimeric reads from trimmed full-length reads.")
    (self.summary.num_flnc, self.summary.num_flc,
     self.summary.num_flnc_bases, _x) = \
        self._detect_chimera(in_fasta=self._trimmed_fl_reads_fn,
                             out_nc_fasta=self.out_flnc_fn,
                             out_c_fasta=self.out_flc_fn,
                             primer_report_fn=self._primer_report_fl_fn,
                             out_dom=self.out_trimmed_fl_dom_fn,
                             num_reads=self.summary.num_fl,
                             job_name="fl")
    # Every full-length read must be classified as either flnc or flc.
    assert(self.summary.num_fl == self.summary.num_flnc +
           self.summary.num_flc)
    logging.info("Done with chimera detection on trimmed full-length " +
                 "reads.")

    # Detect chimeras among non-full-length reads if required, separate
    # nflnc reads and nflc reads, rewrite self.primer_report_nfl_fn.
    if self.chimera_detection_opts.detect_chimera_nfl is True:
        logging.info("Detect chimeric reads from trimmed non-full-length " +
                     "reads.")
        (self.summary.num_nflnc, self.summary.num_nflc, _x, _y) = \
            self._detect_chimera(in_fasta=self._trimmed_nfl_reads_fn,
                                 out_nc_fasta=self.out_nflnc_fn,
                                 out_c_fasta=self.out_nflc_fn,
                                 primer_report_fn=self._primer_report_nfl_fn,
                                 out_dom=self.out_trimmed_nfl_dom_fn,
                                 num_reads=self.summary.num_nfl,
                                 job_name="nfl")
        # Every nfl read must be classified as either nflnc or nflc.
        assert(self.summary.num_nfl == self.summary.num_nflnc +
               self.summary.num_nflc)
        logging.info("Done with chimera detection on trimmed " +
                     "non-full-length reads.")
        # Concatenate out_nflnc_fn and out_nflc_fn as out_nfl_fn
        cat_files(src=[self.out_nflnc_fn, self.out_nflc_fn],
                  dst=self.out_nfl_fn)
        # Concatenate out_flnc and out_nflnc to make out_all_reads_fn
        cat_files(src=[self.out_flnc_fn, self.out_nflnc_fn],
                  dst=self.out_all_reads_fn)
    else:
        # Soft link _trimmed_nfl_reads_fn as out_nfl_fn
        ln(self._trimmed_nfl_reads_fn, self.out_nfl_fn)
        # Concatenate out_flnc and out_nfl to make out_all_reads_fn
        cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
                  dst=self.out_all_reads_fn)

    # primer info of fl/nfl reads reported to _primer_report_fl_fn
    # and _primer_report_nfl_fn, concatenate them in order to make
    # a full report: primer_report_fn.
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)
    # Delete intermediate files.
    self._cleanup([self._primer_report_nfl_fn, self._primer_report_fl_fn])
def runChimeraDetector(self):
    """Call chimera detection on full-length reads, and non-full-length
    reads if required.

    Variant of the pipeline that takes separate forward/reverse primer
    files (primer_fn_forward / primer_fn_reverse).  Full-length (fl)
    reads are split into non-chimeric (flnc) and chimeric (flc) sets and
    the counts recorded on self.summary.  If
    chimera_detection_opts.detect_chimera_nfl is True, non-full-length
    (nfl) reads get the same treatment; otherwise they are soft-linked
    as-is.  Combined read and primer-report outputs are written last and
    the per-class report intermediates deleted.
    """
    # Create forward/reverse primers for chimera detection.
    self._processPrimers(
        primer_fn_forward=self.primer_fn_forward,
        primer_fn_reverse=self.primer_fn_reverse,
        window_size=self.chimera_detection_opts.primer_search_window,
        primer_out_fn=self.primer_chimera_fn,
        revcmp_primers=True)

    # Detect chimeras among full-length reads, separate flnc reads and
    # flc reads.
    logging.info("Detect chimeric reads from trimmed full-length reads.")
    (self.summary.num_flnc, self.summary.num_flc,
     self.summary.num_flnc_bases, _x) = \
        self._detect_chimera(in_fasta=self._trimmed_fl_reads_fn,
                             out_nc_fasta=self.out_flnc_fn,
                             out_c_fasta=self.out_flc_fn,
                             primer_report_fn=self._primer_report_fl_fn,
                             out_dom=self.out_trimmed_fl_dom_fn,
                             num_reads=self.summary.num_fl,
                             job_name="fl")
    # Sanity check: flnc + flc must account for every fl read.
    assert(self.summary.num_fl == self.summary.num_flnc +
           self.summary.num_flc)
    logging.info("Done with chimera detection on trimmed full-length " +
                 "reads.")

    # Detect chimeras among non-full-length reads if required, separate
    # nflnc reads and nflc reads, rewrite self.primer_report_nfl_fn.
    if self.chimera_detection_opts.detect_chimera_nfl is True:
        logging.info("Detect chimeric reads from trimmed non-full-length " +
                     "reads.")
        (self.summary.num_nflnc, self.summary.num_nflc, _x, _y) = \
            self._detect_chimera(in_fasta=self._trimmed_nfl_reads_fn,
                                 out_nc_fasta=self.out_nflnc_fn,
                                 out_c_fasta=self.out_nflc_fn,
                                 primer_report_fn=self._primer_report_nfl_fn,
                                 out_dom=self.out_trimmed_nfl_dom_fn,
                                 num_reads=self.summary.num_nfl,
                                 job_name="nfl")
        # Sanity check: nflnc + nflc must account for every nfl read.
        assert(self.summary.num_nfl == self.summary.num_nflnc +
               self.summary.num_nflc)
        logging.info("Done with chimera detection on trimmed " +
                     "non-full-length reads.")
        # Concatenate out_nflnc_fn and out_nflc_fn as out_nfl_fn
        cat_files(src=[self.out_nflnc_fn, self.out_nflc_fn],
                  dst=self.out_nfl_fn)
        # Concatenate out_flnc and out_nflnc to make out_all_reads_fn
        cat_files(src=[self.out_flnc_fn, self.out_nflnc_fn],
                  dst=self.out_all_reads_fn)
    else:
        # Soft link _trimmed_nfl_reads_fn as out_nfl_fn
        ln(self._trimmed_nfl_reads_fn, self.out_nfl_fn)
        # Concatenate out_flnc and out_nfl to make out_all_reads_fn
        cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
                  dst=self.out_all_reads_fn)

    # primer info of fl/nfl reads reported to _primer_report_fl_fn
    # and _primer_report_nfl_fn, concatenate them in order to make
    # a full report: primer_report_fn.
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)
    # Delete intermediate files.
    self._cleanup([self._primer_report_nfl_fn, self._primer_report_fl_fn])
def runChimeraDetector(self):
    """Detect chimeras from trimmed reads.

    If self.out_trimmed_reads_dom_fn already exists, it is reused and
    only parsed; otherwise the trimmed full-length reads are chunked,
    phmmer is started on each chunk, and the merged dom output is
    written to out_trimmed_reads_dom_fn.  The dom output is then parsed
    to split full-length reads into non-chimeric (out_flnc_fn) and
    chimeric (out_flc_fn) sets, and the combined reads file and full
    primer report are concatenated.

    Fixes vs. previous version: dropped the unused ``_primer_indices``
    assignment, removed commented-out debug code, and corrected the
    misspelled local ``suspicous_hits`` (the callee's keyword argument
    retains the original spelling, so its interface is unchanged).
    """
    logging.info("Start to detect chimeras from trimmed reads.")
    need_cleanup = True
    if os.path.exists(self.out_trimmed_reads_dom_fn):
        # Reuse existing phmmer output; no chunk files are created in
        # this run, so skip the cleanup step at the end.
        logging.info("Output already exists. Parsing {0}.".format(
            self.out_trimmed_reads_dom_fn))
        need_cleanup = False
    else:
        # Create forward/reverse primers for chimera detection.
        # Called for its side effect; the returned indices were unused.
        self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_chimera_fn,
            revcmp_primers=True)

        # Only detect chimeras on full-length reads in order to save time.
        # NOTE(review): if self.summary.num_fl == 0, reads_per_chunk
        # becomes 0 and the following division raises ZeroDivisionError —
        # confirm callers guarantee at least one full-length read.
        num_chunks = max(min(self.summary.num_fl, self.cpus), 1)
        reads_per_chunk = int(
            math.ceil(self.summary.num_fl / float(num_chunks)))
        num_chunks = int(
            math.ceil(self.summary.num_fl / float(reads_per_chunk)))

        self.chunked_trimmed_reads_fns = generateChunkedFN(
            self.out_dir, "in.trimmed.fa_split", num_chunks)
        self.chunked_trimmed_reads_dom_fns = generateChunkedFN(
            self.out_dir, "out.trimmed.hmmer_split", num_chunks)
        self._chunkReads(reads_fn=self._trimmed_fl_reads_fn,
                         reads_per_chunk=reads_per_chunk,
                         chunked_reads_fns=self.chunked_trimmed_reads_fns,
                         extract_front_back_only=False)
        self._startPhmmers(self.chunked_trimmed_reads_fns,
                           self.chunked_trimmed_reads_dom_fns,
                           self.out_trimmed_reads_dom_fn,
                           self.primer_chimera_fn,
                           self.pbmatrix_fn)

    suspicious_hits = self._getChimeraRecord(self.out_trimmed_reads_dom_fn,
                                             self.chimera_detection_opts)

    # Only detect chimeras on full-length reads in order to save time.
    # Note: the callee's parameter is named `suspicous_hits` (sic).
    self._updateChimeraInfo(suspicous_hits=suspicious_hits,
                            in_read_fn=self._trimmed_fl_reads_fn,
                            out_flnc_fn=self.out_flnc_fn,
                            out_flc_fn=self.out_flc_fn,
                            primer_report_fl_fn=self._primer_report_fl_fn)

    # full-length non-chimeric reads written to out_flnc.fa
    # non-full-length reads written to out_nfl.fa
    # primer info of fl reads reported to _primer_report_fl_fn
    # primer info of nfl reads reported to _primer_report_nfl_fn
    # Need to: (1) concatenate out_flnc and out_nfl to make
    #              out_all_reads_fn
    #          (2) concatenate _primer_report_fl_fn and
    #              _primer_report_nfl_fn to make primer_report_fn
    cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
              dst=self.out_all_reads_fn)
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)

    if need_cleanup:
        self._cleanup(self.chunked_trimmed_reads_fns +
                      self.chunked_trimmed_reads_dom_fns)
    logging.info("Done with chimera detection.")
def runChimeraDetector(self):
    """Detect chimeras from trimmed reads.

    Reuses an existing out_trimmed_reads_dom_fn when present; otherwise
    chunks the trimmed full-length reads, runs phmmer per chunk, and
    merges the dom output.  The dom output is parsed into suspicious
    hits, which split the full-length reads into out_flnc_fn and
    out_flc_fn; the combined reads file and full primer report are then
    concatenated.  Chunk intermediates are cleaned up only when they
    were created by this run.

    Fixes vs. previous version: removed the unused ``_primer_indices``
    assignment and dead commented-out debug lines; renamed the
    misspelled local ``suspicous_hits`` (the keyword passed to
    ``_updateChimeraInfo`` keeps the callee's original spelling).
    """
    logging.info("Start to detect chimeras from trimmed reads.")
    need_cleanup = True
    if os.path.exists(self.out_trimmed_reads_dom_fn):
        # Existing phmmer output: parse it, and do not clean up chunk
        # files that this run never created.
        logging.info("Output already exists. Parsing {0}.".format(
            self.out_trimmed_reads_dom_fn))
        need_cleanup = False
    else:
        # Create forward/reverse primers for chimera detection
        # (side effect only — the returned indices were never used).
        self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_chimera_fn,
            revcmp_primers=True)

        # Only detect chimeras on full-length reads in order to save time.
        # NOTE(review): when self.summary.num_fl == 0 the second division
        # below raises ZeroDivisionError (reads_per_chunk == 0) — confirm
        # upstream guarantees num_fl >= 1 before relying on this path.
        num_chunks = max(min(self.summary.num_fl, self.cpus), 1)
        reads_per_chunk = int(
            math.ceil(self.summary.num_fl / float(num_chunks)))
        num_chunks = int(
            math.ceil(self.summary.num_fl / float(reads_per_chunk)))

        self.chunked_trimmed_reads_fns = generateChunkedFN(
            self.out_dir, "in.trimmed.fa_split", num_chunks)
        self.chunked_trimmed_reads_dom_fns = generateChunkedFN(
            self.out_dir, "out.trimmed.hmmer_split", num_chunks)
        self._chunkReads(reads_fn=self._trimmed_fl_reads_fn,
                         reads_per_chunk=reads_per_chunk,
                         chunked_reads_fns=self.chunked_trimmed_reads_fns,
                         extract_front_back_only=False)
        self._startPhmmers(self.chunked_trimmed_reads_fns,
                           self.chunked_trimmed_reads_dom_fns,
                           self.out_trimmed_reads_dom_fn,
                           self.primer_chimera_fn,
                           self.pbmatrix_fn)

    suspicious_hits = self._getChimeraRecord(self.out_trimmed_reads_dom_fn,
                                             self.chimera_detection_opts)

    # Only detect chimeras on full-length reads in order to save time.
    # The callee's parameter name retains the historical typo.
    self._updateChimeraInfo(suspicous_hits=suspicious_hits,
                            in_read_fn=self._trimmed_fl_reads_fn,
                            out_flnc_fn=self.out_flnc_fn,
                            out_flc_fn=self.out_flc_fn,
                            primer_report_fl_fn=self._primer_report_fl_fn)

    # full-length non-chimeric reads written to out_flnc.fa
    # non-full-length reads written to out_nfl.fa
    # primer info of fl reads reported to _primer_report_fl_fn
    # primer info of nfl reads reported to _primer_report_nfl_fn
    # Need to: (1) concatenate out_flnc and out_nfl to make
    #              out_all_reads_fn
    #          (2) concatenate _primer_report_fl_fn and
    #              _primer_report_nfl_fn to make primer_report_fn
    cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
              dst=self.out_all_reads_fn)
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)

    if need_cleanup:
        self._cleanup(self.chunked_trimmed_reads_fns +
                      self.chunked_trimmed_reads_dom_fns)
    logging.info("Done with chimera detection.")