def ice_quiver_postprocess_a_cluster_bin(cluster_out_dir, ipq_opts):
    """
    ice_quiver_postprocess a cluster bin, create summary.json,
    cluster_report.csv, hq|lq_isoforms.fa|fq.

    Before post-processing, every "*.sh" file found in
    <cluster_out_dir>/quivered is recorded in
    <cluster_out_dir>/log/submitted_quiver_jobs.txt as a tab-separated
    "local<TAB>script" line.

    Parameters:
      cluster_out_dir - root dir running ICE, ice_partial and ice_quiver
                        for this cluster bin.
      ipq_opts - options object handed unchanged to IceQuiverPostprocess.

    Returns:
      (hq.fq, lq.fq), i.e. the post-processor's quivered_good_fq and
      quivered_bad_fq attributes.
    """
    quivered_dir = op.join(cluster_out_dir, "quivered")
    _jobs_log = op.join(cluster_out_dir, "log", "submitted_quiver_jobs.txt")

    # Scan the directory once; the list is only needed to write the jobs log.
    shell_scripts = [file_name for file_name in os.listdir(quivered_dir)
                     if file_name.endswith(".sh")]

    with open(_jobs_log, 'w') as f:
        f.write("\n".join("\t".join(["local", s]) for s in shell_scripts))

    icep = IceQuiverPostprocess(root_dir=cluster_out_dir, ipq_opts=ipq_opts)
    icep.run()
    return (icep.quivered_good_fq, icep.quivered_bad_fq)
def run(self):
    """Validate and run IceQuiver, then run IceQuiverPostprocess.

    Both stages operate on self.root_dir. Always returns 0.
    """
    quiver_kwargs = dict(root_dir=self.root_dir,
                         bas_fofn=self.bas_fofn,
                         fasta_fofn=self.fasta_fofn,
                         sge_opts=self.sge_opts,
                         tmp_dir=self.tmp_dir)
    quiver_stage = IceQuiver(**quiver_kwargs)
    quiver_stage.validate_inputs()
    quiver_stage.run()

    # quit_if_not_done=False is passed explicitly, as in every other
    # IceQuiverPostprocess call that follows an IceQuiver run.
    postprocess_kwargs = dict(root_dir=self.root_dir,
                              use_sge=self.sge_opts.use_sge,
                              quit_if_not_done=False,
                              ipq_opts=self.ipq_opts)
    postprocess_stage = IceQuiverPostprocess(**postprocess_kwargs)
    postprocess_stage.run()

    return 0
class Polish(IceFiles):

    """Polish isoforms clusters using Quiver."""

    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, ipq_opts, fasta_fofn=None, tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.
        ice_opts --- ICE options; ece_penalty, ece_min_len and
                     nfl_reads_per_split are read from it here.
        sge_opts --- SGE options; passed on to IceAllPartials and IceQuiver,
                     and use_sge is read for IceQuiverPostprocess.
        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        fasta_fofn --- optional fofn of fasta files; when None and the input
                       is not BAM, run() creates one under self.nfl_dir.
        tmp_dir --- optional temporary dir, forwarded to IceFiles/IceQuiver.

        Raises ValueError (via validate_inputs) when inputs are missing.
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)
        self.nfl_fa = realpath(nfl_fa)
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()

    def validate_inputs(self):
        """
        Validate input directory root_dir and input files nfl_fa and
        bas_fofn (ccs_fofn is not checked here).

        All problems found are logged at ERROR level and reported together
        in a single ValueError.
        """
        self.add_log("Validating inputs.")

        errors = []
        if not op.exists(self.root_dir):
            errors.append("Root dir {d} is not an existing directory!".
                          format(d=self.root_dir))
        if not op.exists(self.nfl_fa):
            errors.append("Failed to find non-full-length reads {f}!".
                          format(f=self.nfl_fa))
        if self.bas_fofn is None:
            errors.append("bas_fofn (subreadset) must be specified.")
        elif not op.exists(self.bas_fofn):
            # Only probe the path when one was given: op.exists(None)
            # raises TypeError and would mask the message above.
            errors.append("Failed to find bas fofn (subreadset) {f}!".
                          format(f=self.bas_fofn))

        if errors:
            errMsg = " ".join(errors)
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samtoh5, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # Create input.fasta.fofn from bas_fofn
            self.add_log("Creating fasta fofn from bas/bax.h5/bam fofn",
                         level=logging.INFO)
            if self.fasta_fofn is None:
                self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")

            self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir)
        else:
            # BAM input: no fasta fofn is generated and any fasta_fofn
            # supplied by the caller is discarded.
            self.fasta_fofn = None

        # Split non-full-length reads into smaller fasta files, written
        # to self.nfl_dir with the "input.split" prefix.
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(input_filename=self.final_consensus_fa,
                                converted=False)
        ref_obj.make_db()
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)
        self.icep = IceAllPartials(root_dir=self.root_dir,
                                   fasta_filenames=self._nfl_splitted_fas,
                                   ref_fasta=self.final_consensus_fa,
                                   out_pickle=self.nfl_all_pickle_fn,
                                   sge_opts=self.sge_opts,
                                   ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        # Polish the clusters with Quiver.
        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts,
                              tmp_dir=self.tmp_dir)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        # Post-process the Quiver output.
        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log(
            "IceQuiverPostprocess log: {f}.".format(f=self.icepq.log_fn),
            level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)
class Polish(IceFiles):

    """Polish isoforms clusters using Quiver."""

    def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
                 ice_opts, sge_opts, ipq_opts, fasta_fofn=None, tmp_dir=None):
        """
        root_dir --- IceFiles.root_dir, usually data/clusterOutDir
        nfl_fa    --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
        bas_fofn --- e.g. input.fofn of bas|bax.h5 files
        ccs_fofn --- e.g. ccs.fofn of ccs files.
        ice_opts --- ICE options; ece_penalty, ece_min_len and
                     nfl_reads_per_split are read from it here.
        sge_opts --- SGE options; passed on to IceAllPartials and IceQuiver,
                     and use_sge is read for IceQuiverPostprocess.
        ipq_opts --- IceQuiverHQLQOptions
                     qv_trim_5: ignore QV of n bases in the 5' end
                     qv_trim_3: ignore QV of n bases in the 3' end
                     hq_quiver_min_accuracy: minimum allowed quiver accuracy
                                      to mark an isoform as high quality
                     hq_isoforms_fa|fq: polished, high quality consensus
                                        isoforms in fasta|q
                     lq_isoforms_fa|fq: polished, low quality consensus
                                        isoforms in fasta|q
        fasta_fofn --- optional fofn of fasta files; when None and the input
                       is not BAM, run() creates one under self.nfl_dir.
        tmp_dir --- optional temporary dir, forwarded to IceFiles/IceQuiver.

        Raises ValueError (via validate_inputs) when inputs are missing.
        """
        IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                          bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                          fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)
        self.nfl_fa = realpath(nfl_fa)
        self.ice_opts = ice_opts
        self.sge_opts = sge_opts
        self.ipq_opts = ipq_opts

        self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
            self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

        self.icep = None   # IceAllPartials.
        self.iceq = None   # IceQuiver
        self.icepq = None  # IceQuiverPostprocess
        self._nfl_splitted_fas = None

        self.validate_inputs()

    def validate_inputs(self):
        """
        Validate input directory root_dir and input files nfl_fa and
        bas_fofn (ccs_fofn is not checked here).

        All problems found are logged at ERROR level and reported together
        in a single ValueError.
        """
        self.add_log("Validating inputs.")

        problems = []
        if not op.exists(self.root_dir):
            problems.append("Root dir {d} is not an existing directory!".
                            format(d=self.root_dir))
        if not op.exists(self.nfl_fa):
            problems.append("Failed to find non-full-length reads {f}!".
                            format(f=self.nfl_fa))
        if self.bas_fofn is None:
            problems.append("bas_fofn (subreadset) must be specified.")
        elif not op.exists(self.bas_fofn):
            # Guarded by the None check: op.exists(None) raises TypeError,
            # which would hide the "must be specified" message.
            problems.append("Failed to find bas fofn (subreadset) {f}!".
                            format(f=self.bas_fofn))

        if problems:
            errMsg = " ".join(problems)
            self.add_log(errMsg, level=logging.ERROR)
            raise ValueError(errMsg)

    def run(self):
        """
        First, split non-full-length (nfl) fasta files into smaller
        chunks, assign nfl reads in each splitted fasta file
        into unpolished isoform clusters and then merge all pickles
        into self.nfl_all_pickle_fn.
        Second, bin every 100 clusters, for each bin, call blasr,
        samtoh5, loadPulses, cmph5tools to create cmp.h5 files and
        call quiver to polish each isoforms within each bin.
        Finally, pick up good isoform clusters whose QV errors is less
        than a threshold.
        Save all high quality isoforms to hq_isoforms_fa|fq if they are not None
        Save all low quality isoforms to lq_isoforms_fa|fq if they are not None
        """
        if guess_file_format(self.bas_fofn) != FILE_FORMATS.BAM:
            # Create input.fasta.fofn from bas_fofn
            self.add_log("Creating fasta fofn from bas/bax.h5/bam fofn",
                         level=logging.INFO)
            if self.fasta_fofn is None:
                self.fasta_fofn = op.join(self.nfl_dir, "input.fasta.fofn")

            self.add_log("fasta fofn={f}".format(f=self.fasta_fofn))
            convert_fofn_to_fasta(fofn_filename=self.bas_fofn,
                                  out_filename=self.fasta_fofn,
                                  fasta_out_dir=self.nfl_dir)
        else:
            # BAM input: no fasta fofn is generated and any fasta_fofn
            # supplied by the caller is discarded.
            self.fasta_fofn = None

        # Split non-full-length reads into smaller fasta files, written
        # to self.nfl_dir with the "input.split" prefix.
        self.add_log("Splitting {nfl} into ".format(nfl=self.nfl_fa) +
                     "smaller files each containing {n} reads.".format(
                         n=self.ice_opts.nfl_reads_per_split),
                     level=logging.INFO)
        self._nfl_splitted_fas = splitFasta(
            input_fasta=self.nfl_fa,
            reads_per_split=self.ice_opts.nfl_reads_per_split,
            out_dir=self.nfl_dir,
            out_prefix="input.split")
        msg = "Splitted files are: " + "\n".join(self._nfl_splitted_fas)
        self.add_log(msg, level=logging.INFO)

        # Generating dazz DB for final.consensus.fasta
        ref_obj = DazzIDHandler(input_filename=self.final_consensus_fa,
                                converted=False)
        ref_obj.make_db()
        msg = "Dazz DB made for: " + ref_obj.dazz_filename
        self.add_log(msg, level=logging.INFO)

        # Process nfl reads in each splitted fasta.
        self.add_log("Initializing IceAllPartials.", level=logging.INFO)
        self.icep = IceAllPartials(
            root_dir=self.root_dir,
            fasta_filenames=self._nfl_splitted_fas,
            ref_fasta=self.final_consensus_fa,
            out_pickle=self.nfl_all_pickle_fn,
            sge_opts=self.sge_opts,
            ccs_fofn=self.ccs_fofn)
        self.add_log("IceAllPartials log: {f}.".format(f=self.icep.log_fn),
                     level=logging.INFO)
        self.icep.run()
        self.add_log("IceAllPartials completed.", level=logging.INFO)

        # Polish the clusters with Quiver.
        self.add_log("Initializing IceQuiver.", level=logging.INFO)
        self.iceq = IceQuiver(root_dir=self.root_dir,
                              bas_fofn=self.bas_fofn,
                              fasta_fofn=self.fasta_fofn,
                              sge_opts=self.sge_opts,
                              tmp_dir=self.tmp_dir)
        self.add_log("IceQuiver log: {f}.".format(f=self.iceq.log_fn),
                     level=logging.INFO)
        self.iceq.run()
        self.add_log("IceQuiver finished.", level=logging.INFO)

        # Post-process the Quiver output.
        self.add_log("Initializing IceQuiverPostprocess.", level=logging.INFO)
        self.icepq = IceQuiverPostprocess(root_dir=self.root_dir,
                                          use_sge=self.sge_opts.use_sge,
                                          quit_if_not_done=False,
                                          ipq_opts=self.ipq_opts)
        self.add_log("IceQuiverPostprocess log: {f}.".
                     format(f=self.icepq.log_fn), level=logging.INFO)
        self.icepq.run()
        self.add_log("IceQuiverPostprocess finished.", level=logging.INFO)