def _validate_inputs(self, root_dir, N):
    """
    Check inputs, return (splitted_pickles, out_pickle).

    Parameters:
      root_dir --- ICE root output directory.
      N --- number of nfl chunks whose partial_uc pickles are merged.

    Raises ValueError listing *all* missing DONE/pickle files.
    (The previous implementation overwrote errMsg inside both loops,
    so only the last missing file was ever reported.)
    """
    icef = IceFiles(prog_name="ice_partial_merge",
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta.partial_uc.pickle
    splitted_pickles = [icef.nfl_pickle_i(i) for i in range(0, N)]
    dones = [icef.nfl_done_i(i) for i in range(0, N)]

    # Check if inputs exist; accumulate every problem instead of
    # silently keeping only the last one.
    errors = []
    for done in dones:
        if not nfs_exists(done):
            errors.append("DONE file {f} does not exist.".format(f=done))
    for pickle in splitted_pickles:
        if not nfs_exists(pickle):
            errors.append("Pickle file {f} does not exist.".format(f=pickle))
    if errors:
        raise ValueError(" ".join(errors))

    # root_dir/output/map_noFL/nfl.all.partial_uc.pickle
    out_pickle = icef.nfl_all_pickle_fn

    return (splitted_pickles, out_pickle)
def __init__(self, root_dir, fasta_filenames, fastq_filenames, ref_fasta,
             out_pickle, ice_opts, sge_opts, cpus, tmp_dir=None):
    """
    fasta_filenames --- a list of splitted nfl fasta files.
    fastq_filenames --- optional matching fastq files, or None.
    ref_fasta --- (unpolished) consensus isoforms
    out_pickle --- a pickle file with all nfl fasta reads
    root_dir --- ICE root output directory
    tmp_dir --- if not None, write temporary clusters, dazz, las files
                to the given temporaray directory
    cpus --- number of CPUs to use per SGE job or per local job.
    sge_opts --- params for SGE environment, including
        use_sge : use SGE or not
        max_sge_jobs: maximum number of sub-jobs submitted
        unique_id : unique qsub job id, important that this
                    DOES NOT CONFLICT!

    Raises IOError if any fastq in fastq_filenames does not exist.
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name, root_dir=root_dir,
                      tmp_dir=tmp_dir)

    self.fasta_filenames, self.ref_fasta = \
        self._validate_inputs(fasta_filenames=fasta_filenames,
                              ref_fasta=ref_fasta)

    if fastq_filenames is not None:
        for fq in fastq_filenames:
            # Explicit raise instead of `assert op.exists(fq)`: asserts
            # are stripped under `python -O` and give no useful message.
            if not op.exists(fq):
                raise IOError(
                    "Input fastq file {f} does not exist.".format(f=fq))
    self.fastq_filenames = fastq_filenames  # note: could be None

    self.out_pickle = out_pickle
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    # this is the number of CPUs to use per SGE job or per local job
    self.cpus = cpus

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    self.add_log("input fasta files are: " +
                 ", ".join(self.fasta_filenames))
    self.add_log("temp pickle files are: " +
                 ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
    self.add_log("temp directory is: " + str(self.tmp_dir))
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts, ipq_opts, fasta_fofn=None, tmp_dir=None):
    """
    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. ccs.fofn of ccs files.
    fasta_fofn --- optional fofn of fasta files.
    tmp_dir --- optional directory for temporary files.
    ipq_opts --- IceQuiverHQLQOptions
                 qv_trim_5: ignore QV of n bases in the 5' end
                 qv_trim_3: ignore QV of n bases in the 3' end
                 hq_quiver_min_accuracy: minimum allowed quiver accuracy
                 to mark an isoform as high quality
                 hq_isoforms_fa|fq: polished, hiqh quality consensus
                 isoforms in fasta|q
                 lq_isoforms_fa|fq: polished, low quality consensus
                 isoforms in fasta|q
    """
    # Initialize the IceFiles layout (log files, output dirs) first so
    # that add_log below works.
    IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                      bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)

    # Option bundles for the three polishing stages.
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    self.nfl_fa = realpath(nfl_fa)

    msg = "ece_penalty: {0}, ece_min_len: {1}".format(
        self.ice_opts.ece_penalty, self.ice_opts.ece_min_len)
    self.add_log(msg)

    # Sub-stage runners, created lazily later.
    self.icep = None    # IceAllPartials.
    self.iceq = None    # IceQuiver
    self.icepq = None   # IceQuiverPostprocess
    self._nfl_splitted_fas = None

    self.validate_inputs()
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs, return
    (num_reads,
     number_reads_per_chunk,
     nfl_dir,
     [i-th_chunk_nfl_fa for i in [0...N-1]])

    Raises ValueError if N is out of range or nfl_fa does not exist.
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)
    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

    mkdir(icef.nfl_dir)

    # Validate N (matches the sibling implementation of this method):
    # N <= 0 would otherwise produce an empty chunk list above and a
    # ZeroDivisionError in the reads_per_split computation below.
    if N <= 0 or N > 100:
        raise ValueError("Input file can not be splitted into %d chunks!" % N)

    # Check if inputs exist.
    if not nfs_exists(nfl_fa):
        raise ValueError("The input non-full-length reads fasta file " +
                         "{f} does not exists. ".format(f=nfl_fa))

    num_reads = num_reads_in_fasta(nfl_fa)
    reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def _validate_inputs(self, root_dir, nfl_fa, N):
    """
    Check inputs, return
    (num_reads,
     number_reads_per_chunk,
     nfl_dir,
     [i-th_chunk_nfl_fa for i in [0...N-1]])

    Raises ValueError at the first invalid input.  (The previous
    implementation overwrote errMsg, so an out-of-range N was masked
    when nfl_fa was also missing.)
    """
    icef = IceFiles(prog_name="ice_partial_split",
                    root_dir=root_dir, no_log_f=False)
    nfl_dir = icef.nfl_dir

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta
    splitted_nfl_fas = [icef.nfl_fa_i(i) for i in range(0, N)]

    mkdir(icef.nfl_dir)

    # Validate N before it is used as a divisor below.
    if N <= 0 or N > 100:
        raise ValueError("Input file can not be splitted into %d chunks!" % N)

    # Check if inputs exist.
    if not nfs_exists(nfl_fa):
        raise ValueError("The input non-full-length reads fasta file " +
                         "{f} does not exists. ".format(f=nfl_fa))

    num_reads = num_reads_in_fasta(nfl_fa)
    reads_per_split = int(max(1, ceil(num_reads * 1.0 / N)))

    return (num_reads, reads_per_split, nfl_dir, splitted_nfl_fas)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
    """
    Check inputs, write $ICE_PARTIAL_PY i command to script_file
    and return (input_fasta, ref_fasta, out_pickle, done_file)
    for the i-th chunk of nfl reads.

    Parameters:
      root_dir --- ICE root output directory.
      i --- index of the nfl chunk to process.
      ccs_fofn --- passed through to the generated command.
      blasr_nproc --- passed through to the generated command.
      tmp_dir --- passed through to the generated command.

    Raises ValueError if the chunk fasta, the consensus fasta, or the
    dazz db is missing.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fasta
    ref_fasta = icef.final_consensus_fa
    ref_dazz = icef.final_dazz_db

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta
    input_fasta = icef.nfl_fa_i(i)

    # $input_fasta.partial_uc.pickle
    out_pickle = icef.nfl_pickle_i(i)

    # $input_fasta.partial_uc.pickle.DONE
    done_file = icef.nfl_done_i(i)

    # $input_fasta.partial_uc.sh
    script_file = icef.nfl_script_i(i)

    # Check if inputs exist.
    errMsg = ""
    if not nfs_exists(input_fasta):
        errMsg = (
            "The {i}-th splitted non-full-length reads ".format(i=i) +
            "fasta file {f} does not exist. ".format(f=input_fasta) +
            "Please run $ICE_PARTIAL_PY split first.")
    elif not nfs_exists(ref_fasta):
        errMsg = ("The unpolished consensus isoforms fasta file " +
                  "{f} does not exist. ".format(f=ref_fasta) +
                  "Please make sure ICE is successfully done.")
    elif not nfs_exists(ref_dazz):
        errMsg = ("The dazz db " +
                  "{f} does not exist. ".format(f=ref_dazz) +
                  "Please make sure it is already built.")
    if len(errMsg) != 0:
        raise ValueError(errMsg)

    # Save cmd to script_file.
    # NOTE(review): i is wrapped in a one-element list here — presumably
    # _cmd_str accepts a list of chunk indices; confirm against _cmd_str.
    cmd = self._cmd_str(root_dir=root_dir, i=[i],
                        ccs_fofn=ccs_fofn,
                        blasr_nproc=blasr_nproc,
                        tmp_dir=tmp_dir)

    with open(script_file, 'w') as writer:
        writer.write(cmd + "\n")

    icef.add_log(
        "Writing CMD to: {script_file}".format(script_file=script_file))
    icef.close_log()

    return (input_fasta, ref_fasta, out_pickle, done_file)
def __init__(self, root_dir, fasta_filenames, ref_fasta,
             out_pickle, sge_opts, ccs_fofn=None, tmp_dir=None):
    """
    fasta_filenames --- a list of splitted nfl fasta files.
    ref_fasta --- (unpolished) consensus isoforms
    out_pickle --- a pickle file with all nfl fasta reads
    ccs_fofn --- should be reads_of_insert.fofn or None
    root_dir --- ICE root output directory
    tmp_dir --- if not None, write temporary clusters, dazz, las files
                to the given temporaray directory
    sge_opts --- params for SGE environment, including
        use_sge : use SGE or not
        max_sge_jobs: maximum number of sub-jobs submitted
        unique_id : unique qsub job id, important that this
                    DOES NOT CONFLICT!
        blasr_nproc: blasr -nproc param, number of threads per cpu.
    """
    self.prog_name = "IceAllPartials"
    IceFiles.__init__(self, prog_name=self.prog_name,
                      root_dir=root_dir, tmp_dir=tmp_dir)

    # Validated copies of the inputs.
    validated = self._validate_inputs(fasta_filenames=fasta_filenames,
                                      ref_fasta=ref_fasta,
                                      ccs_fofn=ccs_fofn)
    self.fasta_filenames, self.ref_fasta, self.ccs_fofn = validated

    self.out_pickle = out_pickle
    self.sge_opts = sge_opts

    self.add_log("Making dir for mapping noFL reads: " + self.nfl_dir)
    mkdir(self.nfl_dir)

    self.add_log("input fasta files are: " +
                 ", ".join(self.fasta_filenames))
    self.add_log("temp pickle files are: " +
                 ", ".join(self.pickle_filenames))
    self.add_log("out pickle file is: " + self.out_pickle)
    self.add_log("temp directory is: " + str(self.tmp_dir))
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir):
    """
    Check inputs, write $ICE_PARTIAL_PY i command to script_file
    and return (input_fasta, ref_fasta, out_pickle, done_file)
    for the i-th chunk of nfl reads.

    Raises ValueError if the chunk fasta, the consensus fasta, or the
    dazz db is missing.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fasta
    ref_fasta = icef.final_consensus_fa
    ref_dazz = icef.final_dazz_db

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta
    input_fasta = icef.nfl_fa_i(i)

    # $input_fasta.partial_uc.pickle
    out_pickle = icef.nfl_pickle_i(i)

    # $input_fasta.partial_uc.pickle.DONE
    done_file = icef.nfl_done_i(i)

    # $input_fasta.partial_uc.sh
    script_file = icef.nfl_script_i(i)

    # Check if inputs exist.
    errMsg = ""
    if not nfs_exists(input_fasta):
        errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                  "fasta file {f} does not exist. ".format(f=input_fasta) +
                  "Please run $ICE_PARTIAL_PY split first.")
    elif not nfs_exists(ref_fasta):
        errMsg = ("The unpolished consensus isoforms fasta file " +
                  "{f} does not exist. ".format(f=ref_fasta) +
                  "Please make sure ICE is successfully done.")
    elif not nfs_exists(ref_dazz):
        errMsg = ("The dazz db " +
                  "{f} does not exist. ".format(f=ref_dazz) +
                  "Please make sure it is already built.")
    if len(errMsg) != 0:
        raise ValueError(errMsg)

    # Save cmd to script_file.
    # NOTE(review): i is wrapped in a one-element list — presumably
    # _cmd_str takes a list of chunk indices; confirm against _cmd_str.
    cmd = self._cmd_str(root_dir=root_dir, i=[i],
                        ccs_fofn=ccs_fofn,
                        blasr_nproc=blasr_nproc,
                        tmp_dir=tmp_dir)

    with open(script_file, 'w') as writer:
        writer.write(cmd + "\n")

    icef.add_log("Writing CMD to: {script_file}".
                 format(script_file=script_file))
    icef.close_log()

    return (input_fasta, ref_fasta, out_pickle, done_file)
def run_after(self, rtc, output_dir):
    """
    Verify that per-bin polished HQ/LQ isoform files and submitted
    quiver job logs exist, and that the first LQ fasta is non-empty.
    """
    self.assertTrue(op.exists(rtc.task.output_files[0]))
    out_dir = op.join(OUT_DIR, "test_gather_polished_isoforms_in_each_bin")
    cluster_out_dirs = [op.join(out_dir, bin_name, "cluster_out")
                        for bin_name in BIN_NAMES]

    # Every bin must have produced all expected HQ isoform files.
    out_hq_fns = [op.join(d, fn)
                  for d in cluster_out_dirs for fn in HQ_ISOFORMS_FNS]
    print "out_hq_fns %s" % out_hq_fns
    self.assertTrue(all([op.exists(f) for f in out_hq_fns]))

    # ... and all expected LQ isoform files.
    out_lq_fns = [op.join(d, fn)
                  for d in cluster_out_dirs for fn in LQ_ISOFORMS_FNS]
    print "out_lq_fns %s" % out_lq_fns
    self.assertTrue(all([op.exists(f) for f in out_lq_fns]))

    # The first LQ fasta must contain at least one record.
    print "out_lq_fa %s is not empty" % out_lq_fns[0]
    n = len([r for r in FastaReader(out_lq_fns[0])])
    self.assertTrue(n > 0)

    # Each bin's quiver job submission log must exist.
    out_logs = [IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log
                for d in cluster_out_dirs]
    print "out_logs %s" % out_logs
    self.assertTrue(all([op.exists(f) for f in out_logs]))
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    Writes a line per removed directory to the sentinel output file.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that Cluster indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            # Remove the bin's temporary working directory.
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            # NOTE(review): shell "rm -rf" on a path derived from the
            # task pickle; paths look internally generated, but confirm
            # real_upath escaping is sufficient for paths with spaces.
            execute("rm -rf %s" % real_upath(tmp_dir))
            # Remove the bin's quiver output directory as well.
            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))
def __init__(self, root_dir, nfl_fa, bas_fofn, ccs_fofn,
             ice_opts, sge_opts, ipq_opts, fasta_fofn=None, tmp_dir=None):
    """
    root_dir --- IceFiles.root_dir, usually data/clusterOutDir
    nfl_fa --- non-full-length reads in fasta, e.g., isoseq_nfl.fasta
    bas_fofn --- e.g. input.fofn of bas|bax.h5 files
    ccs_fofn --- e.g. ccs.fofn of ccs files.
    fasta_fofn --- optional fofn of fasta files.
    tmp_dir --- optional directory for temporary files.
    ipq_opts --- IceQuiverHQLQOptions
                 qv_trim_5: ignore QV of n bases in the 5' end
                 qv_trim_3: ignore QV of n bases in the 3' end
                 hq_quiver_min_accuracy: minimum allowed quiver accuracy
                 to mark an isoform as high quality
                 hq_isoforms_fa|fq: polished, hiqh quality consensus
                 isoforms in fasta|q
                 lq_isoforms_fa|fq: polished, low quality consensus
                 isoforms in fasta|q
    """
    # Set up the IceFiles directory/log layout before any add_log call.
    IceFiles.__init__(self, prog_name="IcePolish", root_dir=root_dir,
                      bas_fofn=bas_fofn, ccs_fofn=ccs_fofn,
                      fasta_fofn=fasta_fofn, tmp_dir=tmp_dir)
    self.nfl_fa = realpath(nfl_fa)
    self.ice_opts = ice_opts
    self.sge_opts = sge_opts
    self.ipq_opts = ipq_opts

    self.add_log("ece_penalty: {0}, ece_min_len: {1}".format(
        self.ice_opts.ece_penalty, self.ice_opts.ece_min_len))

    # Sub-stage runners, assigned later by the polishing workflow.
    self.icep = None   # IceAllPartials.
    self.iceq = None   # IceQuiver
    self.icepq = None  # IceQuiverPostprocess
    self._nfl_splitted_fas = None

    self.validate_inputs()
def run_after(self, rtc, output_dir):
    """
    Verify that every (cluster bin, nfl chunk) pair produced its
    scattered partial_uc pickle on disk.
    """
    self.assertTrue(op.exists(rtc.task.output_files[0]))
    out_dir = op.join(OUT_DIR, "test_ice_partial_cluster_bins")
    cluster_out_dirs = [op.join(out_dir, bin_name, "cluster_out")
                        for bin_name in BIN_NAMES]
    # One pickle per nfl chunk per cluster bin.
    out_pickles = [IceFiles(prog_name="", root_dir=d).nfl_pickle_i(i=i)
                   for d in cluster_out_dirs
                   for i in range(N_NFL_CHUNKS)]
    print "output scattered nfl pickles are %s" % out_pickles
    self.assertTrue(all([op.exists(f) for f in out_pickles]))
def run_after(self, rtc, output_dir): self.assertTrue(op.exists(rtc.task.output_files[0])) out_dir = op.join(OUT_DIR, "test_gather_ice_partial_cluster_bins_pickle") cluster_out_dirs = [ op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES ] out_pickles = [ IceFiles(prog_name="", root_dir=d).nfl_all_pickle_fn for d in cluster_out_dirs ] print "output nfl pickles are %s" % out_pickles self.assertTrue(all([op.exists(f) for f in out_pickles]))
def prefix_nfl_pickle_tuples(self):
    """Returns a list of (sample_prefix, nfl_uc_pickle) tuples."""
    tuples = []
    for prefix, cluster_out_d in self.prefix_dict.iteritems():
        # Strip a single trailing '|' delimiter, if present.
        if prefix.endswith('|'):
            prefix = prefix[:-1]
        nfl_fn = IceFiles(prog_name="Count", root_dir=cluster_out_d,
                          no_log_f=True).nfl_all_pickle_fn
        if not op.exists(nfl_fn):
            raise IOError(
                "NFL pickle %s of sample prefix %s does not exist."
                % (nfl_fn, prefix))
        tuples.append((prefix, nfl_fn))
    return tuples
def run_after(self, rtc, output_dir): self.assertTrue(op.exists(rtc.task.output_files[i]) for i in range(7)) out_dir = op.join(OUT_DIR, "test_combine_cluster_bins") cluster_out_dirs = [ op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES ] combined_lq_cs = rtc.task.output_files[5] print "combined_lq_fa %s must not be empty" % combined_lq_cs n = len([r for r in ContigSet(combined_lq_cs)]) self.assertTrue(n > 0) out_logs = [ IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log for d in cluster_out_dirs ] print "out_logs %s" % out_logs self.assertTrue(all([op.exists(f) for f in out_logs]))
def resolved_tool_contract_runner(rtc):
    """Given resolved tool contract, run.

    Groups PartialChunkTasks by cluster bin and merges each bin's
    per-chunk nfl pickles into one nfl.all.partial_uc.pickle, writing a
    DONE line per bin to the sentinel output file.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    # NOTE(review): groupby only merges *consecutive* equal keys, so this
    # relies on sorted_by_attr ordering p by cluster_bin_index; the return
    # value is ignored, so presumably it sorts in place — confirm.
    p.sorted_by_attr(attr='cluster_bin_index')
    assert all([isinstance(task, PartialChunkTask) for task in p])

    with open(rtc.task.output_files[0], 'w') as writer:
        for i, group in groupby(p, lambda x: x.cluster_bin_index):
            gs = [g for g in group]
            nfl_pickles_of_bin_i = [g.nfl_pickle for g in gs]
            # All tasks of a bin share the same cluster_out_dir.
            out_pickle = IceFiles(prog_name="",
                                  root_dir=gs[0].cluster_out_dir,
                                  no_log_f=True).nfl_all_pickle_fn
            log.info("Combining nfl pickles of cluster bin %s.", str(i))
            log.debug("nfl pickles are: %s.",
                      (", ".join(nfl_pickles_of_bin_i)))
            log.debug("Output merged nfl pickle is %s.", out_pickle)
            combine_nfl_pickles(splitted_pickles=nfl_pickles_of_bin_i,
                                out_pickle=out_pickle)
            writer.write("Merge nfl pickles of cluster bin %s DONE: %s\n"
                         % (i, out_pickle))
def nfl_pickle(self):
    """Return output nfl pickle of the i-th chunk."""
    ice_files = IceFiles(prog_name="", root_dir=self.cluster_out_dir,
                         no_log_f=True)
    return ice_files.nfl_pickle_i(self.nfl_index)
def _validate_inputs(self, root_dir, i, ccs_fofn, blasr_nproc, tmp_dir,
                     ref_fasta=None):
    """
    Check inputs, write $ICE_PARTIAL_PY i command to script_file
    and return (input_fasta, ref_fasta, out_pickle, done_file)
    for the i-th chunk of nfl reads.

    NOTE(review): the ref_fasta parameter is unused — it is immediately
    overwritten by icef.final_consensus_fa below; confirm whether it can
    be dropped or should be honored.

    Raises IOError if the chunk fasta, the consensus fasta, or the dazz
    db is missing.
    """
    icef = IceFiles(prog_name="ice_partial_{i}".format(i=i),
                    root_dir=root_dir, no_log_f=False)

    # root_dir/output/final.consensus.fasta
    ref_fasta = icef.final_consensus_fa
    ref_dazz = icef.final_dazz_db

    # root_dir/output/map_noFL/input.split_{0:03d}.fasta
    input_fasta = icef.nfl_fa_i(i)

    # $input_fasta.partial_uc.pickle
    out_pickle = icef.nfl_pickle_i(i)

    # $input_fasta.partial_uc.pickle.DONE
    done_file = icef.nfl_done_i(i)

    # $input_fasta.partial_uc.sh
    script_file = icef.nfl_script_i(i)

    # Check if inputs exist.
    errMsg = ""
    if not nfs_exists(input_fasta):
        errMsg = ("The {i}-th splitted non-full-length reads ".format(i=i) +
                  "fasta file {f} does not exist. ".format(f=input_fasta) +
                  "Please run $ICE_PARTIAL_PY split first.")
    elif not nfs_exists(ref_fasta):
        # ref_fasta --- root_dir/output/final.consensus.fasta
        # ref_dazz --- root_dir/output/final.consensus.dazz.fasta.db
        # ref_fasta and ref_dazz must exist if ICE has run successfully in
        # root_dir. If either one does not exist, it means ICE has not
        # successfully run in root_dir. Then we have to throw an error message
        # requring users to copy the root_dir/output directory manually,
        # rather than providing an option to overwrite ref_fasta and build
        # ref_dazz, because a race condition can happen when multiple
        # IcePartialI tasks start to run at the same time, which can corrupt
        # fasta and dazz db files and lead to unexpected runtime errors.
        errMsg = ("The unpolished consensus isoforms fasta file " +
                  "{f} does not exist. ".format(f=ref_fasta) +
                  "Please make sure ICE is successfully done in root_dir, " +
                  "or copy ICE output directory (e.g., cluster_out/output) " +
                  "to {dst}".format(dst=op.dirname(ref_fasta)))
    elif not nfs_exists(ref_dazz):
        errMsg = ("The dazz db " +
                  "{f} does not exist. ".format(f=ref_dazz) +
                  "Please make sure it is already built.")
    if len(errMsg) != 0:
        raise IOError(errMsg)

    # Save cmd to script_file.
    # NOTE(review): i is wrapped in a one-element list — presumably
    # _cmd_str takes a list of chunk indices; confirm against _cmd_str.
    cmd = self._cmd_str(root_dir=root_dir, i=[i],
                        ccs_fofn=ccs_fofn,
                        blasr_nproc=blasr_nproc,
                        tmp_dir=tmp_dir)

    with open(script_file, 'w') as writer:
        writer.write(cmd + "\n")

    icef.add_log("Writing CMD to: {script_file}".
                 format(script_file=script_file))
    icef.close_log()

    return (input_fasta, ref_fasta, out_pickle, done_file)
def consensus_isoforms_file(self):
    """Return output consensus isoform file,
    cluster_out/output/final.consensus.fasta"""
    ice_files = IceFiles(root_dir=self.cluster_out_dir, prog_name="",
                         no_log_f=True)
    return ice_files.final_consensus_fa
def nfl_pickle(self):
    """Return output nfl pickle file,
    cluster_out/output/nfl.all.partial_uc.pickle
    """
    ice_files = IceFiles(prog_name="", root_dir=self.cluster_out_dir,
                         no_log_f=True)
    return ice_files.nfl_all_pickle_fn
def flnc_pickle(self):
    """Return output flnc pickle file, cluster_out/output/final.pickle"""
    ice_files = IceFiles(root_dir=self.cluster_out_dir, prog_name="",
                         no_log_f=True)
    return ice_files.final_pickle_fn