def copy_in_fasta_to_out(in_dir, out_dir, filename):
    """copy filename from in_dir (e.g., data) to out_dir, return out_fasta

    Creates a fresh out_dir via mknewdir, copies in_dir/filename into it,
    and returns the destination path (out_dir/filename).
    """
    mknewdir(out_dir)
    src = op.join(in_dir, filename)
    dst = op.join(out_dir, filename)
    # Use shutil.copy instead of shelling out to `cp` (consistent with the
    # rest of this file); a failed copy raises instead of passing silently.
    shutil.copy(src, dst)
    return dst
def setUp(self):
    """Prepare per-test input/output/stdout paths and a clean output dir."""
    self.inputDir = op.join(DATA_DIR, self.testName)
    self.outDir = op.join(OUT_DIR, self.testName)
    self.stdoutDir = op.join(STD_DIR, self.testName)
    self.filename = "test_DazzIDHandler.contigset.xml"
    # Strip the trailing ".xml" to derive the expected dazz fasta name.
    base = self.filename[:-4]
    self.stdout_dazz_fasta = op.join(self.stdoutDir, base + ".dazz.fasta")
    self.stdout_pickle = self.stdout_dazz_fasta + ".pickle"
    mknewdir(self.outDir)
def _test_daligner_against_ref(self, test_name, use_sge, sge_opts,
                               prob_model_from="fake"):
    """Test daligner_against_ref with and without using sge.

    Parameters:
      test_name -- name of the per-test output subdirectory.
      use_sge -- whether DalignerRunner should submit jobs via SGE.
      sge_opts -- SGE options handed to DalignerRunner.
      prob_model_from -- "fake" builds a fixed-rate ProbFromModel;
          "fastq" derives a ProbFromFastq from a test fastq file.
    """
    copy_dir = op.join(self.dataDir, "test_daligner_against_ref")
    output_dir = op.join(self.outDir, test_name)
    mknewdir(output_dir)

    qname, tname = "test_daligner_query.fasta", "test_daligner_target.fasta"
    query_filename = op.join(output_dir, qname)
    target_filename = op.join(output_dir, tname)

    # Build the QV probability model used to score alignments.
    if prob_model_from == "fake":
        prob_model = ProbFromModel(0.01, 0.07, 0.06)
    elif prob_model_from == "fastq":
        fastq_fn = op.join(copy_dir, "test_daligner_reads.fastq")
        prob_model = ProbFromFastq(fastq_fn)
    else:
        self.fail("Unsupported prob_model_from: %s" % prob_model_from)
    qver_get_func = prob_model.get_smoothed
    qvmean_get_func = prob_model.get_mean

    # shutil.copy raises on failure, replacing the old backticks("cp ...")
    # + return-code assertions.
    shutil.copy(op.join(copy_dir, qname), query_filename)
    shutil.copy(op.join(copy_dir, tname), target_filename)

    old_dir = os.getcwd()
    os.chdir(output_dir)
    try:
        runner = DalignerRunner(query_filename=query_filename,
                                target_filename=target_filename,
                                is_FL=True, same_strand_only=True,
                                use_sge=use_sge, sge_opts=sge_opts)
        runner.run(output_dir=output_dir)

        hits = []
        for la4ice_filename in runner.la4ice_filenames:
            hits.extend(daligner_against_ref(
                query_dazz_handler=runner.query_dazz_handler,
                target_dazz_handler=runner.target_dazz_handler,
                la4ice_filename=la4ice_filename,
                is_FL=True, sID_starts_with_c=False,
                qver_get_func=qver_get_func,
                qvmean_get_func=qvmean_get_func))
        # Num of hits may change when daligner or parameters change.
        # BUG FIX: the original called assertTrue(len(hits), 706), which
        # passed 706 as the failure message and asserted nothing useful.
        self.assertEqual(len(hits), 706)
        self.assertEqual(
            str(hits[0]),
            "m54007_160109_025449/27984844/29_646_CCS/0_617 aligns to m54007_160109_025449/28836279/631_54_CCS")
    finally:
        # BUG FIX: the original ended with os.chdir(output_dir), a no-op
        # that left the process in output_dir; restore the caller's cwd.
        os.chdir(old_dir)
def _test_bin_manual(self, bin_manual, expected_bin_manual):
    """Test SeparateFLNCBySize setting bin manually."""
    out_dir = op.join(OUT_DIR, 'separate_flnc_by_size_bin_manual')
    mknewdir(out_dir)
    with SeparateFLNCBySize(flnc_filename=FLNC_FASTA,
                            bin_manual=bin_manual,
                            root_dir=out_dir) as obj:
        obj.run()
    self.assertEqual(obj.sorted_keys, expected_bin_manual)
    # Every read written to a bin's fasta must fall inside that bin's range.
    for idx, key in enumerate(obj.sorted_keys):
        with FastaReader(obj.out_fasta_files[idx]) as reader:
            in_range = [key[0].contains(len(rec.sequence)) for rec in reader]
            self.assertTrue(all(in_range))
def setUp(self):
    """Set up paths, recreate the output dir, and stage the test fasta."""
    self.inputDir = op.join(DATA_DIR, self.testName)
    self.outDir = op.join(OUT_DIR, self.testName)
    self.stdoutDir = op.join(STD_DIR, self.testName)
    self.fastaFileName = "test_DazzIDHandler.fasta"
    # Drop the ".fasta" suffix to name the expected dazz fasta.
    prefix = self.fastaFileName[:-6]
    self.stdout_dazz_fasta = op.join(self.stdoutDir, prefix + ".dazz.fasta")
    self.stdout_pickle = self.stdout_dazz_fasta + ".pickle"
    mknewdir(self.outDir)
    # Copy inputDir/test_DazzIDHandler.fasta to outDir.
    src = op.join(self.inputDir, self.fastaFileName)
    dst = op.join(self.outDir, self.fastaFileName)
    execute("cp %s %s" % (src, dst))
def setUp(self):
    """Initialize directories and copy the input fasta into the output dir."""
    self.inputDir = op.join(DATA_DIR, self.testName)
    self.outDir = op.join(OUT_DIR, self.testName)
    self.stdoutDir = op.join(STD_DIR, self.testName)
    self.fastaFileName = "test_DazzIDHandler.fasta"
    # Expected stdout artifacts: <name>.dazz.fasta and its pickle.
    stem = self.fastaFileName[:-6]
    self.stdout_dazz_fasta = op.join(self.stdoutDir, stem + ".dazz.fasta")
    self.stdout_pickle = self.stdout_dazz_fasta + ".pickle"
    mknewdir(self.outDir)
    # Copy inputDir/test_DazzIDHandler.fasta to outDir.
    execute("cp %s %s" % (op.join(self.inputDir, self.fastaFileName),
                          op.join(self.outDir, self.fastaFileName)))
def _test_bin_manual(self, bin_manual, expected_bin_manual):
    """Test SeparateFLNCBySize setting bin manually."""
    out_dir = op.join(OUT_DIR, 'separate_flnc_by_size_bin_manual')
    mknewdir(out_dir)
    with SeparateFLNCBySize(flnc_filename=FLNC_FASTA,
                            bin_manual=bin_manual,
                            root_dir=out_dir) as separator:
        separator.run()
    self.assertEqual(separator.sorted_keys, expected_bin_manual)
    # Each output fasta should only contain reads whose length is
    # covered by the corresponding bin's interval.
    for i, key in enumerate(separator.sorted_keys):
        with FastaReader(separator.out_fasta_files[i]) as reader:
            self.assertTrue(all(key[0].contains(len(r.sequence))
                                for r in reader))
class TestClusterBins(pbcommand.testkit.PbTestApp):
    """Call python -m pbtranscript.tasks.cluster_bins --resolved-tool-contract rtc.json"""
    # Class-level setup (executed at import time): create a clean output
    # directory, then re-root cluster_chunks.pickle into it via make_pickle.
    out_dir = op.join(OUT_DIR, "test_cluster_bins")
    mknewdir(out_dir)
    out_cluster_chunks_pickle = op.join(out_dir, "cluster_chunks.pickle")
    make_pickle(in_pickle=cluster_chunks_pickle,
                out_pickle=out_cluster_chunks_pickle,
                root_dir=out_dir)

    # Attributes consumed by the PbTestApp framework.
    DRIVER_BASE = "python -m pbtranscript.tasks.cluster_bins"
    INPUT_FILES = [out_cluster_chunks_pickle,  # input 0, cluster_chunks.pickle
                   ccs_ds]  # idx 1, ccs

    def run_after(self, rtc, output_dir):
        """After the tool runs, verify the declared output exists and each
        bin's cluster_out dir contains final.consensus.fasta."""
        out_dir = op.join(OUT_DIR, "test_cluster_bins")
        cluster_out_dirs = [op.join(out_dir, bin_name, "cluster_out")
                            for bin_name in BIN_NAMES]
        self.assertTrue(op.exists(rtc.task.output_files[0]))
        out_consensus_isoforms = [op.join(d, "output", "final.consensus.fasta")
                                  for d in cluster_out_dirs]
        print out_consensus_isoforms
        self.assertTrue(all([op.exists(f) for f in out_consensus_isoforms]))
class TestGatherIcePartialPickle(pbcommand.testkit.PbTestApp):
    """Call python -m pbtranscript.tasks.gather_ice_partial_cluster_bins_pickle --resolved-tool-contract rtc.json"""
    # Class-level setup (executed at import time): fresh output dir plus a
    # partial_chunks.pickle re-rooted there with consensus isoforms and
    # nfl pickles copied in.
    out_dir = op.join(OUT_DIR, "test_gather_ice_partial_cluster_bins_pickle")
    mknewdir(out_dir)
    out_partial_chunks_pickle = op.join(out_dir, "partial_chunks.pickle")
    make_pickle(in_pickle=partial_chunks_pickle,
                out_pickle=out_partial_chunks_pickle,
                root_dir=out_dir,
                copy_consensus_isoforms=True,
                copy_nfl_pickle=True)

    # Attributes consumed by the PbTestApp framework.
    DRIVER_BASE = "python -m pbtranscript.tasks.gather_ice_partial_cluster_bins_pickle"
    INPUT_FILES = [out_partial_chunks_pickle,  # input 0, partial_chunk.pickle
                   done_txt]  # idx 1, sentinel file

    def run_after(self, rtc, output_dir):
        """Verify the declared output exists and that every bin's
        cluster_out dir got its gathered nfl pickle."""
        self.assertTrue(op.exists(rtc.task.output_files[0]))
        out_dir = op.join(OUT_DIR, "test_gather_ice_partial_cluster_bins_pickle")
        cluster_out_dirs = [op.join(out_dir, bin_name, "cluster_out")
                            for bin_name in BIN_NAMES]
        out_pickles = [IceFiles(prog_name="", root_dir=d).nfl_all_pickle_fn
                       for d in cluster_out_dirs]
        print "output nfl pickles are %s" % out_pickles
        self.assertTrue(all([op.exists(f) for f in out_pickles]))
def test_run(self): """Test run(output_dir, min_match_len, sensitive_mode). running on sge and locally. """ run_on_sge = (backticks('qstat')[1] == 0) if run_on_sge: self.runner.use_sge = True self.runner.sge_opts = SgeOptions(100) mknewdir(self.out_dir) self.runner.run(output_dir=self.out_dir) for las_filename in self.runner.las_filenames: print "Checking existance of " + las_filename self.assertTrue(op.exists(las_filename)) for la4ice_filename in self.runner.la4ice_filenames: print "Checking existance of " + la4ice_filename self.assertTrue(op.exists(la4ice_filename)) # Run locally self.runner.use_sge = False mknewdir(self.out_dir) self.runner.run(output_dir=self.out_dir) for las_filename in self.runner.las_filenames: print "Checking existance of " + las_filename self.assertTrue(op.exists(las_filename)) for la4ice_filename in self.runner.la4ice_filenames: print "Checking existance of " + la4ice_filename self.assertTrue(op.exists(la4ice_filename)) # clean all output self.runner.clean_run() for las_filename in self.runner.las_filenames: print "Checking %s has been removed.\n" % las_filename self.assertTrue(not op.exists(las_filename)) for la4ice_filename in self.runner.la4ice_filenames: print "Checking %s has been removed.\n" % la4ice_filename self.assertTrue(not op.exists(la4ice_filename))
def test_as_contigset(self):
    """Test as_contigset"""
    out_dir = op.join(OUT_DIR, 'test_Utils')
    mknewdir(out_dir)

    # An empty fasta must still produce a contigset xml and a .fai index.
    fa = op.join(out_dir, "empty.fasta")
    xml = op.join(out_dir, "empty.contigset.xml")
    execute("touch %s" % fa)
    as_contigset(fa, xml)
    self.assertTrue(op.exists(xml))
    self.assertTrue(op.exists(fa + ".fai"))

    # A real fasta: first convert in place, then emit a separate xml.
    fn = 'reads_of_insert.fasta'
    fa = op.join(out_dir, fn)
    shutil.copy(src=op.join(DATA_DIR, fn), dst=fa)
    as_contigset(fa, fa)
    xml = op.join(out_dir, 'reads_of_insert.contigset.xml')
    as_contigset(fa, xml)
    self.assertTrue(op.exists(xml))
    self.assertTrue(op.exists(fa + ".fai"))
class TestGatherPolishedIsoforms(pbcommand.testkit.PbTestApp):
    """Call python -m pbtranscript.tasks.gather_polished_isoforms_in_each_bin --resolved-tool-contract rtc.json"""
    # Class-level setup (executed at import time): fresh output dir plus a
    # polish_chunks.pickle re-rooted there, with all per-bin inputs
    # (consensus isoforms, flnc/nfl pickles, quivered files) copied in.
    out_dir = op.join(OUT_DIR, "test_gather_polished_isoforms_in_each_bin")
    mknewdir(out_dir)
    out_polish_chunks_pickle = op.join(out_dir, "polish_chunks.pickle")
    make_pickle(in_pickle=polish_chunks_pickle,
                out_pickle=out_polish_chunks_pickle,
                root_dir=out_dir,
                copy_consensus_isoforms=True,
                copy_flnc_pickle=True,
                copy_nfl_pickle=True,
                copy_quivered=True)

    # Attributes consumed by the PbTestApp framework.
    DRIVER_BASE = "python -m pbtranscript.tasks.gather_polished_isoforms_in_each_bin"
    INPUT_FILES = [out_polish_chunks_pickle,  # input 0, polish_chunk.pickle
                   done_txt]  # idx 1, sentinel file

    def run_after(self, rtc, output_dir):
        """Verify the declared output plus per-bin HQ/LQ isoform files and
        submitted-quiver-jobs logs."""
        self.assertTrue(op.exists(rtc.task.output_files[0]))
        out_dir = op.join(OUT_DIR, "test_gather_polished_isoforms_in_each_bin")
        cluster_out_dirs = [op.join(out_dir, bin_name, "cluster_out")
                            for bin_name in BIN_NAMES]
        # HQ isoform files must exist in every bin's cluster_out dir.
        out_hq_fns = [op.join(d, fn)
                      for d in cluster_out_dirs for fn in HQ_ISOFORMS_FNS]
        print "out_hq_fns %s" % out_hq_fns
        self.assertTrue(all([op.exists(f) for f in out_hq_fns]))
        # So must the LQ isoform files.
        out_lq_fns = [op.join(d, fn)
                      for d in cluster_out_dirs for fn in LQ_ISOFORMS_FNS]
        print "out_lq_fns %s" % out_lq_fns
        self.assertTrue(all([op.exists(f) for f in out_lq_fns]))
        # The first LQ fasta must contain at least one record.
        print "out_lq_fa %s is not empty" % out_lq_fns[0]
        n = len([r for r in FastaReader(out_lq_fns[0])])
        self.assertTrue(n > 0)
        # Each bin must have logged its submitted quiver jobs.
        out_logs = [IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log
                    for d in cluster_out_dirs]
        print "out_logs %s" % out_logs
        self.assertTrue(all([op.exists(f) for f in out_logs]))
class TestCombineClusterBins(pbcommand.testkit.PbTestApp): """Call python -m pbtranscript.tasks.combine_cluster_bins --resolved-tool-contract rtc.json""" out_dir = op.join(OUT_DIR, "test_combine_cluster_bins") mknewdir(out_dir) out_cluster_chunks_pickle = op.join(out_dir, "cluster_chunks.pickle") make_pickle(in_pickle=cluster_chunks_pickle, out_pickle=out_cluster_chunks_pickle, root_dir=out_dir, copy_consensus_isoforms=True, copy_flnc_pickle=True, copy_nfl_pickle=True) cluster_out_dirs = [ op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES ] for D, d in zip(CLUSTER_OUT_DIRS, cluster_out_dirs): polish_log = op.join("log", "submitted_quiver_jobs.txt") shutil.copy(op.join(D, polish_log), op.join(d, polish_log)) for fn in HQ_ISOFORMS_FNS + LQ_ISOFORMS_FNS: shutil.copy(op.join(D, fn), op.join(d, fn)) DRIVER_BASE = "python -m pbtranscript.tasks.combine_cluster_bins" INPUT_FILES = [ out_cluster_chunks_pickle, # input 0, cluster_chunk.pickle done_txt ] # idx 1, sentinel file def run_after(self, rtc, output_dir): self.assertTrue(op.exists(rtc.task.output_files[i]) for i in range(7)) out_dir = op.join(OUT_DIR, "test_combine_cluster_bins") cluster_out_dirs = [ op.join(out_dir, bin_name, "cluster_out") for bin_name in BIN_NAMES ] combined_lq_cs = rtc.task.output_files[5] print "combined_lq_fa %s must not be empty" % combined_lq_cs n = len([r for r in ContigSet(combined_lq_cs)]) self.assertTrue(n > 0) out_logs = [ IceFiles(prog_name="", root_dir=d).submitted_quiver_jobs_log for d in cluster_out_dirs ] print "out_logs %s" % out_logs self.assertTrue(all([op.exists(f) for f in out_logs]))
class TestCreateChunks(pbcommand.testkit.PbTestApp):
    """Call python -m pbtranscript.tasks.create_chunks --resolved-tool-contract rtc.json"""
    # Class-level setup (executed at import time): make a clean output dir
    # and produce separate_flnc.pickle by running separate_flnc on flnc_ds.
    out_dir = op.join(OUT_DIR, "test_create_chunks")
    mknewdir(out_dir)
    separate_flnc_pickle = op.join(out_dir, "separate_flnc.pickle")
    call_separate_flnc(flnc_ds=flnc_ds, out_dir=out_dir,
                       out_pickle=separate_flnc_pickle)

    # Attributes consumed by the PbTestApp framework.
    DRIVER_BASE = "python -m pbtranscript.tasks.create_chunks"
    INPUT_FILES = [separate_flnc_pickle,  # input 0, separate_flnc.pickle
                   nfl_ds]  # input 1, nfl.xml

    def run_after(self, rtc, output_dir):
        """No assertions; just echo the three emitted output files."""
        print rtc.task.output_files[0]
        print rtc.task.output_files[1]
        print rtc.task.output_files[2]
class TestIcePolishClusterBins(pbcommand.testkit.PbTestApp):
    """Call python -m pbtranscript.tasks.ice_polish_cluster_bins --resolved-tool-contract rtc.json"""
    # Class-level setup (executed at import time): fresh output dir plus a
    # polish_chunks.pickle re-rooted there with per-bin inputs copied in.
    out_dir = op.join(OUT_DIR, "test_ice_polish_cluster_bins")
    mknewdir(out_dir)
    out_polish_chunks_pickle = op.join(out_dir, "polish_chunks.pickle")
    make_pickle(in_pickle=polish_chunks_pickle,
                out_pickle=out_polish_chunks_pickle,
                root_dir=out_dir,
                copy_consensus_isoforms=True,
                copy_flnc_pickle=True,
                copy_nfl_pickle=True)

    # Attributes consumed by the PbTestApp framework.
    DRIVER_BASE = "python -m pbtranscript.tasks.ice_polish_cluster_bins"
    INPUT_FILES = [out_polish_chunks_pickle,  # input 0, polish_chunk.pickle
                   done_txt,  # idx 1, sentinel file
                   subreads_ds]  # idx 2, subreads.bam

    def run_after(self, rtc, output_dir):
        """Only check that the declared output (sentinel) file exists."""
        self.assertTrue(op.exists(rtc.task.output_files[0]))
def run(self, output_dir='.', min_match_len=300, sensitive_mode=False):
    """Run all daligner jobs, then all LA4Ice jobs; return 0 on success.

    if self.use_sge --- writes to <scripts>/daligner_job_#.sh
    else --- run locally, dividing into self.cpus/4 tasks (capped max at 4)

    NOTE 1: when using SGE, be careful that multiple calls to this might
    end up writing to the SAME job.sh files; this should be avoided by
    changing the <scripts> directory.

    NOTE 2: more commonly this should be invoked locally (since
    ice_partial.py itself may be qsub-ed); in that case it is recommended
    to keep self.cpus = 4 so each daligner job runs consecutively, and the
    original qsub job should have been called with qsub -pe smp 4 (set by
    --blasr_nproc 4). In this way the daligner jobs are called
    consecutively, but LA4Ice is parallelized 4X.

    Raises RuntimeError if any daligner or LA4Ice job failed.
    """
    self.output_dir = realpath(output_dir)  # Reset output_dir
    old_dir = realpath(op.curdir)
    mkdir(output_dir)
    # All daligner/LA4Ice outputs are written relative to output_dir.
    os.chdir(output_dir)
    if self.use_sge:
        mknewdir(self.script_dir)
    # prepare done scripts is no longer necessary.
    #self.write_daligner_done_script()
    #self.write_la4ice_done_script()

    # (a) run all daligner jobs
    daligner_cmds = self.daligner_cmds(min_match_len=min_match_len,
                                       sensitive_mode=sensitive_mode)
    logging.info("Start daligner cmds " +
                 ("using sge." if self.use_sge else "locally."))
    logging.debug("CMD: " + "\n".join(daligner_cmds))
    start_t = time.time()
    # Records of failed jobs; x[0] is the failing cmd (see RuntimeError below).
    failed = []
    if self.use_sge:
        failed.extend(
            sge_job_runner(cmds_list=daligner_cmds,
                           script_files=self.daligner_scripts,
                           num_threads_per_job=DALIGNER_NUM_THREADS,
                           sge_opts=self.sge_opts,
                           qsub_try_times=3,
                           wait_timeout=600,
                           run_timeout=600,
                           rescue="sge",
                           rescue_times=3))
    else:
        # max 4 at a time to avoid running out of memory...
        failed.extend(
            local_job_runner(cmds_list=daligner_cmds,
                             num_threads=max(1, min(self.cpus/4, 4))))
    logging.info("daligner jobs took " + str(time.time()-start_t) + " sec.")

    # (b) run all LA4Ice jobs
    start_t = time.time()
    logging.info("Start LA4Ice cmds " +
                 ("using sge." if self.use_sge else "locally."))
    la4ice_cmds = self.la4ice_cmds
    logging.debug("CMD: " + "\n".join(la4ice_cmds))
    if self.use_sge:
        failed.extend(
            sge_job_runner(cmds_list=la4ice_cmds,
                           script_files=self.la4ice_scripts,
                           num_threads_per_job=DALIGNER_NUM_THREADS,
                           sge_opts=self.sge_opts,
                           qsub_try_times=3,
                           wait_timeout=600,
                           run_timeout=600,
                           rescue="sge",
                           rescue_times=3))
    else:
        # max 4 at a time to avoid running out of memory...
        failed.extend(
            local_job_runner(cmds_list=la4ice_cmds,
                             num_threads=max(1, min(self.cpus, 4))))
    logging.info("LA4Ice jobs took " + str(time.time()-start_t) + " sec.")

    # Restore the caller's working directory before reporting.
    os.chdir(old_dir)
    if len(failed) == 0:
        return 0
    else:
        # NOTE(review): op.basename(self.__class__) passes a class object,
        # not a string, and likely raises inside this error path -- confirm
        # whether self.__class__.__name__ was intended.
        raise RuntimeError("%s.run failed, %s." %
                           (op.basename(self.__class__),
                            "\n".join([x[0] for x in failed])))
def _test_daligner_against_ref(self, test_name, use_sge, sge_opts,
                               prob_model_from="fake"):
    """Test daligner_against_ref with and without using sge.

    Parameters:
      test_name -- name of the per-test output subdirectory.
      use_sge -- whether DalignerRunner should submit jobs via SGE.
      sge_opts -- SGE options handed to DalignerRunner.
      prob_model_from -- "fake" builds a fixed-rate ProbFromModel;
          "fastq" derives a ProbFromFastq from a test fastq file.
    """
    copy_dir = op.join(self.dataDir, "test_daligner_against_ref")
    output_dir = op.join(self.outDir, test_name)
    mknewdir(output_dir)

    qname, tname = "test_daligner_query.fasta", "test_daligner_target.fasta"
    query_filename = op.join(output_dir, qname)
    target_filename = op.join(output_dir, tname)

    # Build the QV probability model used to score alignments.
    if prob_model_from == "fake":
        prob_model = ProbFromModel(0.01, 0.07, 0.06)
    elif prob_model_from == "fastq":
        fastq_fn = op.join(copy_dir, "test_daligner_reads.fastq")
        prob_model = ProbFromFastq(fastq_fn)
    else:
        self.fail("Unsupported prob_model_from: %s" % prob_model_from)
    qver_get_func = prob_model.get_smoothed
    qvmean_get_func = prob_model.get_mean

    # shutil.copy raises on failure, replacing the old backticks("cp ...")
    # + return-code assertions.
    shutil.copy(op.join(copy_dir, qname), query_filename)
    shutil.copy(op.join(copy_dir, tname), target_filename)

    old_dir = os.getcwd()
    os.chdir(output_dir)
    try:
        runner = DalignerRunner(query_filename=query_filename,
                                target_filename=target_filename,
                                is_FL=True, same_strand_only=True,
                                use_sge=use_sge, sge_opts=sge_opts)
        runner.run(output_dir=output_dir)

        hits = []
        for la4ice_filename in runner.la4ice_filenames:
            hits.extend(daligner_against_ref(
                query_dazz_handler=runner.query_dazz_handler,
                target_dazz_handler=runner.target_dazz_handler,
                la4ice_filename=la4ice_filename,
                is_FL=True, sID_starts_with_c=False,
                qver_get_func=qver_get_func,
                qvmean_get_func=qvmean_get_func))
        # Num of hits may change when daligner or parameters change.
        # BUG FIX: the original called assertTrue(len(hits), 706), which
        # passed 706 as the failure message and asserted nothing useful.
        self.assertEqual(len(hits), 706)
        self.assertEqual(
            str(hits[0]),
            "m54007_160109_025449/27984844/29_646_CCS/0_617 aligns to m54007_160109_025449/28836279/631_54_CCS")
    finally:
        # BUG FIX: the original ended with os.chdir(output_dir), a no-op
        # that left the process in output_dir; restore the caller's cwd.
        os.chdir(old_dir)