def wait_for_sge_jobs(cmd, jids, timeout):
    """
    This replaces the original qsub -sync y -hold_jid j1,j2... command,
    which can still hang if certain jobs get stuck.

    If a timeout occurs, simply qdel all jids (ignoring whether they
    exist or not) and let the calling function decide what to do.
    """
    def get_active_jids():
        """Yield the job id of every active SGE job reported by qstat."""
        stuff = os.popen("qstat").read().strip().split('\n')
        # the first two lines of qstat output are headers
        for x in stuff[2:]:
            job_id = x.split()[0]
            yield job_id

    p = Process(target=wait_for_sge_jobs_worker, args=(cmd,))
    p.start()
    p.join(timeout)
    if p.is_alive():  # timed out
        # restrict to the tracked jids so unrelated jobs are left alone
        active_jids = [x for x in get_active_jids() if x in jids]
        while len(active_jids) > 0:
            for jid in active_jids:
                kill_cmd = "qdel " + str(jid)
                backticks(kill_cmd)  # don't care whether it worked
            time.sleep(3)  # wait 3 sec for qdel to take effect....
            # make sure qdel really worked
            active_jids = [x for x in get_active_jids() if x in jids]
        return "TIMEOUT"
    return "SUCCESS"
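# A minimal usage sketch for wait_for_sge_jobs; the qsub command and job
# ids below are hypothetical, not from the source. The cmd is the
# blocking qsub (-sync y) wrapped in a worker process so it can be
# abandoned, and jids are the held jobs to qdel on timeout.
status = wait_for_sge_jobs(
    cmd="qsub -sync y -hold_jid 596028,596029 done.sh",
    jids=["596028", "596029"],
    timeout=3600)  # seconds, passed straight to Process.join
if status == "TIMEOUT":
    # jobs were qdel'ed; the caller decides whether to resubmit or run locally
    pass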
def test_newUuid_random_cli(self):
    fn_orig = data.getXml(8)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fn = os.path.join(outdir, 'fn.alignmentset.xml')
    fn2 = os.path.join(outdir, 'fn2.alignmentset.xml')
    with AlignmentSet(fn_orig) as aln:
        aln.copyTo(fn)
        shutil.copy(fn, fn2)
    pre_uuid = AlignmentSet(fn).uuid
    pre_uuid2 = AlignmentSet(fn2).uuid
    self.assertEqual(pre_uuid, pre_uuid2)

    cmd = "dataset newuuid --random {d}".format(d=fn)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn))

    cmd = "dataset newuuid --random {d}".format(d=fn2)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn2))

    post_uuid = AlignmentSet(fn).uuid
    post_uuid2 = AlignmentSet(fn2).uuid
    self.assertNotEqual(pre_uuid, post_uuid)
    self.assertNotEqual(pre_uuid2, post_uuid2)
    # RANDOM, THEREFORE THESE ARE NOT EQUAL:
    self.assertNotEqual(post_uuid, post_uuid2)
def test_contigset_consolidate_int_names(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)

    double = 'B.cereus.1'
    exp_double = rs1.get_contig(double)

    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord('5141', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord('5142', exp_double.sequence)

    exp_double_seqs = [exp_double.sequence, exp_double.sequence]
    exp_names = ['5141', '5142']

    obs_file = ContigSet(outFas1, outFas2)
    log.debug(obs_file.toExternalFiles())
    obs_file.consolidate()
    log.debug(obs_file.toExternalFiles())

    # open obs and compare to exp
    for name, seq in zip(exp_names, exp_double_seqs):
        self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
def test_create_cli(self):
    log.debug("Absolute")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    cmd = "dataset create --type AlignmentSet {o} {i1} {i2}".format(
        o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
        i1=data.getXml(8), i2=data.getXml(11))
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(
        os.path.join(outdir, os.path.basename(data.getXml(12)))))

    log.debug("Relative")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    cmd = ("dataset create --relative --type AlignmentSet "
           "{o} {i1} {i2}".format(
               o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
               i1=data.getXml(8), i2=data.getXml(11)))
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(
        os.path.join(outdir, os.path.basename(data.getXml(12)))))
def test_contigset_consolidate(self):
    # build set to merge
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')
    outFas1 = os.path.join(outdir, 'tempfile1.fasta')
    outFas2 = os.path.join(outdir, 'tempfile2.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)

    singletons = ['A.baumannii.1', 'A.odontolyticus.1']
    double = 'B.cereus.1'
    reader = rs1.resourceReaders()[0]
    exp_double = rs1.get_contig(double)
    exp_singles = [rs1.get_contig(name) for name in singletons]

    # todo: modify the names first:
    with FastaWriter(outFas1) as writer:
        writer.writeRecord(exp_singles[0])
        writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
    with FastaWriter(outFas2) as writer:
        writer.writeRecord(exp_double.name + '_0_10',
                           exp_double.sequence + 'ATCGATCGATCG')
        writer.writeRecord(exp_singles[1])

    exp_double_seq = ''.join([exp_double.sequence, 'ATCGATCGATCG',
                              exp_double.sequence])
    exp_single_seqs = [rec.sequence for rec in exp_singles]

    acc_file = ContigSet(outFas1, outFas2)
    acc_file.induceIndices()
    log.debug(acc_file.toExternalFiles())
    self.assertEqual(len(acc_file), 4)
    self.assertEqual(len(list(acc_file)), 4)
    acc_file.consolidate()
    log.debug(acc_file.toExternalFiles())

    # open acc and compare to exp
    for name, seq in zip(singletons, exp_single_seqs):
        self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
    self.assertEqual(acc_file.get_contig(double).sequence[:],
                     exp_double_seq)

    self.assertEqual(len(acc_file._openReaders), 1)
    self.assertEqual(len(acc_file.index), 3)
    self.assertEqual(len(acc_file._indexMap), 3)
    self.assertEqual(len(acc_file), 3)
    self.assertEqual(len(list(acc_file)), 3)

    # test merge:
    acc1 = ContigSet(outFas1)
    acc2 = ContigSet(outFas2)
    acc3 = acc1 + acc2
def test_updateCounts_without_pbi(self):
    log.info("Testing updateCounts without pbi")
    data_fname = data.getBam(0)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    tempout = os.path.join(outdir, os.path.basename(data_fname))
    backticks('cp {i} {o}'.format(i=data_fname, o=tempout))
    aln = AlignmentSet(tempout, strict=False)
    self.assertEqual(aln.totalLength, -1)
    self.assertEqual(aln.numRecords, -1)
def qsub_job_runner(cmds_list, sh_file_format, done_script, sge_opts,
                    qsub_retry=3, run_local_if_qsub_fail=True):
    """
    cmds_list ------ list of commands to run (each in a separate file)
    sh_file_format - ex: test_script.{i}.sh

    ToDo:
    (1) add in ways to gracefully fail if SGE submits fail -- resubmit? wait? run local?
    (2) add in ways to monitor if certain qsub jobs died or hung -- resubmit? kill? run local?
    """
    jids = []
    for i, cmd in enumerate(cmds_list):
        f = open(sh_file_format.format(i=i), 'w')
        f.write("#!/bin/bash\n")
        f.write(cmd + '\n')
        f.close()
        # hard-coded to 4 CPUs because hard-coded in daligner!
        qsub_cmd = "qsub"
        if sge_opts.queue_name is not None:
            qsub_cmd += " -q " + sge_opts.queue_name
        qsub_cmd += (" -cwd -V -S /bin/bash -pe {env} 4"
                     " -e {out}.elog -o {out}.olog {out}".format(
                         env=sge_opts.sge_env_name, out=f.name))
        try_times = 1
        while try_times <= qsub_retry:
            _out, _code, _msg = backticks(qsub_cmd)
            if _code == 0:  # succeeded, break
                break
            else:
                # failed, sleep for a little, try again
                time.sleep(10)
                try_times += 1
        if try_times > qsub_retry:
            if run_local_if_qsub_fail:
                raise NotImplementedError("Not yet implemented to not use SGE!")
            else:
                raise RuntimeError("Unable to qsub. Abort!: " + qsub_cmd)
        # ex: Your job 596028 ("a.sh") has been submitted
        jids.append(str(_out).split()[2])

    # use a qsub job to wait for the commands to finish
    # ToDo: this is NOT bullet proof! watch for cases where the job may
    # have died or been killed or hung
    wait_cmd = "qsub"
    if sge_opts.queue_name is not None:
        wait_cmd += " -q " + sge_opts.queue_name
    wait_cmd += (" -sync y -pe {env} 1 -cwd -S /bin/bash -V"
                 " -e /dev/null -o /dev/null -hold_jid {jids} {done}".format(
                     env=sge_opts.sge_env_name, jids=",".join(jids),
                     done=done_script))
    _out, _code, _msg = backticks(wait_cmd)
    if _code != 0:
        # the waiter job failed; fall back to polling qstat manually,
        # refreshing the active job list on every pass
        while True:
            active_jids = [x.split()[0] for x in
                           os.popen("qstat").read().strip().split('\n')[2:]]
            if any(x in jids for x in active_jids):
                # some jobs are still running
                time.sleep(10)
            else:
                break
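# A minimal usage sketch for qsub_job_runner; the _SgeOpts stand-in,
# commands, and file names are hypothetical. The real SGE options
# object only needs to expose queue_name and sge_env_name.
class _SgeOpts(object):
    queue_name = None      # or e.g. "production" to pass -q
    sge_env_name = "smp"   # parallel environment name passed to -pe

cmds = ["daligner block.1 ref.db", "daligner block.2 ref.db"]
qsub_job_runner(cmds, sh_file_format="test_script.{i}.sh",
                done_script="touch_done.sh", sge_opts=_SgeOpts())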
def createPickles(self):
    """For each file in fasta_filenames, call ice_partial.py to build
    clusters and to save results to a pickle file. When all pickles
    are done, union all pickles.
    """
    self.add_log("Mapping non-full-length reads to consensus isoforms.")
    self.add_log("Creating pickles...", level=logging.INFO)

    for idx, fa in enumerate(self.fasta_filenames):
        # for each split non-full-length reads fasta file, build
        # partial_uc.pickle
        self.add_log("Creating a pickle for {f}".format(f=fa))
        cmd = "ice_partial.py {i} ".format(i=fa) + \
              "{r} ".format(r=self.ref_fasta) + \
              "{o} ".format(o=self.pickle_filenames[idx]) + \
              "--blasr_nproc={n} ".format(n=self.sge_opts.blasr_nproc) + \
              "--done={d} ".format(d=self.done_filenames[idx])
        if self.ccs_fofn is not None:
            cmd += "--ccs_fofn={f} ".format(f=self.ccs_fofn)
        if self.sa_file is not None:
            cmd += "--sa={sa} ".format(sa=self.sa_file)

        self.add_log("Writing command to script {fsh}".format(
            fsh=self.script_filenames[idx]))
        with open(self.script_filenames[idx], 'w') as fsh:
            fsh.write(cmd + "\n")

        # determine elog & olog
        partial_log_fn = op.join(self.log_dir,
                                 'IcePartial.{idx}'.format(idx=idx))
        elog = partial_log_fn + ".elog"
        olog = partial_log_fn + ".olog"
        jid = "ice_partial_" + op.basename(fa)
        qsub_cmd = "qsub " + \
                   "-pe smp {n} ".format(n=self.sge_opts.blasr_nproc) + \
                   "-cwd -S /bin/bash -V " + \
                   "-e {elog} ".format(elog=elog) + \
                   "-o {olog} ".format(olog=olog) + \
                   "-N {jid} ".format(jid=jid) + \
                   "{sh}".format(sh=self.script_filenames[idx])

        if self.sge_opts.use_sge is True:
            self.add_log("Submitting CMD: {qcmd}".format(qcmd=qsub_cmd))
            _out, _code, _msg = backticks(qsub_cmd)
        #elif self.sge_opts.useSMRTPortal is True:
        #    pass
        else:
            cmd += " 1>{olog} 2>{elog}".format(olog=olog, elog=elog)
            self.add_log("Submitting CMD: {cmd}".format(cmd=cmd))
            _out, _code, _msg = backticks(cmd)
            if _code != 0:
                raise RuntimeError("CMD failed: {cmd}\n{msg}\n".format(
                    cmd=cmd, msg=str(_msg)))
def _test_daligner_against_ref(self, test_name, use_sge, sge_opts,
                               prob_model_from="fake"):
    """Test daligner_against_ref with and without using sge."""
    copy_dir = op.join(self.dataDir, "test_daligner_against_ref")
    output_dir = op.join(self.outDir, test_name)
    mknewdir(output_dir)
    qname, tname = "test_daligner_query.fasta", "test_daligner_target.fasta"
    query_filename = op.join(output_dir, qname)
    target_filename = op.join(output_dir, tname)

    prob_model = None
    if prob_model_from == "fake":
        prob_model = ProbFromModel(0.01, 0.07, 0.06)
    elif prob_model_from == "fastq":
        fastq_fn = op.join(copy_dir, "test_daligner_reads.fastq")
        prob_model = ProbFromFastq(fastq_fn)
    else:
        self.assertTrue(False)

    qver_get_func = prob_model.get_smoothed
    qvmean_get_func = prob_model.get_mean

    dummy_o, c, dummy_m = backticks(
        "cp %s %s" % (op.join(copy_dir, qname), query_filename))
    self.assertTrue(c == 0)
    dummy_o, c, dummy_m = backticks(
        "cp %s %s" % (op.join(copy_dir, tname), target_filename))
    self.assertTrue(c == 0)

    old_dir = os.getcwd()
    os.chdir(output_dir)
    runner = DalignerRunner(query_filename=query_filename,
                            target_filename=target_filename,
                            is_FL=True, same_strand_only=True,
                            use_sge=use_sge, sge_opts=sge_opts)
    runner.run(output_dir=op.join(self.outDir, test_name))

    hits = []
    for la4ice_filename in runner.la4ice_filenames:
        hits.extend(daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=True, sID_starts_with_c=False,
            qver_get_func=qver_get_func,
            qvmean_get_func=qvmean_get_func))

    # Num of hits may change when daligner or parameters change.
    # (assertEqual, not assertTrue, is what was intended here)
    self.assertEqual(len(hits), 706)
    self.assertEqual(str(hits[0]),
                     "m54007_160109_025449/27984844/29_646_CCS/0_617 "
                     "aligns to m54007_160109_025449/28836279/631_54_CCS")
    # restore the working directory saved above
    os.chdir(old_dir)
def _check_constools():
    if not BamtoolsVersion().good:
        log.warn("Bamtools not found or out of date")
        return False

    # each tool prints usage and exits 1 when invoked with no arguments,
    # so an exit code of 1 signals the tool is present on the PATH
    cmd = "pbindex"
    o, r, m = backticks(cmd)
    if r != 1:
        return False

    cmd = "samtools"
    o, r, m = backticks(cmd)
    if r != 1:
        return False
    return True
def test_alignmentset_partial_consolidate(self):
    testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/"
                "lambda/2372215/0007_tiny/Alignment_"
                "Results/m150404_101626_42267_c10080"
                "7920800000001823174110291514_s1_p0."
                "all.alignmentset.xml")
    aln = AlignmentSet(testFile)
    nonCons = AlignmentSet(testFile)
    self.assertEqual(len(aln.toExternalFiles()), 3)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'merged.bam')
    aln.consolidate(outfn, numFiles=2)
    self.assertFalse(os.path.exists(outfn))
    self.assertTrue(os.path.exists(_infixFname(outfn, "0")))
    self.assertTrue(os.path.exists(_infixFname(outfn, "1")))
    self.assertEqual(len(aln.toExternalFiles()), 2)
    self.assertEqual(len(nonCons.toExternalFiles()), 3)
    for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
        self.assertEqual(read1, read2)
    self.assertEqual(len(aln), len(nonCons))

    log.debug("Test cli")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    datafile = os.path.join(outdir, "merged.bam")
    xmlfile = os.path.join(outdir, "merged.xml")
    cmd = "dataset consolidate --numFiles 2 {i} {d} {x}".format(
        i=testFile, d=datafile, x=xmlfile)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
def _startPhmmers(self, chunked_reads_fns, chunked_dom_fns,
                  out_dom_fn, primer_fn, pbmatrix_fn):
    """Run phmmers on chunked reads files in 'chunked_reads_fns' and
    generate chunked dom files as listed in 'chunked_dom_fns', finally
    concatenate dom files to 'out_dom_fn'."""
    logging.info("Start to launch phmmer on chunked reads.")
    jobs = []
    for reads_fn, domFN in zip(chunked_reads_fns, chunked_dom_fns):
        p = multiprocessing.Process(
            target=self._phmmer,
            args=(reads_fn, domFN, primer_fn, pbmatrix_fn))
        jobs.append((p, domFN))
        p.start()

    for p, domFN in jobs:
        p.join()
        cmd = "cat {0} >> {1}".format(real_upath(domFN),
                                      real_upath(out_dom_fn))
        _output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException(
                "Error concatenating dom files: {e}".format(e=str(errMsg)))

    self._cleanup(chunked_reads_fns)
    self._cleanup(chunked_dom_fns)
def _releaseLock(self, dbLock):
    """Release dbLock."""
    _o, errCode, _m = backticks("rm -f {dbLock}".format(dbLock=dbLock))
    if errCode == 0:
        logging.debug(self.name + ": Release the lock for DB creation.")
    else:
        raise RuntimeError(self.name + ": Failed to release lock " +
                           dbLock + ". Please delete the lock manually.")
def test_exit_code_0(self):
    bam = self.getAlignmentSet()
    var_rpt = os.path.join(DATA, 'variants_report.json')
    mapping_rpt = os.path.join(DATA, 'mapping_stats_report.json')
    cmd = 'python -m pbreports.report.sat {o} {r} {c} {a} {v}'.format(
        o=self._output_dir, r='rpt.json', c=bam, a=var_rpt, v=mapping_rpt)
    o, c, m = backticks(cmd)
    log.info(cmd)
    if c != 0:
        log.error(m)
        log.error(o)
        print(m)
    self.assertEquals(0, c)
    rpt_file = os.path.join(self._output_dir, 'rpt.json')
    rpt = load_report_from_json(rpt_file)
    self.assertEqual('sidney', rpt.get_attribute_by_id('instrument').value)
    self.assertEqual(1, rpt.get_attribute_by_id('coverage').value)
    self.assertEqual(1, rpt.get_attribute_by_id('concordance').value)
    self.assertEqual(7752, rpt.get_attribute_by_id(
        'mapped_readlength_mean').value)
    self.assertEqual(48, rpt.get_attribute_by_id('reads_in_cell').value)
    out = StringIO()
    self.assertTrue(summarize_report(rpt_file, out=out))
def isExist(ff):
    """Return whether a file or a dir ff exists or not.
    Call ls instead of python os.path.exists to eliminate NFS errors.
    """
    cmd = "ls %s" % ff
    _output, errCode, _errMsg = backticks(cmd)
    return errCode == 0
def blasr_sam_for_quiver(input_fasta, ref_fasta, out_sam_filename,
                         run_cmd=True, blasr_nproc=12):
    """
    input_fasta --- should be in.raw.fa
    ref_fasta   --- reference fasta (ex: g_consensus.fa) to align to

    Run blasr with -clipping soft to get a sam for quiver.
    """
    # TODO: review code
    cmd = "blasr {i} ".format(i=input_fasta) + \
          "{r} ".format(r=ref_fasta) + \
          "-nproc {n} ".format(n=blasr_nproc) + \
          "-bestn 5 -nCandidates 10 -sam -clipping soft " + \
          "-out {o}".format(o=out_sam_filename)
    logging.debug("CMD: " + cmd)
    if run_cmd:
        _out, _code, _msg = backticks(cmd)
        if _code != 0:
            raise RuntimeError("CMD failed: {cmd}\n{e}".format(
                cmd=cmd, e=_msg))
    return cmd
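# A minimal usage sketch (file names are hypothetical): build the blasr
# command without executing it, e.g. to submit it through SGE instead
# of running locally.
cmd = blasr_sam_for_quiver(input_fasta="in.raw.fa",
                           ref_fasta="g_consensus.fa",
                           out_sam_filename="out.sam",
                           run_cmd=False,  # just return the command string
                           blasr_nproc=12)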
def _bt2BuildIndex(self, tempDir, referenceFile):
    """Build bt2 index files.
    Input:
        tempDir      : a temporary directory for saving bowtie2 index files.
        referenceFile: the reference sequence file.
    Output:
        list of strings, bowtie2 index files.
    """
    refBaseName = bt2BaseName(tempDir, referenceFile)
    cmdStr = "bowtie2-build -q -f {0} {1}".format(referenceFile,
                                                  refBaseName)
    logging.info(self.name + ": Build bowtie2 index files.")
    logging.debug(self.name + ": Call {0}".format(cmdStr))
    _output, errCode, errMsg = backticks(cmdStr)
    if errCode != 0:
        logging.error(self.name + ": Failed to build bowtie2 " +
                      "index files.\n" + errMsg)
        raise RuntimeError(errMsg)
    return bt2IndexFiles(refBaseName)
def test_integration(self):
    exe = "barcode_report"
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
    json_report_file_name = temp_file.name
    temp_file.close()
    ccs = " --ccs " if self.ccs else ""
    cmd = "{e} --debug {ccs} {b} {ba} {r}".format(
        e=exe, b=self.bas_h5_fofn, ba=self.barcode_h5_fofn,
        r=json_report_file_name, ccs=ccs)
    log.info("Running cmd {c}".format(c=cmd))
    output, rcode, emsg = backticks(cmd)
    if rcode != 0:
        log.error(output)
        log.error(emsg)
    self.assertEqual(0, rcode)

    with open(json_report_file_name, 'r') as f:
        s = json.load(f)
        self.assertIsNotNone(s)
        log.info(pformat(s))

    # cleanup
    os.remove(json_report_file_name)
def test_missing_fai_error_message(self):
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    inFas = os.path.join(outdir, 'infile.fasta')

    # copy fasta reference to hide fai and ensure FastaReader is used
    backticks('cp {i} {o}'.format(
        i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
        o=inFas))
    rs1 = ContigSet(inFas)
    with self.assertRaises(IOError) as cm:
        rs1.assertIndexed()
    self.assertEqual(
        str(cm.exception),
        ("Companion FASTA index (.fai) file not found or malformatted! "
         "Use 'samtools faidx' to generate FASTA index."))
def test_exit_code_0_referenceset(self):
    """
    Like a cram test. Assert exits with 0 with ReferenceSet XML
    """
    ref = os.path.join(self._data_dir, 'references', 'lambda', 'sequence',
                       'lambda.fasta')
    ref_name = os.path.join(self._output_dir, "refset.xml")
    refset = ReferenceSet(ref)
    refset.write(ref_name)
    ref = ref_name
    gff = os.path.join(self._data_dir, 'alignment_summary.lambda.gff')
    r = 'rpt.json'
    cmd = 'python -m pbreports.report.coverage {o} {r} {c} {g}'.format(
        o=self._output_dir, r=r, c=ref, g=gff)

    log.info(cmd)
    o, c, m = backticks(cmd)
    if c != 0:
        log.error(m)
        log.error(o)
        sys.stderr.write(str(m) + "\n")
    self.assertEquals(0, c)
    self.assertTrue(os.path.exists(os.path.join(self._output_dir, r)))
def test_loadmetadata_from_dataset_create_cli(self):
    fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    log.debug(fn)

    aln = AlignmentSet(data.getXml(8))
    aln.metadata.collections = None
    aln.copyTo(fn)
    aln.close()
    del aln
    self.assertTrue(os.path.exists(fn))

    aln = AlignmentSet(fn)
    self.assertFalse(aln.metadata.collections)

    cmd = "dataset create --metadata {m} {o} {i}".format(
        o=fn2, i=fn,
        m=("/pbi/dept/secondary/siv/testdata/"
           "SA3-Sequel/lambda/roche_SAT/"
           "m54013_151205_032353.subreadset.xml"))
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0, m)

    aln = AlignmentSet(fn2)
    self.assertTrue(aln.metadata.collections)
def trigger_nfs_refresh(ff):
    """
    Central place for all NFS hackery

    Return whether a file or a dir ff exists or not.
    Call ls instead of python os.path.exists to eliminate NFS errors.

    Added try/catch black hole exception cases to help trigger an NFS refresh

    :rtype bool:
    """
    # try to trigger refresh for File case
    try:
        f = open(ff, 'r')
        f.close()
    except Exception:
        pass

    # try to trigger refresh for Directory case
    try:
        _ = os.stat(ff)
        _ = os.listdir(ff)
    except Exception:
        pass

    # Call externally
    # this is taken from Yuan
    cmd = "ls %s" % ff
    _, rcode, _ = backticks(cmd)

    return rcode == 0
def test_exit_code_0(self):
    """
    Like a cram test. Assert exits with 0, even though region size is 0
    See bug 25079
    """
    from pbcore.util.Process import backticks
    import tempfile

    ref = pbcore.data.getLambdaFasta()
    tiny_reads = pbcore.data.getBamAndCmpH5()[0]
    out = os.path.join(tempfile.mkdtemp(suffix="summ_cov"), 'gff')

    cmd = ('summarize_coverage --region_size=0 --num_regions=500 '
           '{a} {r} {g}'.format(a=tiny_reads, r=ref, g=out))
    o, c, m = backticks(cmd)
    log.info(cmd)
    if c != 0:
        log.error(m)
        log.error(o)
        print(m)
    self.assertEquals(0, c)
    self.assertTrue(os.path.exists(out))
def wait_for_sge_jobs_worker(cmd):
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsg = "Failed to qsub CMD: {cmd}, {msg}.".format(cmd=cmd,
                                                            msg=_msg)
        raise RuntimeError(errMsg)
    # ex: Your job 596028 ("a.sh") has been submitted
    return str(_out).split()[2]
def _checkPhmmer(self):
    """Check that phmmer can be called successfully."""
    logging.info("Checking for phmmer existence.")
    _output, errCode, errMsg = backticks("phmmer -h > /dev/null")
    if errCode != 0:
        raise ClassifierException("Unable to invoke phmmer.\n{e}".format(
            e=errMsg))
def _check_constools():
    # each tool prints usage and exits 1 when invoked with no arguments,
    # so an exit code of 1 signals the tool is present on the PATH
    cmd = "pbindex"
    o, r, m = backticks(cmd)
    if r != 1:
        return False

    cmd = "samtools"
    o, r, m = backticks(cmd)
    if r != 1:
        return False

    cmd = "pbmerge"
    o, r, m = backticks(cmd)
    if r != 1:
        return False
    return True
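# A minimal usage sketch (the test class is hypothetical): gate
# consolidation tests on the external tools detected by _check_constools,
# using the standard unittest.skipIf pattern.
import unittest

@unittest.skipIf(not _check_constools(),
                 "pbindex, samtools or pbmerge not found, skipping")
class TestConsolidateTools(unittest.TestCase):
    def test_tools_present(self):
        # placeholder; real tests would exercise consolidate()
        self.assertTrue(_check_constools())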
def _pbindexBam(fname):
    cmd = "pbindex {i}".format(i=fname)
    log.info(cmd)
    o, r, m = backticks(cmd)
    if r != 0:
        raise RuntimeError(m)
    return fname + ".pbi"
def test_alignmentset_partial_consolidate(self):
    testFile = ("/mnt/secondary-siv/testdata/SA3-DS/"
                "lambda/2372215/0007_tiny/Alignment_"
                "Results/m150404_101626_42267_c10080"
                "7920800000001823174110291514_s1_p0."
                "all.alignmentset.xml")
    aln = AlignmentSet(testFile)
    nonCons = AlignmentSet(testFile)
    self.assertEqual(len(aln.toExternalFiles()), 3)
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outfn = os.path.join(outdir, 'merged.bam')
    aln.consolidate(outfn, numFiles=2)
    self.assertFalse(os.path.exists(outfn))
    self.assertTrue(os.path.exists(_infixFname(outfn, "0")))
    self.assertTrue(os.path.exists(_infixFname(outfn, "1")))
    self.assertEqual(len(aln.toExternalFiles()), 2)
    self.assertEqual(len(nonCons.toExternalFiles()), 3)
    for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
        self.assertEqual(read1, read2)
    self.assertEqual(len(aln), len(nonCons))

    log.debug("Test cli")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    datafile = os.path.join(outdir, "merged.bam")
    xmlfile = os.path.join(outdir, "merged.xml")
    cmd = "dataset.py consolidate --numFiles 2 {i} {d} {x}".format(
        i=testFile, d=datafile, x=xmlfile)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
def _cpFile(inFname, outFname):
    cmd = "cp {i} {o}".format(i=inFname, o=outFname)
    log.info(cmd)
    o, r, m = backticks(cmd)
    if r != 0:
        raise RuntimeError(m)
def test_exit_code_0(self):
    bam = self.getAlignmentSet()
    var_rpt = os.path.join(DATA, 'variants_report.json')
    mapping_rpt = os.path.join(DATA, 'mapping_stats_report.json')
    cmd = 'python -m pbreports.report.sat {o} {r} {c} {a} {v}'.format(
        o=self._output_dir, r='rpt.json', c=bam, a=var_rpt, v=mapping_rpt)
    o, c, m = backticks(cmd)
    log.info(cmd)
    if c != 0:
        log.error(m)
        log.error(o)
        print(m)
    self.assertEquals(0, c)
    rpt_file = os.path.join(self._output_dir, 'rpt.json')
    rpt = load_report_from_json(rpt_file)
    self.assertEqual('sidney', rpt.get_attribute_by_id('instrument').value)
    self.assertEqual(1, rpt.get_attribute_by_id('coverage').value)
    self.assertEqual(1, rpt.get_attribute_by_id('accuracy').value)
    self.assertEqual(1328, rpt.get_attribute_by_id(
        'mapped_readlength_mean').value)
    self.assertEqual(48, rpt.get_attribute_by_id('reads_in_cell').value)
    out = StringIO()
    self.assertTrue(summarize_report(rpt_file, out=out))
def local_job_runner(cmds_list, num_threads, throw_error=True):
    """
    Execute a list of cmds locally using a thread pool with at most
    num_threads threads; wait for all jobs to finish before exit.

    If throw_error is True, raise RuntimeError when any job fails.
    If throw_error is False, return a list of cmds that failed.

    Parameters:
      cmds_list   - cmds that will be executed in the ThreadPool
      num_threads - number of threads that will be used in the ThreadPool
      throw_error - whether or not to throw RuntimeError when any cmd fails
    """
    run_cmd_in_shell = lambda x: backticks(x, merge_stderr=True)
    try:
        pool = ThreadPool(processes=num_threads)
        rets = pool.map(run_cmd_in_shell, cmds_list)
        pool.close()
        pool.join()
    except subprocess.CalledProcessError:
        pass

    # backticks returns (output, errCode, errMsg); a non-zero errCode
    # marks a failed command
    failed_cmds = [cmds_list[i] for i in range(0, len(cmds_list))
                   if rets[i][1] != 0]
    failed_cmds_out = [rets[i][0] for i in range(0, len(cmds_list))
                       if rets[i][1] != 0]

    if throw_error and len(failed_cmds) > 0:
        errmsg = "\n".join(["CMD failed: %s, %s" % (cmd, out)
                            for (cmd, out) in zip(failed_cmds,
                                                  failed_cmds_out)])
        raise RuntimeError(errmsg)
    else:
        return failed_cmds
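# A minimal usage sketch (the command strings are hypothetical): run a
# few shell commands in parallel and collect the failures instead of
# raising.
failed = local_job_runner(cmds_list=["echo job1", "echo job2", "false"],
                          num_threads=2,
                          throw_error=False)
# "false" exits non-zero, so it comes back in the failed list
assert failed == ["false"]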
def run_cmd_and_log(self, cmd, olog="", elog="", description=""):
    """Run the given command locally and write to log; raise a
    RuntimeError if the job fails to finish.
    olog: output log
    elog: error log

    The error message to display should look like:
        CMD exited with a non-zero code: {cmd}, {msg}
        {description}
        Error log: {elog}
    """
    #msg = "Running CMD: {cmd}".format(cmd=cmd)
    #self.add_log(msg)
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsgs = ["CMD exited with a non-zero code: {cmd}, {msg}".format(
            cmd=cmd, msg=_msg)]
        if len(description) != 0:
            errMsgs.append("{description}".format(description=description))
        if len(elog) != 0:
            errMsgs.append("Error log: {elog}".format(elog=elog))
        if len(olog) != 0:
            errMsgs.append("Out log: {olog}".format(olog=olog))
        errMsg = "\n".join(errMsgs)
        self.add_log(errMsg, level=logging.ERROR)
        raise RuntimeError(errMsg)
def _nfs_exists_check(ff):
    """
    Central place for all NFS hackery

    Return whether a file or a dir ff exists or not.
    Call ls instead of python os.path.exists to eliminate NFS errors.

    Added try/catch black hole exception cases to help trigger an NFS refresh

    :rtype bool:
    """
    # try to trigger refresh for File case
    try:
        f = open(ff, 'r')
        f.close()
    except Exception:
        pass

    # try to trigger refresh for Directory case
    try:
        _ = os.stat(ff)
        _ = os.listdir(ff)
    except Exception:
        pass

    # Call externally
    # this is taken from Yuan
    cmd = "ls %s" % ff
    _, rcode, _ = backticks(cmd)

    return rcode == 0
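# A minimal usage sketch (the path is hypothetical): poll until an
# output file written by another host becomes visible over NFS; each
# check also nudges the NFS client cache.
out_fn = "/net/shared/run/output.done"
while not _nfs_exists_check(out_fn):
    time.sleep(5)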
def run_cmd_and_log(self, cmd, olog="", elog="", description=""):
    """Run the given command locally and write to log; raise a
    RuntimeError if the job fails to finish.
    olog: output log
    elog: error log

    The error message to display should look like:
        CMD exited with a non-zero code: {cmd}, {msg}
        {description}
        Error log: {elog}
    """
    msg = "Running CMD: {cmd}".format(cmd=cmd)
    self.add_log(msg)
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        errMsgs = ["CMD exited with a non-zero code: {cmd}, {msg}".format(
            cmd=cmd, msg=_msg)]
        if len(description) != 0:
            errMsgs.append("{description}".format(description=description))
        if len(elog) != 0:
            errMsgs.append("Error log: {elog}".format(elog=elog))
        if len(olog) != 0:
            errMsgs.append("Out log: {olog}".format(olog=olog))
        errMsg = "\n".join(errMsgs)
        self.add_log(errMsg, level=logging.ERROR)
        raise RuntimeError(errMsg)
def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
    """Generate a SAM, BAM or a CMP.H5 file.
    Input:
        inSam   : an input SAM/BAM file. (e.g. fileName.filteredSam)
        refFile : the reference file. (e.g. fileName.targetFileName)
        outFile : the output SAM/BAM or CMP.H5 file.
                  (i.e. fileName.outputFileName)
        readType: standard or cDNA or CCS (can be None if not specified)
    Output:
        output, errCode, errMsg
    """
    output, errCode, errMsg = "", 0, ""
    # initialized here so the final error message below never hits an
    # unbound name when a non-samtoh5 step fails
    prog = ""
    outFormat = getFileFormat(outFile)

    if outFormat == FILE_FORMATS.BAM:
        pass  # Nothing to be done
    elif outFormat == FILE_FORMATS.SAM:
        logging.info("OutputService: Generate the output SAM file.")
        logging.debug("OutputService: Move {src} as {dst}".format(
            src=inSam, dst=outFile))
        try:
            shutil.move(real_ppath(inSam), real_ppath(outFile))
        except shutil.Error as e:
            output, errCode, errMsg = "", 1, str(e)
    elif outFormat == FILE_FORMATS.CMP:
        # samtoh5 inSam refFile outFile -readType readType
        logging.info("OutputService: Generate the output CMP.H5 "
                     "file using samtoh5.")
        prog = "samtoh5"
        cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
            samFile=inSam, refFile=refFile, outFile=outFile)
        if readType is not None:
            cmd += " -readType {0} ".format(readType)
        if smrtTitle:
            cmd += " -smrtTitle "
        # Execute the command line
        logging.debug("OutputService: Call \"{0}\"".format(cmd))
        output, errCode, errMsg = backticks(cmd)
    elif outFormat == FILE_FORMATS.XML:
        logging.info("OutputService: Generating the output XML file.")
        # Create {out}.xml, given {out}.bam
        outBam = str(outFile[0:-3]) + "bam"
        # FIXME This should really be more automatic
        if self.args.readType == "CCS":
            self._output_dataset_type = ConsensusAlignmentSet
        aln = self._output_dataset_type(real_ppath(outBam))
        for res in aln.externalResources:
            res.reference = refFile
        aln.write(outFile)

    if errCode != 0:
        errMsg = prog + " returned a non-zero exit status. " + errMsg
        logging.error(errMsg)
        raise RuntimeError(errMsg)
    return output, errCode, errMsg
def bax2bam_path():
    """Return the path to bax2bam."""
    cmd = "which bax2bam"
    o, c, m = backticks(cmd)
    if c != 0:
        raise RuntimeError("could not find bax2bam")
    # backticks returns stdout as a list of lines; the first is the path
    return o[0]
def _nfs_exists_check(ff):
    """Return whether a file or a dir ff exists or not.
    Call ls instead of python os.path.exists to eliminate NFS errors.
    """
    # this is taken from Yuan
    cmd = "ls %s" % ff
    output, errCode, errMsg = backticks(cmd)
    return errCode == 0
def sanity_check_gcon():
    """Sanity check gcon."""
    cmd = gcon_py + " --help"
    _out, _code, _msg = backticks(cmd)
    if _code != 0:
        msg = gcon_py + " is not installed."
        raise RuntimeError(msg)
    return gcon_py
def _pbmergeXML(indset, outbam):
    cmd = "pbmerge -o {o} {i} ".format(i=indset, o=outbam)
    log.info(cmd)
    o, r, m = backticks(cmd)
    if r != 0:
        raise RuntimeError("Pbmerge command failed: {c}\n Message: "
                           "{m}".format(c=cmd, m=m))
    return outbam
def test_copyTo_cli(self):
    # To a fname:
    # absolute:
    fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    cmd = "dataset copyto {i} {o}".format(i=data.getXml(8), o=fn)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn))
    sset = AlignmentSet(fn, strict=True)
    self.assertFalse(_is_relative(fn))

    # relative:
    fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
    cmd = "dataset copyto --relative {i} {o}".format(i=data.getXml(8),
                                                     o=fn)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn))
    sset = AlignmentSet(fn, strict=True)
    self.assertTrue(_is_relative(fn))

    # To a directory:
    # absolute:
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fn = os.path.join(outdir, os.path.split(data.getXml(8))[1])
    cmd = "dataset copyto {i} {o}".format(i=data.getXml(8), o=outdir)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn))
    sset = AlignmentSet(fn, strict=True)
    self.assertFalse(_is_relative(fn))

    # relative:
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    fn = os.path.join(outdir, os.path.split(data.getXml(8))[1])
    cmd = "dataset copyto --relative {i} {o}".format(i=data.getXml(8),
                                                     o=outdir)
    log.debug(cmd)
    o, r, m = backticks(cmd)
    self.assertEqual(r, 0)
    self.assertTrue(os.path.exists(fn))
    sset = AlignmentSet(fn, strict=True)
    self.assertTrue(_is_relative(fn))
def numReads(self):
    """Return the number of reads in reads_fn."""
    cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn))
    output, errCode, errMsg = backticks(cmd)
    if errCode != 0:
        raise ClassifierException("Error reading file {r}: {e}".format(
            r=self.reads_fn, e=str(errMsg)))
    return int(output[0])
def _sortBam(fname):
    tmpname = _infixFname(fname, "_sorted")
    cmd = "bamtools sort -in {i} -out {o}".format(i=fname, o=tmpname)
    log.info(cmd)
    o, r, m = backticks(cmd)
    if r != 0:
        raise RuntimeError(m)
    shutil.move(tmpname, fname)
def isExist(ff):
    """Return whether a file or a dir ff exists or not.
    Call ls instead of python os.path.exists to eliminate NFS errors.
    """
    if ff is None:
        return False
    cmd = "ls %s" % real_upath(ff)
    _output, errCode, _errMsg = backticks(cmd)
    return errCode == 0
def _mergeBams(inFiles, outFile):
    if len(inFiles) > 1:
        cmd = "bamtools merge -in {i} -out {o}".format(
            i=' -in '.join(inFiles), o=outFile)
        log.info(cmd)
        o, r, m = backticks(cmd)
        if r != 0:
            raise RuntimeError(m)
    else:
        shutil.copy(inFiles[0], outFile)