def sanity_check_gcon2():
    """Sanity check gcon2."""
    cmd = gcon2_py + " --help"
    errmsg = gcon2_py + " is not installed."
    execute(cmd=cmd, errmsg=errmsg)
    return gcon2_py
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % tmp_dir)

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % quivered_dir)
def convert_fofn_to_fasta(fofn_filename, out_filename, fasta_out_dir,
                          force_overwrite=False):
    """
    For each .bax.h5 file, create a .bax.h5.fasta file and save paths to
    out_filename, which should usually be 'input.fasta.fofn'.

    Modified: 09/14/2015, both ends of subreads in fasta files will be
    trimmed in IceQuiver (trim_and_write_raw_file) instead of here.
    """
    logging.info("Converting fofn {fofn} to fasta.".format(fofn=fofn_filename))
    in_fns = get_files_from_file_or_fofn(fofn_filename)
    out_fns = []
    mkdir(fasta_out_dir)
    for in_fn in in_fns:
        logging.debug("converting h5 file: {f}.".format(f=in_fn))
        if not (in_fn.endswith('.bax.h5') or in_fn.endswith('.bas.h5')):
            raise ValueError("fofn file {fofn} ".format(fofn=fofn_filename) +
                             "should only contain bax/bas.h5 files.")

        # e.g. m111xxxx.1.bax.h5 ==> out_file = m111xxxx.1.bax.h5.fasta
        in_basename = op.basename(in_fn)
        out_file = op.join(fasta_out_dir, in_basename + '.fasta')
        if op.exists(out_file) and not force_overwrite:
            logging.debug("File {0} already exists, skipping.".format(out_file))
        else:
            cmd = "pls2fasta {in_fn} ".format(in_fn=real_upath(in_fn)) + \
                  "{out} ".format(out=real_upath(out_file)) + \
                  "-minSubreadLength 300 -minReadScore 750 -trimByRegion"
            execute(cmd=cmd)
        out_fns.append(out_file)
    write_files_to_fofn(out_fns, out_filename)
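
# Usage sketch (hypothetical paths; assumes pls2fasta is on $PATH and
# input.fofn lists .bax.h5/.bas.h5 files):
convert_fofn_to_fasta(fofn_filename="input.fofn",
                      out_filename="input.fasta.fofn",
                      fasta_out_dir="fasta_out")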
def sanity_check_gcon():
    """Sanity check gcon."""
    cmd = gcon_py + " --help"
    errmsg = gcon_py + " is not installed."
    execute(cmd=cmd, errmsg=errmsg)
    return gcon_py
def blasr_for_quiver(query_fn, ref_fasta, out_fn, bam=False,
                     run_cmd=True, blasr_nproc=12):
    """
    query_fn  --- should be in.raw.fasta|bam
    ref_fasta --- reference fasta (ex: g_consensus.fasta) to align to
    out_fn    --- sam|bam output aligning query_fn to ref_fasta

    blasr query_fn ref_fasta -out out_fn -sam -clipping soft
    blasr query_fn ref_fasta -out out_fn -bam
    """
    cmd = "blasr {i} ".format(i=real_upath(query_fn)) + \
          "{r} ".format(r=real_upath(ref_fasta)) + \
          "--nproc {n} ".format(n=blasr_nproc) + \
          "--bestn 5 --nCandidates 10 " + \
          ("--sam --clipping soft " if not bam else "--bam ") + \
          "--out {o} ".format(o=real_upath(out_fn)) + \
          "1>/dev/null 2>/dev/null"
    if run_cmd:
        execute(cmd)
    else:
        logging.debug("CMD: " + cmd)
    return cmd
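
# Usage sketch (hypothetical file names; assumes blasr is on $PATH). With
# run_cmd=False the command string is only logged and returned, which is
# useful when the caller submits it to a job scheduler instead:
cmd = blasr_for_quiver(query_fn="in.raw.bam",
                       ref_fasta="g_consensus.fasta",
                       out_fn="out.bam",
                       bam=True, run_cmd=False, blasr_nproc=8)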
def map_isoforms_to_reference_transcripts(self):
    """Map isoforms to reference transcripts."""
    m5out = self.output_analysis_fn + ".blasr.out.m5"
    cmd = 'blasr %s %s --bestn 1 -m 5 --out %s' % \
          (self.isoseq_output_fa, self.reference_transcripts_fn, m5out)
    execute(cmd)
    return [r for r in BLASRM5Reader(m5out)]
def resolved_tool_contract_runner(rtc):
    """
    For each cluster bin, clean up intermediate files under tmp.
    """
    p = ChunkTasksPickle.read(rtc.task.input_files[0])
    assert all([isinstance(task, ClusterChunkTask) for task in p])
    cluster_bin_indices = [task.cluster_bin_index for task in p]
    # sanity check that cluster bin indices are unique!
    assert len(set(cluster_bin_indices)) == len(cluster_bin_indices)

    sentinel_out = rtc.task.output_files[0]
    with open(sentinel_out, 'w') as writer:
        for task in p:
            icef = IceFiles(prog_name="ice_cleanup",
                            root_dir=task.cluster_out_dir)
            tmp_dir = icef.tmp_dir
            log.info("Cleaning up, removing %s", tmp_dir)
            writer.write("removing %s\n" % tmp_dir)
            execute("rm -rf %s" % real_upath(tmp_dir))

            quivered_dir = icef.quivered_dir
            log.info("Cleaning up, removing %s", quivered_dir)
            writer.write("removing %s\n" % quivered_dir)
            execute("rm -rf %s" % real_upath(quivered_dir))
def copy_in_fasta_to_out(in_dir, out_dir, filename):
    """Copy filename from in_dir (e.g., data) to out_dir, return out_fasta."""
    mknewdir(out_dir)
    cmd = "cp %s %s" % (op.join(in_dir, filename),
                        op.join(out_dir, filename))
    execute(cmd=cmd)
    return op.join(out_dir, filename)
def validate_with_Gencode(sorted_rep_sam, gencode_gtf, match_out):
    """
    Input:
        sorted_rep_sam -- sorted SAM output mapping (collapsed)
                          representative isoforms to reference
        gencode_gtf    -- Gencode annotation GTF file
        match_out      -- matchAnnot output file
    Run matchAnnot to compare sorted_rep_sam with gencode v25 and
    write output to match_out.
    """
    log.info("Writing matchAnnot output to %s", match_out)
    cmd = "matchAnnot.py --gtf={0} {1} > {2}".format(gencode_gtf,
                                                     sorted_rep_sam,
                                                     match_out)
    execute(cmd)
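
# Usage sketch (hypothetical paths; assumes matchAnnot.py is on $PATH and a
# Gencode GTF has been downloaded):
validate_with_Gencode(sorted_rep_sam="rep.sorted.sam",
                      gencode_gtf="gencode.v25.annotation.gtf",
                      match_out="matchAnnot.out")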
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to reference using gmap, generate a sam output, and sort it.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir}/${gmap_db_name}
    # and ls its *.iit and *meta files before running gmap.
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % real_upath(op.join(gmap_db_dir, gmap_db_name)),
                'ls *.iit *meta', 'sleep 3',
                'cd %s' % real_upath(cwd)]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap', '-D {d}'.format(d=real_upath(gmap_db_dir)),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0', '-z sense_force', '--cross-species',
                '-f samse',
                '--max-intronlength-ends 200000',  # for long genes
                real_upath(gmap_input_filename),
                '>', real_upath(unsorted_sam_filename),
                '2>{log}'.format(log=real_upath(log_filename))]

    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
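
# Usage sketch (hypothetical paths; assumes gmap is installed and a gmap
# database named "hg38" was built under /path/to/gmap_db):
map_isoforms_and_sort(input_filename="hq_isoforms.fastq",
                      sam_filename="hq_isoforms.fastq.sam",
                      gmap_db_dir="/path/to/gmap_db",
                      gmap_db_name="hg38",
                      gmap_nproc=12)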
def setUp(self):
    """Initialize."""
    self.inputDir = op.join(DATA_DIR, self.testName)
    self.outDir = op.join(OUT_DIR, self.testName)
    self.stdoutDir = op.join(STD_DIR, self.testName)
    self.fastaFileName = "test_DazzIDHandler.fasta"
    self.stdout_dazz_fasta = op.join(self.stdoutDir,
                                     self.fastaFileName[0:-6] + ".dazz.fasta")
    self.stdout_pickle = self.stdout_dazz_fasta + ".pickle"
    mknewdir(self.outDir)

    # Copy inputDir/test_DazzIDHandler.fasta to outDir.
    execute("cp %s %s" % (op.join(self.inputDir, self.fastaFileName),
                          op.join(self.outDir, self.fastaFileName)))
def sort_sam(in_sam, out_sam):
    """
    Sort input sam file and write to output sam file.
    """
    # Copy SAM headers
    copy_sam_header(in_sam=in_sam, out_sam=out_sam)

    # Call sort to sort gmap output sam file
    cmd_args = ['sort', '-k 3,3', '-k 4,4n', in_sam,
                '| grep -v \'^@\' ', ' >> ', out_sam]

    if os.stat(in_sam).st_size == 0:  # overwrite cmds if file is empty
        cmd_args = ['touch', out_sam]

    execute(' '.join(cmd_args))
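
# For a non-empty in_sam, the command composed above is equivalent to this
# shell pipeline (hypothetical file names):
#
#   sort -k 3,3 -k 4,4n in.sam | grep -v '^@' >> out.sam
#
# i.e., sort alignment lines by reference name (column 3, RNAME) and leftmost
# position (column 4, POS), drop any header lines, and append the sorted body
# to the header that copy_sam_header already wrote to out_sam.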
def _align_withBLASR(self, queryFa, targetFa, outFN, ice_opts, sge_opts):
    """Align input reads against itself using BLASR."""
    if op.exists(outFN):
        logging.info("{0} already exists. No need to run BLASR.".format(outFN))
    else:
        cmd = "blasr {q} ".format(q=real_upath(queryFa)) + \
              "{t} ".format(t=real_upath(targetFa)) + \
              "-m 5 --maxLCPLength 15 " + \
              "--nproc {cpu} ".format(cpu=sge_opts.blasr_nproc) + \
              "--maxScore {score} ".format(score=ice_opts.maxScore) + \
              "--bestn {n} --nCandidates {n} ".format(n=ice_opts.bestn) + \
              "--out {o} ".format(o=real_upath(outFN)) + \
              "1>/dev/null 2>/dev/null"
        logging.info("Calling {cmd}".format(cmd=cmd))
        execute(cmd)
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to reference using gmap, generate a sam output, and sort it.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    cmd_args = ['gmap', '-D {d}'.format(d=gmap_db_dir),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0', '-z sense_force', '--cross-species',
                '-f samse',
                gmap_input_filename,
                '>', unsorted_sam_filename,
                '2>{log}'.format(log=log_filename)]

    # Call gmap to map isoforms to reference and output sam.
    execute(' '.join(cmd_args))

    # Copy SAM headers
    copy_sam_header(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # Call sort to sort gmap output sam file
    cmd_args = ['sort', '-k 3,3', '-k 4,4n', unsorted_sam_filename,
                '| grep -v \'^@\'', '>>', sam_filename]
    execute(' '.join(cmd_args))

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
def map_isoforms_and_sort(input_filename, sam_filename,
                          gmap_db_dir, gmap_db_name, gmap_nproc):
    """
    Map isoforms to reference using gmap, generate a sam output, and sort it.
    Parameters:
        input_filename -- input isoforms. e.g., hq_isoforms.fasta|fastq|xml
        sam_filename -- output sam file, produced by gmap and sorted.
        gmap_db_dir -- gmap database directory
        gmap_db_name -- gmap database name
        gmap_nproc -- gmap nproc
    """
    unsorted_sam_filename = sam_filename + ".tmp"
    log_filename = sam_filename + ".log"

    gmap_input_filename = input_filename
    if input_filename.endswith('.xml'):
        # must consolidate dataset xml to FASTA/FASTQ
        w = ContigSetReaderWrapper(input_filename)
        gmap_input_filename = w.consolidate(out_prefix=sam_filename + '.input')
    if not op.exists(gmap_input_filename):
        raise IOError("Gmap input file %s does not exist" %
                      gmap_input_filename)

    # In order to prevent mount issues, cd to ${gmap_db_dir}/${gmap_db_name}
    # and ls its *.iit and *meta files before running gmap.
    cwd = realpath(os.getcwd())
    cmd_args = ['cd %s' % op.join(gmap_db_dir, gmap_db_name),
                'ls *.iit *meta', 'sleep 3',
                'cd %s' % cwd]
    execute(' && '.join(cmd_args))

    cmd_args = ['gmap', '-D {d}'.format(d=gmap_db_dir),
                '-d {name}'.format(name=gmap_db_name),
                '-t {nproc}'.format(nproc=gmap_nproc),
                '-n 0', '-z sense_force', '--cross-species',
                '-f samse',
                gmap_input_filename,
                '>', unsorted_sam_filename,
                '2>{log}'.format(log=log_filename)]

    # Call gmap to map isoforms to reference and output sam.
    try:
        execute(' '.join(cmd_args))
    except Exception:
        logging.debug("gmap failed, try again.")
        execute('sleep 3')
        execute(' '.join(cmd_args))

    # sort sam file
    sort_sam(in_sam=unsorted_sam_filename, out_sam=sam_filename)

    # remove intermediate unsorted sam file.
    rmpath(unsorted_sam_filename)
def test_concat_bam(self):
    """Test concat_bam, unaligned and aligned."""
    # cat aligned bam files with only one RG, one SN
    fns = [op.join(self.moreDir, "%d.bam" % i) for i in range(1, 5)]
    out_fn = op.join(self.outDir, "test_concat_bam_1.bam")
    from pbtranscript.ice.IceUtils import concat_bam
    concat_bam(fns, out_fn)
    self.assertTrue(op.exists(out_fn))

    # cat aligned bam files to a big bam
    fns = [op.join(self.moreDir, "aligned.%d.bam" % i) for i in range(1, 6)]
    out_fn = op.join(self.outDir, "test_concat_bam_2.bam")
    from pbtranscript.Utils import execute
    concat_bam(fns, out_fn)
    self.assertTrue(op.exists(out_fn))

    # convert big bam to sam and compare with std output
    out_sam = out_fn + ".sam"
    stdout_sam = op.join(self.sivStdoutDir, "test_concat_bam_2.sam")
    cmd = "samtools view -h %s -o %s" % (out_fn, out_sam)
    execute(cmd=cmd)
    self.cmp_sam(out_sam, stdout_sam)
def test_as_contigset(self):
    """Test as_contigset."""
    out_dir = op.join(OUT_DIR, 'test_Utils')
    mknewdir(out_dir)
    fa = op.join(out_dir, "empty.fasta")
    xml = op.join(out_dir, "empty.contigset.xml")
    fai = fa + ".fai"

    execute("touch %s" % fa)
    as_contigset(fa, xml)
    self.assertTrue(op.exists(xml))
    self.assertTrue(op.exists(fai))

    fn = 'reads_of_insert.fasta'
    shutil.copy(src=op.join(DATA_DIR, fn), dst=op.join(out_dir, fn))
    fa = op.join(out_dir, fn)
    as_contigset(fa, fa)

    fai = fa + ".fai"
    xml = op.join(out_dir, 'reads_of_insert.contigset.xml')
    as_contigset(fa, xml)
    self.assertTrue(op.exists(xml))
    self.assertTrue(op.exists(fai))
def test_concat_bam(self):
    """Test concat_bam, unaligned and aligned."""
    # cat aligned bam files with only one RG, one SN
    fns = [op.join(self.moreDir, "%d.bam" % i) for i in range(1, 5)]
    out_fn = op.join(self.outDir, "test_concat_bam_1.bam")
    from pbtranscript.ice.IceUtils import concat_bam
    concat_bam(fns, out_fn)
    self.assertTrue(op.exists(out_fn))

    # cat aligned bam files to a big bam
    fns = [op.join(self.moreDir, "aligned.%d.bam" % i)
           for i in range(1, 6)]
    out_fn = op.join(self.outDir, "test_concat_bam_2.bam")
    from pbtranscript.Utils import execute
    concat_bam(fns, out_fn)
    self.assertTrue(op.exists(out_fn))

    # convert big bam to sam and compare with std output
    out_sam = out_fn + ".sam"
    stdout_sam = op.join(self.sivStdoutDir, "test_concat_bam_2.sam")
    cmd = "samtools view -h %s -o %s" % (out_fn, out_sam)
    execute(cmd=cmd)
    self.assertTrue(filecmp.cmp(out_sam, stdout_sam))
def make_db(self):
    """Make dazz database for input file.
    1. fasta2DB
    2. DBsplit
    3. get & store number of blocks
    *.dazz.fasta.db will be created.
    """
    log.debug("Making DAZZ database for %s.", self.dazz_filename)
    if not op.exists(self.dazz_filename):
        raise RuntimeError("%s hasn't been converted to daligner-compatible "
                           "format." % self.input_filename)
    if op.exists(self.db_filename):
        cmd = "DBrm %s" % self.dazz_filename
        execute(cmd=cmd)

    cmd = "fasta2DB %s %s" % (self.dazz_filename, self.dazz_filename)
    execute(cmd=cmd)
    cmd = "DBsplit -s200 %s" % self.dazz_filename
    execute(cmd)
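
# Sketch of the shell commands the method above issues for a hypothetical
# input named isoseq_flnc.dazz.fasta:
#
#   DBrm isoseq_flnc.dazz.fasta        # only if a stale .db already exists
#   fasta2DB isoseq_flnc.dazz.fasta isoseq_flnc.dazz.fasta
#   DBsplit -s200 isoseq_flnc.dazz.fasta   # split DB into blocks of target size 200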
def concat_sam(samfiles, outsam_filename):
    """
    Header looks like:
    @HD     VN:1.3.1
    @SQ     SN:c31  LN:3104 M5:ef7d3f84dea9d9face43e6fd5b6336c4
    @RG     ID:2caa54eef6   PU:in.raw_with_partial.fasta    SM:NO_CHIP_ID
    @PG     ID:BLASR        VN:1.3.1.126469 CL:blasr in.raw_with_partial.fasta g_consensus.fasta -nproc 12 -bestn 5 -nCandidates 10 -sam -out out.sam

    NOTE: check for M5 conflicts; manipulate them if it conflicts
    """
    f_sq = open(outsam_filename + '.sq', 'w')
    f_bd = open(outsam_filename + '.bd', 'w')

    rg_line = None
    pg_line = None

    md5_seen = set()

    if len(samfiles) == 0:
        raise ValueError("No sam input files to concatenate.")

    h = open(samfiles[0])
    line = h.readline()
    assert line.startswith('@HD')
    f_sq.write(line)
    line = h.readline()
    assert line.startswith('@SQ')
    line = h.readline()
    assert line.startswith('@RG')
    rg_line = line  # write at the end
    line = h.readline()
    assert line.startswith('@PG')
    pg_line = line  # write at the end
    h.close()

    for f in samfiles:
        with open(f) as h:
            assert h.readline().startswith('@HD')
            line = h.readline()
            assert line.startswith('@SQ')
            # ------- check for MD5 conflicts ----------- #
            m5 = line.strip().split()[-1]
            assert m5.startswith("M5:")
            if m5 not in md5_seen:
                f_sq.write(line)
                md5_seen.add(m5)
            else:
                # create a random m5 string by re-shuffling the original
                # digest until an unused string is found.
                chars = list(m5[3:])
                while True:
                    random.shuffle(chars)
                    s = "".join(chars)
                    if s not in md5_seen:
                        break
                line = line[:line.find('M5:')] + 'M5:' + s + '\n'
                logging.debug("MD5 conflict: change to {0}".format(s))
                md5_seen.add(s)
                f_sq.write(line)
            # ----- end MD5 checking and writing --------- #
            assert h.readline().startswith('@RG')
            assert h.readline().startswith('@PG')
            for line in h:
                f_bd.write(line)
    f_bd.close()
    f_sq.write(rg_line)
    f_sq.write(pg_line)
    f_sq.close()

    cmd = "cat {0}.sq {0}.bd > {0}".format(real_upath(outsam_filename))
    execute(cmd=cmd,
            errmsg="Failed to concat sam files! Abort.",
            errcls=IOError)

    os.remove(f_sq.name)
    os.remove(f_bd.name)
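
# A minimal, self-contained sketch of the M5-collision handling used above:
# when a sequence dictionary line repeats an already-seen checksum, shuffle
# the digest's characters until an unused string is found. (Hypothetical
# digest values; real M5 tags are MD5 hex digests.)
import random

md5_seen = {"abc123", "def456"}
digest = "abc123"           # collides with an entry already seen
chars = list(digest)
while True:
    random.shuffle(chars)
    candidate = "".join(chars)
    if candidate not in md5_seen:
        break
md5_seen.add(candidate)     # record the substitute checksum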
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename, ice_opts, probqv,
                                qv_prob_threshold=0.3,
                                cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None,
                                sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(
              r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    logging.info("Calling blasr_against_ref ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    # partial_uc maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")

    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
                 otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    # partial_uc maps each isoform (cluster) id to a list of reads
    # which can map to the isoform
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
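
# Usage sketch (hypothetical paths; assumes blasr is on $PATH and the input
# files exist). Writes {'partial_uc': {...}, 'nohit': set(...)} to the pickle
# and touches out_pickle + '.DONE' on completion:
build_uc_from_partial(input_fasta="isoseq_nfl.fasta",
                      ref_fasta="output/final.consensus.fasta",
                      out_pickle="output/map_noFL/nfl.partial_uc.pickle",
                      ccs_fofn=None,   # no QVs available; fall back to model
                      blasr_nproc=8)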
def test_end_to_end():
    """Call separate_flnc.py from command line, end to end must exit gracefully."""
    cmd = "separate_flnc.py %s %s %s --bin_by_primer" % \
        (FLNC_FASTA,
         op.join(OUT_DIR, "separate_flnc_by_primer_fasta_input_e2e"),
         op.join(OUT_DIR, "end_to_end1.pickle"))
    execute(cmd)

    cmd = "separate_flnc.py %s %s %s --bin_by_primer" % \
        (FLNC_DATASET,
         op.join(OUT_DIR, "separate_flnc_by_primer_xml_input_e2e"),
         op.join(OUT_DIR, "end_to_end2.pickle"))
    execute(cmd)

    cmd = "separate_flnc.py %s %s %s --bin_size_kb 1" % \
        (FLNC_FASTA,
         op.join(OUT_DIR, "separate_flnc_by_size_fasta_input_e2e"),
         op.join(OUT_DIR, "end_to_end3.pickle"))
    execute(cmd)

    cmd = "separate_flnc.py %s %s %s --bin_size_kb 1" % \
        (FLNC_DATASET,
         op.join(OUT_DIR, "separate_flnc_by_size_xml_input_e2e"),
         op.join(OUT_DIR, "end_to_end4.pickle"))
    execute(cmd)

    cmd = "separate_flnc.py %s %s %s --bin_manual '[0,3,4,6]'" % \
        (FLNC_FASTA,
         op.join(OUT_DIR, "separate_flnc_by_size_fasta_input_manual_e2e"),
         op.join(OUT_DIR, "end_to_end5.pickle"))
    execute(cmd)

    cmd = "separate_flnc.py %s %s %s --bin_manual '[0,3,4,6]'" % \
        (FLNC_DATASET,
         op.join(OUT_DIR, "separate_flnc_by_size_xml_input_manual_e2e"),
         op.join(OUT_DIR, "end_to_end6.pickle"))
    execute(cmd)