def convert_to_dazz_fasta(self):
    """
    Convert input fasta/fastq file to daligner-compatible fasta with ids:
    <prefix>/<index>/0_<seqlen>

    Also write out mappings to pickle.
    """
    log.debug("Converting %s to daligner compatible fasta %s.",
              self.input_filename, self.dazz_filename)
    reader = ContigSetReaderWrapper(self.input_filename)
    with FastaWriter(self.dazz_filename) as f:
        i = 1
        for r in reader:
            f.writeRecord("{p}/{i}/0_{len}".format(p=self.dazz_movie_name,
                                                   i=i, len=len(r.sequence)),
                          r.sequence[:])
            self.dazz_mapping[i] = r.name
            i += 1
    reader.close()

    with open(self.pickle_filename, 'w') as f:
        dump(self.dazz_mapping, f)
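# Illustrative sketch only (not part of the original module): shows the dazz id
# scheme produced above, "<prefix>/<index>/0_<seqlen>", and how the index-to-name
# mapping lets the original read name be recovered later. The prefix and read
# names below are made up for demonstration.
def _example_dazz_ids(movie_name="movie",
                      seqs=(("read/1/ccs", "ACGTACGT"), ("read/2/ccs", "ACGT"))):
    """Return (dazz_ids, mapping) for a toy set of (name, sequence) pairs."""
    dazz_ids, mapping = [], {}
    for i, (name, seq) in enumerate(seqs, start=1):
        dazz_ids.append("{p}/{i}/0_{l}".format(p=movie_name, i=i, l=len(seq)))
        mapping[i] = name  # e.g. 1 -> "read/1/ccs"
    return dazz_ids, mapping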
def get_primer_ids(self):
    """Return primer ids seen in input FLNC file."""
    primer_ids = set()
    for r in ContigSetReaderWrapper(self.flnc_filename):
        primer_ids.add(self._get_primer_id(r))
    return sorted(primer_ids)
def run(self):
    """Run"""
    # Separate reads by primer id and write each to its own handle.
    for r in ContigSetReaderWrapper(self.flnc_filename):
        p = self._get_primer_id(r)
        self.handles[p].write(">{0}\n{1}\n".format(r.name, r.sequence[:]))
    assert all(os.stat(x).st_size > 0 for x in self.out_dirs)
def run(self):
    """Run"""
    read_counter_in_each_bin = {b: 0 for b in self.size_bins}
    for r in ContigSetReaderWrapper(self.flnc_filename):
        b = self.size_bins.which_bin_contains(len(r.sequence))
        p = read_counter_in_each_bin[b] % self.size_bins_parts[b]
        read_counter_in_each_bin[b] += 1
        self.handles[(b, p)].write(">{0}\n{1}\n".format(r.name, r.sequence[:]))
def get_size_bins_parts(self, bin_size_kb, bin_manual, max_base_limit_MB):
    """
    Return a dict {SizeBin: number of parts in this SizeBin}.
    """
    # First check the min - max read size range.
    min_size = sys.maxint
    max_size = 0
    base_in_each_size = defaultdict(lambda: 0)  # SizeBin --> number of bases
    for r in ContigSetReaderWrapper(self.flnc_filename):
        seqlen = len(r.sequence)
        min_size = min(min_size, seqlen)
        max_size = max(max_size, seqlen)
        b = SizeBin(seqlen / 1000, seqlen / 1000 + 1)
        base_in_each_size[b] += seqlen

    min_size_kb = min_size / 1000
    max_size_kb = max_size / 1000 + (1 if max_size % 1000 != 0 else 0)
    logging.info("Min read length: %s, %s KB, max read length: %s, %s KB",
                 str(min_size), str(min_size_kb), str(max_size), str(max_size_kb))

    size_bins = None
    if bin_manual is not None and len(bin_manual) > 0:
        if bin_manual[0] > min_size_kb:
            bin_manual.insert(0, min_size_kb)
            logging.warning("bin_manual has been reset to %s kb!", bin_manual)
        if bin_manual[-1] < max_size_kb:
            bin_manual.append(max_size_kb)
            logging.warning("bin_manual has been reset to %s kb!", bin_manual)
        size_bins = SizeBins(bin_manual)
    else:
        size_bins = SizeBins(range(min_size_kb, max_size_kb + 1, bin_size_kb))
    logging.info("Read size bins are: %s", str(size_bins))

    size_bins_bases = {b: 0 for b in size_bins}  # SizeBin -> total n of bases in it
    size_bins_parts = {b: 0 for b in size_bins}  # SizeBin -> total n of partitions in it
    if max_base_limit_MB is not None:
        for _b, num_bases in base_in_each_size.iteritems():
            b = size_bins.which_bin_contains(_b)
            size_bins_bases[b] += num_bases
        for b, num_bases in size_bins_bases.iteritems():
            size_bins_parts[b] = int((num_bases * 1.0 / 10**6) / max_base_limit_MB) + \
                (1 if (num_bases * 1.0 / 10**6) % max_base_limit_MB > 0 else 0)
    return size_bins_parts
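# Illustrative sketch only (not part of the original class): reproduces the
# arithmetic used above to decide how many partitions a size bin needs, given
# that max_base_limit_MB caps the megabases per partition. The numbers are
# made up for demonstration.
def _example_num_parts(total_bases_in_bin=2500000, max_base_limit_MB=1):
    """Return the number of partitions for one bin, as computed above."""
    bases_in_mb = total_bases_in_bin * 1.0 / 10**6
    parts = int(bases_in_mb / max_base_limit_MB)
    if bases_in_mb % max_base_limit_MB > 0:
        parts += 1
    return parts  # 2.5 MB of bases with a 1 MB cap -> 3 partitions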
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_nc_fn,
                       out_c_fn, primer_report_fn, write_report_header=True):
    """
    in_read_fn --- a fasta of full-length reads or a fasta of
    non-full-length reads.

    For each full-length read in the in_read_fn FASTA file, detect whether
    it is chimeric or not, and write its annotation to primer_report_fn.

    Return: (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    logging.debug("Update chimera info for reads in {f} ".format(f=in_read_fn))
    logging.debug("Write primer report to {rpt}".format(rpt=primer_report_fn))

    out_nc_fn_fasta, out_c_fn_fasta = out_nc_fn, out_c_fn
    if out_nc_fn.endswith(".xml"):
        out_nc_fn_fasta = out_nc_fn[:-4] + ".fasta"
    if out_c_fn.endswith(".xml"):
        out_c_fn_fasta = out_c_fn[:-4] + ".fasta"

    num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
    with ContigSetReaderWrapper(in_read_fn) as reader, \
            FastaWriter(out_nc_fn_fasta) as writer, \
            FastaWriter(out_c_fn_fasta) as writer_chimera, \
            open(primer_report_fn, 'w') as reporter:
        if write_report_header:
            reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
        for r in reader:
            # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            readid = r.name.split()[0]
            annotation = ReadAnnotation.fromString(
                r.name, ignore_polyA=self.ignore_polyA)
            if readid not in suspicous_hits:  # Non-chimeric reads
                # Primer of a primer-trimmed read can not be None.
                # assert(annotation.primer is not None)
                annotation.chimera = 0
                num_nc += 1
                num_nc_bases += len(r.sequence)
                writer.writeRecord(annotation.toAnnotation(), r.sequence[:])
            else:  # Chimeric reads
                annotation.chimera = 1
                num_c += 1
                num_c_bases += len(r.sequence)
                writer_chimera.writeRecord(annotation.toAnnotation(),
                                           r.sequence[:])
            reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
    return (num_nc, num_c, num_nc_bases, num_c_bases)
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file, reading QVs
    from the input ccs.h5, ccs.bam or ccs FOFN.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
        bas_handlers = {}
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    with ContigSetReaderWrapper(in_fa) as reader, \
            FastqWriter(out_fq) as writer:
        for r in reader:
            logging.debug("Getting QVs for {name} ...".format(name=r.name))
            seqid = r.name.split(' ')[0]
            parsed_read_name = _Parsed_Read_Name(seqid)
            if fmt == FILE_FORMATS.H5:
                try:
                    bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError("Could not read {s} from {f}.".format(
                        s=seqid, f=ccs_fofn))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    parsed_read_name=parsed_read_name,
                    qv_name="QualityValue")
            elif fmt == FILE_FORMATS.BAM:
                qvs = get_qvs_from_bam(reader=qver,
                                       parsed_read_name=parsed_read_name,
                                       qv_name="QualityValue")
            else:
                assert False
            if len(r.sequence) != len(qvs):
                raise ValueError(
                    "Sequence and QVs of {r} should be the same!".format(
                        r=r.name))
            writer.writeRecord(r.name, r.sequence[:], qvs)

    if fmt == FILE_FORMATS.H5:
        for bas_file, bas_handler in bas_handlers.iteritems():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
    elif fmt == FILE_FORMATS.BAM:
        qver.close()
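# Illustrative usage sketch only: ice_fa2fq pulls per-base QVs from ccs.h5,
# ccs.bam, or a FOFN of those, and writes a FASTQ with the same read names.
# The file names below are hypothetical.
def _example_ice_fa2fq():
    """Convert a toy FLNC FASTA to FASTQ using QVs from a CCS BAM."""
    ice_fa2fq(in_fa="isoseq_flnc.fasta",
              ccs_fofn="ccs.bam",
              out_fq="isoseq_flnc.fastq")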
def _write_config(self, fasta_filename):
    """Write daligner sensitive config to fasta_filename.sensitive.config."""
    lens = [len(r.sequence) for r in ContigSetReaderWrapper(fasta_filename)]
    self.low_cDNA_size = int(np.percentile(lens, 10))
    self.high_cDNA_size = int(np.percentile(lens, 90))

    try:
        with open(fasta_filename + '.sensitive.config', 'w') as f:
            f.write("sensitive={s}\n".format(s=self.sensitive_mode))
            f.write("low={l}\n".format(l=self.low_cDNA_size))
            f.write("high={h}\n".format(h=self.high_cDNA_size))
    except IOError:
        pass  # it's OK not to have write permission
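# Illustrative sketch only: the sensitive config written above is plain
# "key=value" text (sensitive=..., low=..., high=...). A toy parser, not part
# of the original module, could read it back like this; the file name is
# hypothetical.
def _example_read_sensitive_config(config_fn="isoforms.fasta.sensitive.config"):
    """Return {'sensitive': ..., 'low': ..., 'high': ...} as strings."""
    cfg = {}
    with open(config_fn) as f:
        for line in f:
            key, _, value = line.strip().partition("=")
            cfg[key] = value
    return cfg  # e.g. {'sensitive': 'True', 'low': '800', 'high': '3200'}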
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa, sample_name):
    """
    Parameters:
      split_indices -- indices of split cluster bins.
      split_files -- consensus isoforms in each split cluster bin.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    for i, split_fn in zip(split_indices, split_files):
        logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
        with ContigSetReaderWrapper(split_fn) as reader:
            for read in reader:
                name = combined_cid_ice_name(name=read.name,
                                             cluster_bin_index=i,
                                             sample_name=sample_name)
                writer.writeRecord(name, read.sequence[:])
    writer.close()
    logging.info("Consensus isoforms output combined to: %s",
                 combined_consensus_isoforms_fa)
def split(self, reads_in_first_split=None):
    """Split `input_fasta` into smaller files, each containing at most
    `reads_per_split` reads. Return the list of split fasta files."""
    split_index = 0
    self.out_fns = []
    writer = FastaWriter(self._out_fn(split_index))
    self.out_fns.append(self._out_fn(split_index))

    if reads_in_first_split is None:
        reads_in_first_split = self.reads_per_split

    with ContigSetReaderWrapper(self.input_fasta) as reader:
        for ridx, r in enumerate(reader):
            if ridx != 0 and \
               ((split_index == 0 and ridx == reads_in_first_split) or
                    (split_index > 0 and ridx % self.reads_per_split == 0)):
                split_index += 1
                writer.close()
                writer = FastaWriter(self._out_fn(split_index))
                self.out_fns.append(self._out_fn(split_index))
            writer.writeRecord(r.name, r.sequence[:])

    writer.close()
    return list(self.out_fns)
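# Illustrative sketch only (not part of the original class): reproduces the
# split-boundary logic above for toy parameters, showing which split each
# 0-based record index lands in. With reads_per_split=3 and
# reads_in_first_split=2, records map to splits 0,0, 1, 2,2,2, 3,3,3.
def _example_split_indices(n_reads=9, reads_per_split=3, reads_in_first_split=2):
    """Return the split index assigned to each of n_reads records."""
    split_index, assignment = 0, []
    for ridx in range(n_reads):
        if ridx != 0 and \
           ((split_index == 0 and ridx == reads_in_first_split) or
                (split_index > 0 and ridx % reads_per_split == 0)):
            split_index += 1
        assignment.append(split_index)
    return assignment  # [0, 0, 1, 2, 2, 2, 3, 3, 3]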
def build_uc_from_partial(input_fasta, ref_fasta, out_pickle, ccs_fofn=None,
                          done_filename=None, blasr_nproc=12, tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr -nproc, number of CPUs to use
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)
    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
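# Illustrative usage sketch only (hypothetical file names): run the nFL-to-isoform
# assignment above, then load the resulting pickle, which holds
# {'partial_uc': {cluster_id: [read_ids]}, 'nohit': set(read_ids)}. Assumes the
# pickle was written with a standard pickle/cPickle dump, as in the code above.
def _example_build_uc_from_partial():
    """Assign toy nFL reads to consensus isoforms and read back the result."""
    import cPickle
    build_uc_from_partial(input_fasta="isoseq_nfl.fasta",
                          ref_fasta="consensus_isoforms.fasta",
                          out_pickle="nfl.partial_uc.pickle",
                          ccs_fofn=None,  # no QVs available; fall back to model
                          blasr_nproc=8)
    with open("nfl.partial_uc.pickle") as f:
        uc = cPickle.load(f)
    return uc['partial_uc'], uc['nohit']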
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None, use_finer_qv=False,
                                   cpus=24, no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    tmp_dir --- where to save intermediate files such as dazz files.
                If None, write dazz files to the same directory as
                query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=300, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # Not using QVs or alignment checking!
        # This probqv is just a DUMMY to pass to daligner_against_ref,
        # which won't be used.
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs",
                             ccs_fofn, input_fasta, time.time() - start_t)
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s",
                             input_fasta, ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs",
                             input_fastq, time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1, ece_min_len=20,
            same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
def _processPrimers(self, primer_fn, window_size, primer_out_fn,
                    revcmp_primers=False):
    """
    Check and generate primers.
    1. Check primers in primer_fn are in order F0, R0, F1, R1, ... Fn, Rn,
       and lengths are all < k, where k is the primer search window length.
           F0  5' NNNNNNNNNN 3'
           R0  3' NNNNNNNNNN 5'
    2. If Ri and Fi are reverse complementarily identical, add a polyA tail
       to the 3' end of Ri.
    3. For each combo of primers Fi and Ri, save the following to
       primer_out_fn.
       3.1 If revcmp_primers is False,
           >Fi
           Fi_sequence
           >Ri
           revcmp(Ri_sequence)
       3.2 If revcmp_primers is True,
           >Fi
           Fi_sequence
           >Ri
           Ri_sequence
           >Fi_revcmp
           revcmp(Fi_sequence)
           >Ri_revcmp
           revcmp(Ri_sequence)
    4. Return primers range(0, n).
    """
    logging.info("Process primers for {case}.".format(
        case=("finding primers" if not revcmp_primers
              else "detecting chimeras")))

    freader = ContigSetReaderWrapper(primer_fn)
    primers = []
    primerComboId = -1
    for i, r in enumerate(freader):
        if i % 2 == 0:
            direction = "F"
            primerComboId += 1
        else:
            direction = "R"
        expectedName = "{d}{n}".format(d=direction, n=primerComboId)
        if r.name != expectedName:
            errMsg = "Primers should be placed in order F0, R0, F1, R1..."
            logging.error(errMsg)
            raise ClassifierException(errMsg)
        if len(r.sequence) > window_size:
            errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                format(n=expectedName, l=len(r.sequence), k=window_size)
            logging.error(errMsg)
            raise ClassifierException(errMsg)
        if direction == "F":
            # Save >Fi and Fi_sequence.
            primers.append([expectedName, r.sequence])
        else:  # direction is "R"
            # fwdF/fwdR is the forward sequence of Fi/Ri
            fwdF, fwdR = primers[-1][1], r.sequence
            # revcmpF/revcmpR is the reverse complement of Fi/Ri
            revcmpF, revcmpR = revcmp(fwdF), revcmp(fwdR)
            # If Fi and Ri are reverse complementarily identical, bail out,
            # because we need a polyA tail to distinguish Fi and Ri.
            if fwdF.find(revcmpR) >= 0 or revcmpR.find(fwdF) >= 0:
                infoMsg = "Primer F{n}, R{n} ".format(n=primerComboId) + \
                          "are reverse complementarily identical. " + \
                          "Need to add 'AAAA' to 3' to distinguish them."
                logging.info(infoMsg)
                if revcmp_primers is False:
                    # Save primer Ri and revcmp(Ri_sequence) + TTTT
                    primers.append([expectedName, revcmpR + "T" * 4])
                else:  # revcmp_primers is True
                    primers.append([expectedName, "A" * 4 + fwdR])
                    primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                    revcmpF])
                    primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                    revcmpR + "T" * 4])
            else:  # Ri and Fi are not revcmp identical
                if revcmp_primers is False:
                    # Save >Ri and revcmp(Ri_sequence)
                    primers.append([expectedName, revcmpR])
                else:
                    # Save >Ri and Ri_sequence
                    primers.append([expectedName, fwdR])
                    # Save >Fi_revcmp and revcmp(Fi_sequence)
                    primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                    revcmpF])
                    # Save >Ri_revcmp and revcmp(Ri_sequence)
                    primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                    revcmpR])
    freader.close()

    # Write Fi and reverse-complemented Ri to primer_out_fn
    with open(primer_out_fn, 'w') as f:
        for (name, seq) in primers:
            f.write(">{n}\n{s}\n".format(n=name, s=seq))
    return range(0, primerComboId + 1)
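# Illustrative sketch only: expected layout of primer_fn for two primer pairs,
# and what _processPrimers writes to primer_out_fn when revcmp_primers=False
# and the pairs are not reverse-complement identical (sequences are made up).
#
#   primer_fn                 primer_out_fn
#   >F0                       >F0
#   AAGCAGTGGTATCAACGCAGAGT   AAGCAGTGGTATCAACGCAGAGT
#   >R0                       >R0
#   GTACTCTGCGTTGATACCACTGC   revcmp(GTACTCTGCGTTGATACCACTGC)
#   >F1 ...                   >F1 ...
#   >R1 ...                   >R1 revcmp(...)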
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   done_filename, ice_opts, probqv,
                                   qv_prob_threshold=0.3,
                                   cpus=4, no_qv_or_aln_checking=False,
                                   tmp_dir=None, sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.

    tmp_dir --- where to save intermediate files such as dazz files.
                If None, write dazz files to the same directory as
                query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything local!
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=ice_opts.min_match_len, output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        # Not providing full_missed_start/end since aligning nFLs;
        # it is OK to partially align only.
        hitItems = daligner_against_ref2(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False, sID_starts_with_c=sID_starts_with_c,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            qv_prob_threshold=qv_prob_threshold,
            ece_penalty=ice_opts.ece_penalty,
            ece_min_len=ice_opts.ece_min_len,
            same_strand_only=True,
            no_qv_or_aln_checking=no_qv_or_aln_checking,
            max_missed_start=ice_opts.max_missed_start,
            max_missed_end=ice_opts.max_missed_end,
            full_missed_start=ice_opts.full_missed_start,
            full_missed_end=ice_opts.full_missed_end)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # remove all the .las and .las.out filenames
    runner.clean_run()
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename, ice_opts, probqv,
                                qv_prob_threshold=0.3,
                                cpus=4, no_qv_or_aln_checking=False,
                                tmp_dir=None, sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoform sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save {isoform_id: [read_ids], nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)
    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"
    execute(cmd)

    logging.info("Calling blasr_against_ref ...")
    # No need to provide full_missed_start/end for nFLs, since is_FL = False.
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                  is_FL=False,
                                  sID_starts_with_c=sID_starts_with_c,
                                  qver_get_func=probqv.get_smoothed,
                                  qvmean_get_func=probqv.get_mean,
                                  qv_prob_threshold=qv_prob_threshold,
                                  ece_penalty=ice_opts.ece_penalty,
                                  ece_min_len=ice_opts.ece_min_len,
                                  max_missed_start=ice_opts.max_missed_start,
                                  max_missed_end=ice_opts.max_missed_end,
                                  full_missed_start=ice_opts.full_missed_start,
                                  full_missed_end=ice_opts.full_missed_end,
                                  same_strand_only=False)

    partial_uc = {}  # Maps each isoform (cluster) id to a list of reads
                     # which can map to the isoform
    seen = set()     # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)
    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)