def prog_umi_group_ec(args):
    import os
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0

    fn, fext = os.path.splitext(args.o)
    with zopen(fn + ".discarded" + fext, 'w') as discard_file:
        umi_groups = run_umi_group_ec(
                FastqFormat.records_in(zopen(args.i, 'r')),
                args.k, args.max_offset, args.min_score,
                FastqFormat.records_out(discard_file, None))
        FastqFormat.records_out(zopen(args.o, 'w'),
                (QSequence("UMI:%s:%s" % (umi, cqs.count), cqs.seq, cqs.qual)
                 for umi, cqs in umi_groups.iteritems()))
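# Hedged sketch, not part of the pipeline: the functions in this module rely
# on fileio.zopen returning either a plain file object or a gzip.GzipFile
# (callers use isinstance(f, gzip.GzipFile) to pick a progress-tracking
# strategy). A minimal stand-in honouring that contract might look like the
# following; the real fileio.zopen may behave differently.
import gzip

def _zopen_sketch(fn, mode='r'):
    # Open gzip-compressed files transparently, based on the filename suffix.
    if fn.endswith('.gz'):
        return gzip.open(fn, mode)
    return open(fn, mode)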
def prog_umi_group_ec(args):
    import os
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0
    assert args.min_score_for_offset <= args.k
    assert args.min_score_for_merge <= args.k

    fn, fext = os.path.splitext(args.o)
    discarded = FastqFormat.records_out(
            zopen(fn + '.discarded' + fext, 'w'), None)
    input_file = stdin if args.i is None else zopen(args.i, 'r')

    _progress_indicator(0, 0, input_file)
    umi_groups = run_umi_group_ec(
            records=FastqFormat.records_in(input_file),
            k=args.k,
            moff=args.max_offset,
            min_score4offset=args.min_score_for_offset,
            min_score4merge=args.min_score_for_merge,
            discarded=discarded,
            callback=_progress_indicator)
    stdout.write('\n')

    out = FastqFormat.records_out(zopen(args.o, 'w'), None)
    print 'Writing results to file'
    for name, grouplist in umi_groups.iteritems():
        # Keep the best-supported sequence of every UMI group (highest count,
        # ties broken by total quality); the rest go to the discard file.
        grouplist = sorted(grouplist,
                key=lambda cqs: (-cqs.count, -sum(cqs.qual)))
        cqs = grouplist[0]
        out += QSequence(
                'UMI:%s:%s:%s' % (
                    name, '%s/%s' % ('1', len(grouplist)), cqs.count),
                cqs.seq, cqs.qual)
        for i, cqs in enumerate(grouplist[1:]):
            discarded += QSequence(
                    'UMI:%s:%s:%s' % (
                        name, '%s/%s' % (i + 2, len(grouplist)), cqs.count),
                    cqs.seq, cqs.qual)
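# Hedged illustration, assuming a simplified model of run_umi_group_ec: it
# maps each UMI to candidate consensus sequences with read counts, of which
# prog_umi_group_ec keeps only the best-supported one per UMI. The toy
# grouping below shows only the core idea (exact-match grouping); the real
# implementation also handles offsets and quality-aware merging.
from collections import defaultdict

def _group_by_umi_sketch(records):
    # records: iterable of (umi, seq) pairs; returns {umi: {seq: count}}.
    groups = defaultdict(lambda: defaultdict(int))
    for umi, seq in records:
        groups[umi][seq] += 1
    return groups

# Example: two reads share UMI 'ACGT', so its dominant sequence has count 2:
# _group_by_umi_sketch([('ACGT', 'TTAGC'), ('ACGT', 'TTAGC'),
#                       ('GGCC', 'TTAAC')])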
def prog_pipeline(args):
    # [Defaults] section
    confidence = float(config.get("Defaults", "confidence"))
    min_phred_threshold = int(config.get("Defaults", "min_phred_threshold"))
    clone_classname = config.get("Defaults", "clone_classname")
    min_seqlen = int(config.get("Defaults", "min_seqlen"))
    include_cysphe = str2bool(config.get("Defaults", "include_cysphe"))
    species = config.get("Defaults", "species")
    gene = config.get("Defaults", "gene")
    update_interval = float(config.get("Defaults", "update_interval"))
    ref_fn = path.join(here, config.get("Defaults", "ref_fn"))
    alignments_fn = config.get("Defaults", "alignments_fn")
    alignment_stats_fn = config.get("Defaults", "alignment_stats_fn")
    Q_mm_stats_fn = config.get("Defaults", "Q_mm_stats_fn")
    Q_mm_stats_plot_fn = config.get("Defaults", "Q_mm_stats_plot_fn")
    qplot_fn = config.get("Defaults", "qplot_fn")
    output_fn = config.get('Defaults', 'output_fn')
    output_hdr = config.get('Defaults', 'output_hdr').decode('string_escape')
    output_fmt = config.get('Defaults', 'output_fmt').decode('string_escape')

    # [Aligner] section
    location = config.get("Aligner", "location")
    cmd_build_index = config.get("Aligner", "cmd_build_index")
    args_build_index = config.get("Aligner", "args_build_index")
    cmd_align = config.get("Aligner", "cmd_align")
    args_align_base = config.get("Aligner", "args_align_base")
    args_align_v = args_align_base + " " + \
            config.get("Aligner", "args_align_v")
    args_align_j = args_align_base + " " + \
            config.get("Aligner", "args_align_j")

    if args.no_VJ_collapsing:
        clone_classname = "AnnotatedCloneDistinctAllele"

    ref_fn = path.realpath(ref_fn) \
            if args.ref is None else path.realpath(args.ref)
    reads_fn = path.realpath(args.reads)
    phred_encoding = determine_phred_encoding(reads_fn) \
            if args.phred_encoding is None else args.phred_encoding
    assert phred_encoding in (33, 64)
    species = species if args.species is None else args.species
    species = species.split(",")
    gene = gene if args.gene is None else args.gene
    gene = gene.split(",")
    n_threads = mp.cpu_count() if args.threads is None else args.threads

    # Prepare aligner commands and check existence of aligner
    cmd_build_index = path.realpath(path.join(location, cmd_build_index))
    cmd_align = path.realpath(path.join(location, cmd_align))
    if not path.isfile(cmd_build_index):
        raise ValueError(
                "Executable to build an index (\"%s\") does not exist.\n"
                "Please use \"rtcr Config\" to see if the Aligner is "
                "properly configured" % cmd_build_index)
    if not path.isfile(cmd_align):
        raise ValueError(
                "Executable to align sequences (\"%s\") does not exist.\n"
                "Please use \"rtcr Config\" to see if the Aligner is "
                "properly configured" % cmd_align)

    init_logging()
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("log level set to DEBUG")
    if args.verbose:
        logger.addHandler(logging.StreamHandler(stdout))
        logger.info("Writing log statements to stdout")

    # Note, delaying import of modules that have a logger until after logging
    # has been initialised.
    from fileio import read_reference, zopen
    from pipeline import Pipeline

    ref = read_reference(ref_fn).get_slice(species=species, gene=gene)
    for s in species:
        if not s in ref.species:
            logger.error("species \"%s\" does not exist in reference" % s)
            sys.exit(1)
    for g in gene:
        if not g in ref.genes:
            logger.error("gene \"%s\" does not exist in reference" % g)
            sys.exit(1)

    version = __version__
    preamble = '\nRTCR version: %(version)s\n' % locals()
    preamble += '\n[Command line arguments]\n' + \
            '\n'.join(['%s : %s' % (i, v) for i, v in enumerate(sys.argv)])
    preamble += '\n\
[Files]\n\
Reference: %(ref_fn)s\n\
Reads: %(reads_fn)s\n\
Output: %(output_fn)s\n\
\n\
[Settings]\n\
PHRED encoding: %(phred_encoding)s\n\
Species: %(species)s\n\
Gene: %(gene)s\n\
confidence: %(confidence)s\n\
\n\
[Immune receptor reference]\n' % locals()
    # Note: these loops deliberately reuse (and shadow) the species/gene
    # lists from above; the preamble text has already been formatted.
    for species in sorted(ref.species):
        for gene in sorted(ref.genes):
            for region in sorted(ref.regions):
                alleles = ref.get_alleles(species=species, gene=gene,
                        region=region)
                n = len(alleles)
                if n > 0:
                    preamble += "%s,%s,%s: %s alleles\n" % (
                            species, gene, region, n)
                    s = ""
                    for allele in alleles:
                        s += "%s, %s\n" % (allele.species, allele.name)
                    logger.debug("species, allele\n" + s)
    preamble += "\n[Pipeline run]"
    logger.info(preamble)

    # Make sure exceptions are logged, even when not caught
    sys.excepthook = handle_uncaught_exception

    pipeline = Pipeline(
            ref=ref,
            reads=zopen(reads_fn, 'r'),
            phred_encoding=phred_encoding,
            cmd_build_index=cmd_build_index,
            args_build_index=args_build_index,
            cmd_align=cmd_align,
            args_align_v=args_align_v,
            args_align_j=args_align_j,
            alignments_fn=alignments_fn,
            alignment_stats_fn=alignment_stats_fn,
            Q_mm_stats_fn=Q_mm_stats_fn,
            Q_mm_stats_plot_fn=Q_mm_stats_plot_fn,
            output_fn=output_fn,
            output_hdr=output_hdr,
            output_fmt=output_fmt,
            clone_classname=clone_classname,
            confidence=confidence,
            min_seqlen=min_seqlen,
            include_cysphe=include_cysphe,
            min_phred_threshold=min_phred_threshold,
            n_threads=n_threads,
            update_interval=update_interval,
            listener=Listener())
    pipeline.daemon = True
    pipeline.name = 'Pipeline'
    try:
        pipeline.start()
        while pipeline.is_alive():
            pipeline.join(1)
    except KeyboardInterrupt:
        logger.error('Caught keyboard interrupt. Shutting down.')
        pipeline.stop()
        pipeline.join(1)
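# Hedged sketch of what determine_phred_encoding (used above) could do; the
# actual implementation in this code base is not shown here. FASTQ quality
# characters below ';' (ASCII 59) cannot occur with a Phred+64 offset, so
# observing one implies Phred+33; characters above 'J' (ASCII 74) fall
# outside the usual Phred+33 range and suggest Phred+64.
def _determine_phred_encoding_sketch(qual_strs, default=33):
    # qual_strs: iterable of FASTQ quality strings; returns 33 or 64.
    for qual_str in qual_strs:
        for c in qual_str:
            if ord(c) < 59:
                return 33   # only possible with an offset of 33
            if ord(c) > 74:
                return 64   # strongly suggests an offset of 64
    return default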
def prog_checkout(args):
    search_rc = args.reverse_complement
    barcodes_fn = args.barcodes
    adapters = list(BarcodeFormat.records_in(open(barcodes_fn, 'r')))
    outfiles = {sample_id: open("%s.fastq" % sample_id, 'w')
            for (sample_id, master, slave) in adapters}
    fq1_fn = args.i
    fq2_fn = args.i2
    max_mm = args.max_mm

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        # Track progress through the compressed file, not the decompressed
        # stream.
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell
    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = False

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if time() - prev_time > .5:
            prev_time = time()
            frac = float(fq1_filepos()) / fq1_filesize
            stdout.write(term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                    i, frac * 100, n_accepted,
                    100 * float(n_accepted) / max(i, 1)))
            stdout.flush()
        if fq2:
            r2 = next(fq2)
            assert r1.id == r2.id

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)
            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for master on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found or no full length match found
                matches2, matches_rc2 = master.locate_in(
                        r2.seq, max_mm, search_rc)
                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)
                if not master_match2 and not master_match:
                    # master not found on r1 nor on r2
                    continue
                if not master_match or (master_match2 and
                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc", desc=r1.desc,
                        seq=revcomp(r1.seq), qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc", desc=r2.desc,
                            seq=revcomp(r2.seq), qual_str=r2.qual_str[::-1])

            # Master adapter has been found, retrieve its UMI (if it has one)
            master_umi = ("", "")
            if master.has_UMI():
                # get umi
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                        master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve UMI from master adapter
                    continue

            # Look for slave adapter
            slave_match = None
            slave_umi = ("", "")
            if slave:
                # has slave adapter; get_best_match returns (match, is_rc),
                # so unpack it and skip the record if no slave was found
                # (mirrors the checkout() function below).
                slave_matches, slave_matches_rc = slave.locate_in(
                        r2.seq, max_mm, search_rc=False)
                slave_match, slave_is_rc = get_best_match(
                        slave_matches, slave_matches_rc)
                if not slave_match:
                    continue
                if slave.has_UMI():
                    # get umi
                    slave_umi = slave.get_UMI(r2.seq, r2.qual_str,
                            slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0]:
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, sample_id, umi)

        if best_match:
            master_match, sample_id, umi = best_match
            out = outfiles[sample_id]
            out.write("@%s UMI:%s:%s\n" % (r1.id, umi[0], umi[1]))
            out.write("%s\n+\n%s\n" % (r1.seq, r1.qual_str))
            n_accepted += 1
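# Hedged sketch of the revcomp helper used above (its actual definition lives
# elsewhere in this code base): reverse-complement a DNA sequence. Shown with
# a plain translation table; the real helper may also handle IUPAC ambiguity
# codes.
_COMPLEMENT_SKETCH = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

def _revcomp_sketch(seq):
    # Complement every base and reverse the result.
    return ''.join(_COMPLEMENT_SKETCH[c] for c in reversed(seq.upper()))

# Example: _revcomp_sketch('ACGTN') == 'NACGT'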
def run(self):
    if self._alignments_fn is None or self._alignments_fn == "":
        output_alignments = False
    else:
        output_alignments = True

    if output_alignments and os.path.isfile(self._alignments_fn):
        logger.info("SKIPPING creation of %s" % self._alignments_fn)
        output_alignments = False
        alignment_file = zopen(self._alignments_fn, 'r')
        vj_recs = SAMFormat.records_in(alignment_file)
        # Get two (rows/)alignments at a time from vj_recs
        alns = ((rec, next(vj_recs)) for rec in vj_recs)
        self._listener.notify("Reading alignments from %s" %
                self._alignments_fn)
    else:
        alns = get_vj_alignments(self._ref, self._reads,
                self._cmd_build_index, self._args_build_index,
                self._cmd_align, self._args_align_v, self._args_align_j,
                phred_encoding=self._phred_encoding,
                n_threads=self._n_threads)
        self._listener.notify("Aligning reference sequences to reads")

    # Keep track of the quality scores of the bases that went into the
    # sequences of the clones.
    Q_counts = {}

    # Build clones and use alignments to count mismatches and indels
    cs = CloneSet()
    alnstats = {"V": {}, "J": {}}
    if self._include_cysphe:
        v_refpos_offset = -3
        j_refpos_offset = 3
    else:
        v_refpos_offset = 0
        j_refpos_offset = 0
    try:
        if output_alignments:
            out = zopen(self._alignments_fn, 'w')
            infile = self._reads
        else:
            infile = alignment_file
        prev_time = time()
        if isinstance(infile, gzip.GzipFile):
            infile_size = os.path.getsize(infile.name)
            infile_pos = infile.fileobj.tell
        else:
            infile_size = filesize(infile)
            infile_pos = infile.tell
        self._listener.notify(("PROGRESSBAR", "Alignments", "start"))
        for v_rec, j_rec in alns:
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            if time() - prev_time >= self._update_interval:
                prev_time = time()
                if not infile.closed:
                    pos = infile_pos()
                else:
                    # assuming a closed infile means the entire infile has
                    # been processed.
                    pos = infile_size
                frac = float(pos) / infile_size
                self._listener.notify(("PROGRESSBAR", "Alignments", frac))
            if output_alignments:
                out.write("\t".join(map(str, v_rec)) + "\n" +
                        "\t".join(map(str, j_rec)) + "\n")
            clone = build_clone(self._ref, v_rec, j_rec,
                    self._include_cysphe, self._clone_classname)
            if clone is None:
                continue
            seqlen = len(clone.seq)
            if seqlen < self._min_seqlen:
                continue

            # Count base qualities in the clone (which is possible because
            # at this point the clone is based on a single read)
            lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
            for i in xrange(clone.v.end, clone.j.start):
                lenfam_Q_counts[clone.qual[i]] += 1

            cs.add(clone, merge=True)

            v_allele = self._ref[v_rec.RNAME]
            j_allele = self._ref[j_rec.RNAME]
            # Count errors in the alignments
            for (rec, r_roi_start, r_roi_end) in \
                    ((v_rec, v_allele.refpos + v_refpos_offset, 0),
                     (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                allele = self._ref[rec.RNAME]
                lenfam_alnstats = alnstats[allele.region].setdefault(
                        seqlen, {
                            "n": 0,
                            "mm": 0,
                            "ins": 0,
                            "dels": 0,
                            "Q_mm": [0] * 42,
                            "Q_n": [0] * 42})
                n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(
                        rec, allele.seq,
                        lenfam_alnstats["Q_mm"], lenfam_alnstats["Q_n"],
                        r_roi_start, r_roi_end)
                lenfam_alnstats["n"] += n
                lenfam_alnstats["mm"] += mm
                lenfam_alnstats["ins"] += ins
                lenfam_alnstats["dels"] += dels
    finally:
        if output_alignments:
            out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

    if len(cs) == 0:
        msg = "No clones found in alignments. " \
                "Was the correct germline reference used?"
        logger.error(msg)
        raise Exception(msg)

    if not self._alignment_stats_fn is None and \
            self._alignment_stats_fn != "":
        logger.info("Writing alignment stats to \"%s\"" %
                self._alignment_stats_fn)
        with zopen(self._alignment_stats_fn, 'w') as out:
            out.write("seqlen,region,n,mm,ins,dels\n")
            for region in alnstats:
                for seqlen, lenfam_alnstats in \
                        alnstats[region].iteritems():
                    out.write(",".join(map(str, [
                            seqlen, region,
                            lenfam_alnstats["n"],
                            lenfam_alnstats["mm"],
                            lenfam_alnstats["ins"],
                            lenfam_alnstats["dels"]])) + "\n")

    self._save_cloneset(cs, "r")

    # Sum all the counts in the V and J regions separately, and calculate
    # average error rates
    tot_err = {"V": {}, "J": {}}
    for region in ("V", "J"):
        region_stats = alnstats[region]
        x = tot_err[region]
        x["n"] = sum([y["n"] for y in region_stats.itervalues()])
        x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
        x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
        x["dels"] = sum([y["dels"] for y in region_stats.itervalues()])
        n = x["n"]
        if n > 0:
            x["mmr"] = float(x["mm"]) / n
            x["insr"] = float(x["ins"]) / n
            x["delsr"] = float(x["dels"]) / n
        else:
            x["mmr"] = 0
            x["insr"] = 0
            x["delsr"] = 0
    global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
    global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
    global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
    logger.info("global error rates: mmr: %(global_mmr)s, "
            "insr: %(global_insr)s, delsr: %(global_delsr)s" % locals())

    # Calculate observed error rates for Phred scores
    Q_mm_stats = {"V": {}, "J": {}}
    for region, region_stats in alnstats.iteritems():
        Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
        Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
        for lenfam_alnstats in region_stats.itervalues():
            for i in xrange(42):
                Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                Q_n[i] += lenfam_alnstats["Q_n"][i]

    if not self._Q_mm_stats_fn is None and self._Q_mm_stats_fn != "":
        with zopen(self._Q_mm_stats_fn, 'w') as out:
            out.write("region,Q,n,mm\n")
            for region in Q_mm_stats:
                for Q, (mm, n) in enumerate(izip(
                        Q_mm_stats[region]["Q_mm"],
                        Q_mm_stats[region]["Q_n"])):
                    out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

    # Calculate ratio between base quality score assigned by the sequencer
    # and observed base quality (based on alignments with germline
    # reference).
    sum_ratios = 0
    n_ratios = 0
    for region in Q_mm_stats:
        Q_mm = Q_mm_stats[region]["Q_mm"]
        Q_n = Q_mm_stats[region]["Q_n"]
        for q in xrange(42):
            mm = Q_mm[q]
            n = Q_n[q]
            if mm > 0 and n > 0:
                q_obs = p2q(float(mm) / n)
                if q_obs > 0:
                    sum_ratios += (q / q_obs) * n
                    n_ratios += n
    if n_ratios > 0:
        alpha = float(sum_ratios) / n_ratios
    else:
        logger.warning('No instances found of a Phred score associated '
                'with mismatches.')
        alpha = 1.0
    logger.info("Ratio between base quality and observed quality: %s" %
            alpha)

    if not self._Q_mm_stats_plot_fn is None and \
            self._Q_mm_stats_plot_fn != "":
        plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

    # Get median quality score
    Q_n = [0] * 42  # count number of bases for every Q score
    for lenfam_Q_counts in Q_counts.itervalues():
        for q, count in enumerate(lenfam_Q_counts):
            Q_n[q] += count
    i = ((sum(Q_n) + 1) // 2) - 1  # index of median element in Q_n
    j = 0
    for max_Q, count in enumerate(Q_n):
        j += count
        if j > i:
            break
    logger.info("max_Q = %s" % max_Q)

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    by_seqlen = lambda clone: len(clone.seq)
    confidence = self._confidence
    for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
        if self.stopped():
            logger.warning("Pipeline stopped")
            return
        cs2 = CloneSet(clones)

        # Calculate expected number of errors based on Q scores
        lenfam_Q_counts = Q_counts[seqlen]
        # get total number of bases between V and J region
        n_o = sum(lenfam_Q_counts)
        mm_o = 0
        for q, count in enumerate(lenfam_Q_counts):
            q /= alpha
            mm_o += q2p(q) * count
        mm_v = alnstats["V"][seqlen]["mm"]
        n_v = alnstats["V"][seqlen]["n"]
        mm_j = alnstats["J"][seqlen]["mm"]
        n_j = alnstats["J"][seqlen]["n"]
        mm_tot = mm_v + mm_o + mm_j
        n_tot = n_v + n_o + n_j
        logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s), "
                "mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)" % (
                seqlen,
                mm_v, float(mm_v) / n_v if n_v > 0 else 0,
                mm_o, float(mm_o) / n_o if n_o > 0 else 0,
                mm_j, float(mm_j) / n_j if n_j > 0 else 0,
                mm_tot, float(mm_tot) / n_tot if n_tot > 0 else 0))
        local_mmr = float(mm_tot) / n_tot
        mmr = max(local_mmr, global_mmr)
        logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: "
                "%(mmr)s, confidence: %(confidence)s, max_Q: %(max_Q)s" %
                locals())
        pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))

    self._listener.notify("Running QMerge and IMerge on bins.")
    self.run_pool(pool, desc='QMerge, IMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
    self._save_cloneset(cloneset, "rqi")

    self._listener.notify("Running LMerge")
    cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
            confidence)
    self._save_cloneset(cloneset, "rqil")

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
            by_seqlen):
        cs2 = CloneSet(clones)
        pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2,))
    self._listener.notify("Running NMerge on bins.")
    self.run_pool(pool, desc='NMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable(results))
    self._save_cloneset(cloneset, "rqiln")

    ########################
    # Write clones to file #
    ########################
    self._listener.notify("Writing clones")
    with open(self._output_fn, 'w') as res_ok:
        with open("discarded_clones.tsv", 'w') as res_not_ok:
            header = self._output_hdr
            res_ok.write(header)
            res_not_ok.write(header)
            n_discarded = 0
            for clone in sorted(cloneset,
                    key=lambda clone: (-clone.count, clone.seq)):
                min_phred = min(clone.qual)
                aa_seq = nt2aa(clone.seq)
                n_stop_codons = sum([aa == '*' for aa in aa_seq])
                frame = len(clone.seq) % 3
                if min_phred < self._min_phred_threshold \
                        or n_stop_codons > 0 or frame != 0:
                    n_discarded += 1
                    out = res_not_ok
                else:
                    out = res_ok
                out.write(clone2str(clone, fmt=self._output_fmt))
    self._listener.notify("Discarded %s clones" % n_discarded)
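# Hedged sketch of the Phred conversions p2q and q2p used in run() above
# (their actual definitions live elsewhere in this code base): a Phred score
# Q encodes an error probability p as Q = -10 * log10(p), and q2p inverts
# that, p = 10 ** (-Q / 10.0). p2q is assumed here to round to an integer.
import math

def _p2q_sketch(p):
    # Error probability -> Phred score; p must be in (0, 1].
    return int(round(-10 * math.log10(p))) if p > 0 else 0

def _q2p_sketch(q):
    # Phred score -> error probability.
    return 10 ** (-q / 10.0)

# Example: a mismatch fraction of 0.001 corresponds to Q30, and
# _q2p_sketch(30) == 0.001 recovers it.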
def checkout(fq1_fn, fq2_fn, adapters, max_mm, search_rc, paired=False):
    assert not fq1_fn is None
    # In paired mode both mates are expected in fq1_fn, so no second file
    # should be provided.
    assert not (paired and not fq2_fn is None)

    print 'Handling file(s): %s' % ''.join(
            [fq1_fn, '' if fq2_fn is None else ', %s' % fq2_fn])

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell
    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if not fq2_fn is None:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = None

    outfiles = {}
    for (sample_id, master, slave) in adapters:
        outfiles[sample_id] = {
                "out1": (open("%s_R1.fastq" % sample_id, 'w'), 'R1')
                        if not paired else
                        (open("%s_R12.fastq" % sample_id, 'w'), 'R12'),
                "out2": (None, None) if fq2 is None else
                        (open("%s_R2.fastq" % sample_id, 'w'), 'R2')}

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if fq2:
            r2 = next(fq2)
            assert r1.id == r2.id
        else:
            r2 = None

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)
            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for master on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found or no full length match found
                matches2, matches_rc2 = master.locate_in(
                        r2.seq, max_mm, search_rc)
                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)
                if not master_match2 and not master_match:
                    # master not found on r1 nor on r2
                    continue
                if not master_match or (master_match2 and
                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc", desc=r1.desc,
                        seq=revcomp(r1.seq), qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc", desc=r2.desc,
                            seq=revcomp(r2.seq), qual_str=r2.qual_str[::-1])

            # Master adapter has been found, retrieve its UMI (if it has one)
            master_umi = ("", "")
            if master.has_UMI():
                # get umi
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                        master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve UMI from master adapter
                    continue

            # Look for slave adapter
            slave_match = None
            slave_umi = ("", "")
            if slave:
                # has slave adapter
                if paired:
                    r = r1
                else:
                    r = r2
                slave_matches, slave_matches_rc = slave.locate_in(
                        r.seq, max_mm, search_rc=search_rc)
                slave_match, slave_is_rc = get_best_match(
                        slave_matches, slave_matches_rc)
                if not slave_match:
                    # No slave found
                    continue
                if slave.has_UMI():
                    # get umi
                    if slave_is_rc:
                        slave_umi_start = len(r.seq) - (
                                slave_match[1] + len(slave.seq))
                        slave_umi = slave.get_UMI(revcomp(r.seq),
                                r.qual_str[::-1], slave_umi_start)
                    else:
                        slave_umi = slave.get_UMI(r.seq, r.qual_str,
                                slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0] or \
                    (best_match[0][0] == master_match[0] and
                    slave_match and
                    (not best_match[1] or not best_match[1][0] or
                    best_match[1][0] > slave_match[0])):
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, slave_match, sample_id, umi)

        if best_match:
            master_match, slave_match, sample_id, umi = best_match
            for (r, (out, typename)) in (
                    (r1, outfiles[sample_id]["out1"]),
                    (r2, outfiles[sample_id]["out2"])):
                if not out:
                    continue
                out.write("@%s UMI:%s:%s:%s\n" % (r.id, typename,
                        umi[0], umi[1]))
                out.write("%s\n+\n%s\n" % (r.seq, r.qual_str))
            n_accepted += 1

        frac = float(fq1_filepos()) / fq1_filesize
        if time() - prev_time > .5 or frac == 1.0:
            prev_time = time()
            stdout.write(term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                    i + 1, frac * 100, n_accepted,
                    100 * float(n_accepted) / (i + 1)))
            stdout.flush()
    stdout.write('\n')
def run(self):
    if self._alignments_fn is None or self._alignments_fn == "":
        output_alignments = False
    else:
        output_alignments = True

    if output_alignments and os.path.isfile(self._alignments_fn):
        logger.info("SKIPPING creation of %s" % self._alignments_fn)
        output_alignments = False
        alignment_file = zopen(self._alignments_fn, 'r')
        vj_recs = SAMFormat.records_in(alignment_file)
        # Get two (rows/)alignments at a time from vj_recs
        alns = ((rec, next(vj_recs)) for rec in vj_recs)
        self._listener.notify("Reading alignments from %s" %
                self._alignments_fn)
    else:
        alns = get_vj_alignments(self._ref, self._reads,
                self._cmd_build_index, self._args_build_index,
                self._cmd_align, self._args_align_v, self._args_align_j,
                phred_encoding=self._phred_encoding,
                n_threads=self._n_threads)
        self._listener.notify("Aligning reference sequences to reads")

    # Keep track of the quality scores of the bases that went into the
    # sequences of the clones.
    Q_counts = {}

    # Build clones and use alignments to count mismatches and indels
    cs = CloneSet()
    alnstats = {"V": {}, "J": {}}
    v_refpos_offset = -3
    j_refpos_offset = 3
    try:
        if output_alignments:
            out = zopen(self._alignments_fn, 'w')
            infile = self._reads
        else:
            infile = alignment_file
        prev_time = time()
        if isinstance(infile, gzip.GzipFile):
            infile_size = os.path.getsize(infile.name)
            infile_pos = infile.fileobj.tell
        else:
            infile_size = filesize(infile)
            infile_pos = infile.tell
        self._listener.notify(("PROGRESSBAR", "Alignments", "start"))
        for v_rec, j_rec in alns:
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            if time() - prev_time >= self._update_interval:
                prev_time = time()
                if not infile.closed:
                    pos = infile_pos()
                else:
                    # assuming a closed infile means the entire infile has
                    # been processed.
                    pos = infile_size
                frac = float(pos) / infile_size
                self._listener.notify(("PROGRESSBAR", "Alignments", frac))
            if output_alignments:
                out.write("\t".join(map(str, v_rec)) + "\n" +
                        "\t".join(map(str, j_rec)) + "\n")
            clone = build_clone(self._ref, v_rec, j_rec,
                    self._clone_classname)
            if clone is None:
                continue
            seqlen = len(clone.seq)
            if seqlen < self._min_seqlen:
                continue

            # Count base qualities in the clone (which is possible because
            # at this point the clone is based on a single read)
            lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
            for i in xrange(clone.v.end, clone.j.start):
                lenfam_Q_counts[clone.qual[i]] += 1

            cs.add(clone, merge=True)

            v_allele = self._ref[v_rec.RNAME]
            j_allele = self._ref[j_rec.RNAME]
            # Count errors in the alignments
            for (rec, r_roi_start, r_roi_end) in \
                    ((v_rec, v_allele.refpos + v_refpos_offset, 0),
                     (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                allele = self._ref[rec.RNAME]
                lenfam_alnstats = alnstats[allele.region].setdefault(
                        seqlen, {
                            "n": 0,
                            "mm": 0,
                            "ins": 0,
                            "dels": 0,
                            "Q_mm": [0] * 42,
                            "Q_n": [0] * 42})
                n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(
                        rec, allele.seq,
                        lenfam_alnstats["Q_mm"], lenfam_alnstats["Q_n"],
                        r_roi_start, r_roi_end)
                lenfam_alnstats["n"] += n
                lenfam_alnstats["mm"] += mm
                lenfam_alnstats["ins"] += ins
                lenfam_alnstats["dels"] += dels
    finally:
        if output_alignments:
            out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

    if len(cs) == 0:
        msg = "No clones found in alignments. " \
                "Was the correct germline reference used?"
        logger.error(msg)
        raise Exception(msg)

    if not self._alignment_stats_fn is None and \
            self._alignment_stats_fn != "":
        logger.info("Writing alignment stats to \"%s\"" %
                self._alignment_stats_fn)
        with zopen(self._alignment_stats_fn, 'w') as out:
            out.write("seqlen,region,n,mm,ins,dels\n")
            for region in alnstats:
                for seqlen, lenfam_alnstats in \
                        alnstats[region].iteritems():
                    out.write(",".join(map(str, [
                            seqlen, region,
                            lenfam_alnstats["n"],
                            lenfam_alnstats["mm"],
                            lenfam_alnstats["ins"],
                            lenfam_alnstats["dels"]])) + "\n")

    self._save_cloneset(cs, "r")

    # Sum all the counts in the V and J regions separately, and calculate
    # average error rates
    tot_err = {"V": {}, "J": {}}
    for region in ("V", "J"):
        region_stats = alnstats[region]
        x = tot_err[region]
        x["n"] = sum([y["n"] for y in region_stats.itervalues()])
        x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
        x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
        x["dels"] = sum([y["dels"] for y in region_stats.itervalues()])
        n = x["n"]
        if n > 0:
            x["mmr"] = float(x["mm"]) / n
            x["insr"] = float(x["ins"]) / n
            x["delsr"] = float(x["dels"]) / n
        else:
            x["mmr"] = 0
            x["insr"] = 0
            x["delsr"] = 0
    global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
    global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
    global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
    logger.info("global error rates: mmr: %(global_mmr)s, "
            "insr: %(global_insr)s, delsr: %(global_delsr)s" % locals())

    # Calculate observed error rates for Phred scores
    Q_mm_stats = {"V": {}, "J": {}}
    for region, region_stats in alnstats.iteritems():
        Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
        Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
        for lenfam_alnstats in region_stats.itervalues():
            for i in xrange(42):
                Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                Q_n[i] += lenfam_alnstats["Q_n"][i]

    if not self._Q_mm_stats_fn is None and self._Q_mm_stats_fn != "":
        with zopen(self._Q_mm_stats_fn, 'w') as out:
            out.write("region,Q,n,mm\n")
            for region in Q_mm_stats:
                for Q, (mm, n) in enumerate(izip(
                        Q_mm_stats[region]["Q_mm"],
                        Q_mm_stats[region]["Q_n"])):
                    out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

    # Calculate ratio between base quality score assigned by the sequencer
    # and observed base quality (based on alignments with germline
    # reference).
    sum_ratios = 0
    n_ratios = 0
    for region in Q_mm_stats:
        Q_mm = Q_mm_stats[region]["Q_mm"]
        Q_n = Q_mm_stats[region]["Q_n"]
        for q in xrange(42):
            mm = Q_mm[q]
            n = Q_n[q]
            if mm > 0 and n > 0:
                q_obs = p2q(float(mm) / n)
                if q_obs > 0:
                    sum_ratios += (q / q_obs) * n
                    n_ratios += n
    if n_ratios > 0:
        alpha = float(sum_ratios) / n_ratios
    else:
        logger.warning('No instances found of a Phred score associated '
                'with mismatches.')
        alpha = 1.0
    logger.info("Ratio between base quality and observed quality: %s" %
            alpha)

    if not self._Q_mm_stats_plot_fn is None and \
            self._Q_mm_stats_plot_fn != "":
        plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

    # Get median quality score
    Q_n = [0] * 42  # count number of bases for every Q score
    for lenfam_Q_counts in Q_counts.itervalues():
        for q, count in enumerate(lenfam_Q_counts):
            Q_n[q] += count
    i = ((sum(Q_n) + 1) // 2) - 1  # index of median element in Q_n
    j = 0
    for max_Q, count in enumerate(Q_n):
        j += count
        if j > i:
            break
    logger.info("max_Q = %s" % max_Q)

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    by_seqlen = lambda clone: len(clone.seq)
    confidence = self._confidence
    for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
        if self.stopped():
            logger.warning("Pipeline stopped")
            return
        cs2 = CloneSet(clones)

        # Calculate expected number of errors based on Q scores
        lenfam_Q_counts = Q_counts[seqlen]
        # get total number of bases between V and J region
        n_o = sum(lenfam_Q_counts)
        mm_o = 0
        for q, count in enumerate(lenfam_Q_counts):
            q /= alpha
            mm_o += q2p(q) * count
        mm_v = alnstats["V"][seqlen]["mm"]
        n_v = alnstats["V"][seqlen]["n"]
        mm_j = alnstats["J"][seqlen]["mm"]
        n_j = alnstats["J"][seqlen]["n"]
        mm_tot = mm_v + mm_o + mm_j
        n_tot = n_v + n_o + n_j
        logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s), "
                "mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)" % (
                seqlen,
                mm_v, float(mm_v) / n_v if n_v > 0 else 0,
                mm_o, float(mm_o) / n_o if n_o > 0 else 0,
                mm_j, float(mm_j) / n_j if n_j > 0 else 0,
                mm_tot, float(mm_tot) / n_tot if n_tot > 0 else 0))
        local_mmr = float(mm_tot) / n_tot
        mmr = max(local_mmr, global_mmr)
        logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: "
                "%(mmr)s, confidence: %(confidence)s, max_Q: %(max_Q)s" %
                locals())
        pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))

    self._listener.notify("Running QMerge and IMerge on bins.")
    self.run_pool(pool, desc='QMerge, IMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
    self._save_cloneset(cloneset, "rqi")

    self._listener.notify("Running LMerge")
    cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
            confidence)
    self._save_cloneset(cloneset, "rqil")

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
            by_seqlen):
        cs2 = CloneSet(clones)
        pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2,))
    self._listener.notify("Running NMerge on bins.")
    self.run_pool(pool, desc='NMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable(results))
    self._save_cloneset(cloneset, "rqiln")

    ########################
    # Write clones to file #
    ########################
    self._listener.notify("Writing clones")
    sequence_id = 0
    with open(self._output_fn, 'w') as res_ok:
        with open(self._output_not_ok_fn, 'w') as res_not_ok:
            header = '\t'.join(
                    clone2AIRRDict(clone=None, ref=None).keys()) + '\n'
            res_ok.write(header)
            res_not_ok.write(header)
            n_discarded = 0
            for clone in sorted(cloneset,
                    key=lambda clone: (-clone.count, clone.seq)):
                record = clone2AIRRDict(clone=clone, ref=self._ref)
                min_phred = int(
                        record['junction_minimum_quality_score'])
                if min_phred < self._min_phred_threshold \
                        or record['stop_codon'] == 'T' \
                        or record['vj_in_frame'] == 'F':
                    n_discarded += 1
                    out = res_not_ok
                else:
                    out = res_ok
                sequence_id += 1
                record['sequence_id'] = str(sequence_id)
                out.write('\t'.join(
                        [v for k, v in record.iteritems()]) + '\n')
    self._listener.notify("Discarded %s clones" % n_discarded)
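# Hedged standalone sketch of the quality-calibration step in run() above:
# alpha is the base-count-weighted mean of (assigned Q) / (observed Q), where
# the observed Q for each assigned score comes from the mismatch rate against
# the germline reference. The continuous log10 here stands in for p2q; the
# pipeline's own p2q may round differently.
import math

def _calibration_ratio_sketch(Q_mm, Q_n):
    # Q_mm[q]: mismatches observed at assigned score q; Q_n[q]: total bases.
    sum_ratios, n_ratios = 0.0, 0
    for q, (mm, n) in enumerate(zip(Q_mm, Q_n)):
        if mm > 0 and n > 0:
            q_obs = -10 * math.log10(float(mm) / n)
            if q_obs > 0:
                sum_ratios += (q / q_obs) * n
                n_ratios += n
    return sum_ratios / n_ratios if n_ratios > 0 else 1.0

# Example: bases labelled Q30 that mismatch 1% of the time behave like Q20,
# giving a ratio of 1.5 for that score bin.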