Example #1
import gzip
import os
from sys import stdout
from time import time

# filesize and term (ANSI terminal escape helpers) come from the RTCR
# codebase.

def _progress_indicator(rec_nr, n_discarded, handle=None):
    if handle is not None:
        if isinstance(handle, gzip.GzipFile):
            _progress_indicator.endpos = os.path.getsize(handle.name)
            _progress_indicator.get_pos = handle.fileobj.tell
        else:
            try:
                _progress_indicator.endpos = filesize(handle)
                _progress_indicator.get_pos = handle.tell
            except IOError:
                _progress_indicator.endpos = None
                _progress_indicator.get_pos = lambda: None

        _progress_indicator.prev_time = time()
        return
    get_pos = _progress_indicator.get_pos
    endpos = _progress_indicator.endpos

    frac = None if endpos is None else float(get_pos()) / endpos
    if time() - _progress_indicator.prev_time > .5 or frac == 1.0:
        _progress_indicator.prev_time = time()
        perc_str = '?%' if frac is None else '%.2f%%' % (frac * 100)
        stdout.write(term.EL(2) + term.CHA(0) + \
                'Processed %s records (%s)'%(rec_nr + 1, perc_str))
        stdout.flush()
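
The indicator keeps its state on function attributes: a first call with a file
handle primes endpos and get_pos, and later calls (with the default
handle=None) emit a progress line at most twice per second. A minimal usage
sketch, assuming a plain-text file of one record per line (the path is
illustrative):

handle = open("reads.fastq", "r")
_progress_indicator(0, 0, handle=handle)  # prime position tracking
for rec_nr, line in enumerate(handle):
    _progress_indicator(rec_nr, 0)        # throttled progress output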
Example #2
import gzip
import os
from sys import stdout
from time import time

# BarcodeFormat, FastqFormat, FastqRecord, zopen, filesize, term,
# get_best_match and revcomp come from the RTCR codebase.

def prog_checkout(args):
    search_rc = args.reverse_complement
    barcodes_fn = args.barcodes

    adapters = list(BarcodeFormat.records_in(open(barcodes_fn, 'r')))

    outfiles = {sample_id: open("%s.fastq" % sample_id, 'w')
            for (sample_id, master, slave) in adapters}

    fq1_fn = args.i
    fq2_fn = args.i2

    max_mm = args.max_mm

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell

    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = False

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if time() - prev_time > .5:
            prev_time = time()
            frac = float(fq1_filepos()) / fq1_filesize
            stdout.write(term.EL(2) + term.CHA(0) + \
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)"%(i,
                        frac*100, n_accepted, 100*float(n_accepted)/i))
            stdout.flush()

        if fq2:
            r2 = next(fq2)
            assert (r1.id == r2.id)

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)

            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for master on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found or no full length match found
                matches2, matches_rc2 = master.locate_in(
                    r2.seq, max_mm, search_rc)

                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)

                if not master_match2 and not master_match:
                    # master not found on r1 nor on r2
                    continue

                if not master_match or (master_match2 and \
                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc",
                                 desc=r1.desc,
                                 seq=revcomp(r1.seq),
                                 qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc",
                                     desc=r2.desc,
                                     seq=revcomp(r2.seq),
                                     qual_str=r2.qual_str[::-1])

            # Master adapter has been found, retrieve its UMI (if it has one)
            master_umi = ("", "")
            if master.has_UMI():  # get umi
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                            master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve UMI from master adapter
                    continue

            # Look for slave adapter
            slave_match = None

            slave_umi = ("", "")
            if slave:  # has slave adapter
                slave_matches, slave_matches_rc = slave.locate_in(
                    r2.seq, max_mm, search_rc=False)
                slave_match, slave_is_rc = get_best_match(
                    slave_matches, slave_matches_rc)
                if not slave_match:  # no slave adapter found
                    continue

                if slave.has_UMI():  # get umi
                    slave_umi = slave.get_UMI(r2.seq, r2.qual_str,
                                              slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0]:
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, sample_id, umi)

        if best_match:
            master_match, sample_id, umi = best_match
            out = outfiles[sample_id]
            out.write("@%s UMI:%s:%s\n" % (r1.id, umi[0], umi[1]))
            out.write("%s\n+\n%s\n" % (r1.seq, r1.qual_str))
            n_accepted += 1
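
When a reverse-complement hit is accepted, the match coordinate is flipped
onto the reverse-complemented read: a hit starting at position pos on the
forward strand starts at len(seq) - (pos + len(adapter)) after the flip. A
tiny self-contained check of that transform (values are illustrative):

# A 4-base adapter matching at position 2 of a 10-base read starts at
# 10 - (2 + 4) = 4 once the read is reverse-complemented.
seq_len, adapter_len, pos = 10, 4, 2
assert seq_len - (pos + adapter_len) == 4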
Example #3
File: pipeline.py  Project: uubram/RTCR
    def run(self):
        if self._alignments_fn is None or self._alignments_fn == "":
            output_alignments = False
        else:
            output_alignments = True

        if output_alignments and os.path.isfile(self._alignments_fn):
            logger.info("SKIPPING creation of %s"%self._alignments_fn)
            output_alignments = False
            alignment_file = zopen(self._alignments_fn, 'r')
            vj_recs = SAMFormat.records_in(alignment_file)
            # Pull alignments from vj_recs two rows at a time (V, then J)
            alns = ((rec, next(vj_recs)) for rec in vj_recs)
            self._listener.notify("Reading alignments from %s"%
                    self._alignments_fn)
        else:
            alns = get_vj_alignments(self._ref, self._reads,
                    self._cmd_build_index,
                    self._args_build_index,
                    self._cmd_align,
                    self._args_align_v,
                    self._args_align_j,
                    phred_encoding = self._phred_encoding,
                    n_threads = self._n_threads)
            self._listener.notify("Aligning reference sequences to reads")

        # Keep track of the quality scores of the bases that went into the
        # sequences of the clones.
        Q_counts = {}

        # Build clones and use alignments to count mismatches and indels
        cs = CloneSet()
        alnstats = {"V":{}, "J":{}}
        if self._include_cysphe:
            v_refpos_offset = -3
            j_refpos_offset = 3
        else:
            v_refpos_offset = 0
            j_refpos_offset = 0

        try:
            if output_alignments:
                out = zopen(self._alignments_fn, 'w')
                infile = self._reads
            else:
                infile = alignment_file

            prev_time = time()
            if isinstance(infile, gzip.GzipFile):
                infile_size = os.path.getsize(infile.name)
                infile_pos = infile.fileobj.tell
            else:
                infile_size = filesize(infile)
                infile_pos = infile.tell

            self._listener.notify(("PROGRESSBAR", "Alignments", "start"))

            for v_rec, j_rec in alns:
                if self.stopped():
                    logger.warning("Pipeline stopped")
                    return

                if time() - prev_time >= self._update_interval:
                    prev_time = time()
                    if not infile.closed:
                        pos = infile_pos()
                    else:
                        # assuming a closed infile means the entire infile has
                        # been processed.
                        pos = infile_size
                    frac = float(pos) / infile_size
                    self._listener.notify(("PROGRESSBAR", "Alignments", frac))

                if output_alignments:
                    out.write("\t".join(map(str, v_rec)) + "\n" + \
                            "\t".join(map(str, j_rec)) + "\n")

                clone = build_clone(self._ref, v_rec, j_rec,
                        self._include_cysphe, self._clone_classname)

                if clone is None:
                    continue

                seqlen = len(clone.seq)
                if seqlen < self._min_seqlen:
                    continue

                # Count base qualities in the clone (which is possible because
                # at this point the clone is based on a single read)
                lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
                for i in xrange(clone.v.end, clone.j.start):
                    lenfam_Q_counts[clone.qual[i]] += 1
                
                cs.add(clone, merge = True)

                v_allele = self._ref[v_rec.RNAME]
                j_allele = self._ref[j_rec.RNAME]
                # Count errors in the alignments
                for (rec, r_roi_start, r_roi_end) in \
                        ((v_rec, v_allele.refpos + v_refpos_offset, 0),
                        (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                    allele = self._ref[rec.RNAME]
                    lenfam_alnstats = alnstats[allele.region].setdefault(
                            seqlen, {
                        "n"     : 0,
                        "mm"    : 0,
                        "ins"   : 0,
                        "dels"  : 0,
                        "Q_mm"  : [0] * 42,
                        "Q_n"   : [0] * 42})
                    n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(rec,
                            allele.seq,
                            lenfam_alnstats["Q_mm"], lenfam_alnstats["Q_n"],
                            r_roi_start, r_roi_end)
                    lenfam_alnstats["n"] += n
                    lenfam_alnstats["mm"] += mm
                    lenfam_alnstats["ins"] += ins
                    lenfam_alnstats["dels"] += dels
        finally:
            if output_alignments:
                out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

        if len(cs) == 0:
            msg = "No clones found in alignments. \
Was the correct germline reference used?"
            logger.error(msg)
            raise Exception(msg)

        if self._alignment_stats_fn is not None and \
                self._alignment_stats_fn != "":
            logger.info("Writing alignment stats to \"%s\""%
                    self._alignment_stats_fn)
            with zopen(self._alignment_stats_fn, 'w') as out:
                out.write("seqlen,region,n,mm,ins,dels\n")
                for region in alnstats:
                    for seqlen, lenfam_alnstats in \
                            alnstats[region].iteritems():
                        out.write(",".join(map(str,[
                            seqlen, region,
                            lenfam_alnstats["n"],
                            lenfam_alnstats["mm"],
                            lenfam_alnstats["ins"],
                            lenfam_alnstats["dels"]])) + "\n")

        self._save_cloneset(cs, "r")

        # Sum all the counts in the V and J regions separately, and calculate
        # average error rates
        tot_err = {"V":{}, "J":{}}
        for region in ("V", "J"):
            region_stats = alnstats[region]
            x = tot_err[region]
            x["n"] = sum([y["n"] for y in region_stats.itervalues()])
            x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
            x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
            x["dels"]= sum([y["dels"] for y in region_stats.itervalues()])

            n = x["n"]
            if n > 0:
                x["mmr"] = float(x["mm"]) / n 
                x["insr"] = float(x["ins"]) / n
                x["delsr"] = float(x["dels"]) / n
            else:
                x["mmr"] = 0
                x["insr"] = 0
                x["delsr"] = 0
        global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
        global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
        global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
        logger.info("global error rates: mmr: %(global_mmr)s, \
insr: %(global_insr)s, delsr: %(global_delsr)s"%locals())

        # Calculate observed error rates for Phred scores
        Q_mm_stats = {"V":{}, "J":{}}
        for region, region_stats in alnstats.iteritems():
            Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
            Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
            for lenfam_alnstats in region_stats.itervalues():
                for i in xrange(42):
                    Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                    Q_n[i] += lenfam_alnstats["Q_n"][i]

        if self._Q_mm_stats_fn is not None and self._Q_mm_stats_fn != "":
            with zopen(self._Q_mm_stats_fn, 'w') as out:
                out.write("region,Q,n,mm\n")
                for region in Q_mm_stats:
                    for Q,(mm, n) in enumerate(izip(Q_mm_stats[region]["Q_mm"],
                            Q_mm_stats[region]["Q_n"])):
                        out.write("%s,%s,%s,%s\n"%(region, Q, n, mm))

        # Calculate ratio between base quality score assigned by the sequencer
        # and observed base quality (based on alignments with germline
        # reference).
        sum_ratios = 0
        n_ratios = 0
        for region in Q_mm_stats:
            Q_mm = Q_mm_stats[region]["Q_mm"]
            Q_n = Q_mm_stats[region]["Q_n"]
            for q in xrange(42):
                mm = Q_mm[q]
                n = Q_n[q]
                if mm > 0 and n > 0:
                    q_obs = p2q(float(mm) / n)
                    if q_obs > 0:
                        sum_ratios += (q / q_obs) * n
                        n_ratios += n
        if n_ratios > 0:
            alpha = float(sum_ratios) / n_ratios
        else:
            logger.warning('No instances found of a Phred score associated ' +\
                    'with mismatches.')
            alpha = 1.0

        logger.info("Ratio between base quality and observed quality: %s"%
                alpha)

        if self._Q_mm_stats_plot_fn is not None and \
                self._Q_mm_stats_plot_fn != "":
            plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

        # Get median quality score
        Q_n = [0] * 42 # count number of bases for every Q score
        for lenfam_Q_counts in Q_counts.itervalues():
            for q, count in enumerate(lenfam_Q_counts):
                Q_n[q] += count
        i = ((sum(Q_n) + 1) // 2) - 1 # index of median element in Q_n
        j = 0
        for max_Q, count in enumerate(Q_n):
            j += count
            if j > i:
                break
        logger.info("max_Q = %s"%max_Q)

        pool = ConnectedConsumerPool(n_consumers = self._n_threads)
        by_seqlen = lambda clone: len(clone.seq)
        confidence = self._confidence
        for seqlen, clones in groupby(sorted(cs, key = by_seqlen), by_seqlen):
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            cs2 = CloneSet(clones)
            # Calculate expected number of errors based on Q scores
            lenfam_Q_counts = Q_counts[seqlen]

            # get total number of bases between V and J region
            n_o = sum(lenfam_Q_counts)
            mm_o = 0
            for q, count in enumerate(lenfam_Q_counts):
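                # Rescale the sequencer-assigned Q by alpha (the ratio of
                # assigned to observed quality computed earlier) before
                # converting it to an error probability.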
                q /= alpha
                mm_o += q2p(q) * count

            mm_v = alnstats["V"][seqlen]["mm"]
            n_v = alnstats["V"][seqlen]["n"]

            mm_j = alnstats["J"][seqlen]["mm"]
            n_j = alnstats["J"][seqlen]["n"]

            mm_tot = mm_v + mm_o + mm_j
            n_tot = n_v + n_o + n_j
            logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s),\
mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)"%(seqlen,
                mm_v, float(mm_v)/n_v if n_v > 0 else 0,
                mm_o, float(mm_o)/n_o if n_o > 0 else 0,
                mm_j, float(mm_j)/n_j if n_j > 0 else 0,
                mm_tot, float(mm_tot)/n_tot if n_tot > 0 else 0))
            local_mmr = float(mm_tot) / n_tot
            mmr = max(local_mmr, global_mmr)
            logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: \
%(mmr)s, confidence: %(confidence)s, max_Q: %(max_Q)s"%locals())
            pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))
    
        self._listener.notify("Running QMerge and IMerge on bins.")
        self.run_pool(pool, desc = 'QMerge, IMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
        self._save_cloneset(cloneset, "rqi")

        self._listener.notify("Running LMerge")
        cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
                confidence)
        self._save_cloneset(cloneset, "rqil")

        pool = ConnectedConsumerPool(n_consumers = self._n_threads)
        for seqlen, clones in groupby(sorted(cloneset, key = by_seqlen),
                by_seqlen):
            cs2 = CloneSet(clones)
            pool.add_task(wrapper_run_nmerge_on_bin,
                    args = (cs2,))
        self._listener.notify("Running NMerge on bins.")
        self.run_pool(pool, desc = 'NMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable(results))
        self._save_cloneset(cloneset, "rqiln")

        ########################
        # Write clones to file #
        ########################
        self._listener.notify("Writing clones")
        with open(self._output_fn, 'w') as res_ok:
            with open("discarded_clones.tsv", 'w') as res_not_ok:
                header = self._output_hdr
                res_ok.write(header)
                res_not_ok.write(header)

                n_discarded = 0
                for clone in sorted(cloneset,
                        key = lambda clone:(-clone.count, clone.seq)):
                    min_phred = min(clone.qual)
                    aa_seq = nt2aa(clone.seq)
                    n_stop_codons = sum([aa == '*' for aa in aa_seq])
                    frame = len(clone.seq) % 3
                    if min_phred < self._min_phred_threshold \
                            or n_stop_codons > 0 or frame != 0:
                        n_discarded += 1
                        out = res_not_ok
                    else:
                        out = res_ok
                    out.write(clone2str(clone, fmt = self._output_fmt))
        self._listener.notify("Discarded %s clones"%n_discarded)
Example #4
import gzip
import os
from sys import stdout
from time import time

# FastqFormat, FastqRecord, zopen, filesize, term, get_best_match and
# revcomp come from the RTCR codebase.

def checkout(fq1_fn, fq2_fn, adapters, max_mm, search_rc, paired=False):
    assert fq1_fn is not None
    assert not (paired and fq2_fn is not None)

    print 'Handling file(s): %s' % ''.join(
        [fq1_fn, '' if fq2_fn is None else ', %s' % fq2_fn])

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell

    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn is not None:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = None

    outfiles = {}
    for (sample_id, master, slave) in adapters:
        outfiles[sample_id] = {
                "out1" : (open("%s_R1.fastq"%sample_id, 'w'), 'R1') \
                        if not paired else \
                        (open("%s_R12.fastq"%sample_id, 'w'), 'R12'),
                "out2" : (None, None) if fq2 is None else \
                        (open("%s_R2.fastq"%sample_id, 'w'), 'R2')}

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if fq2:
            r2 = next(fq2)
            assert (r1.id == r2.id)
        else:
            r2 = None

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)

            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for master on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found or no full length match found
                matches2, matches_rc2 = master.locate_in(
                    r2.seq, max_mm, search_rc)

                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)

                if not master_match2 and not master_match:
                    # master not found on r1 nor on r2
                    continue

                if not master_match or (master_match2 and \
                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc",
                                 desc=r1.desc,
                                 seq=revcomp(r1.seq),
                                 qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc",
                                     desc=r2.desc,
                                     seq=revcomp(r2.seq),
                                     qual_str=r2.qual_str[::-1])

            # Master adapter has been found, retrieve its UMI (if it has one)
            master_umi = ("", "")
            if master.has_UMI():  # get umi
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                            master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve UMI from master adapter
                    continue

            # Look for slave adapter
            slave_match = None

            slave_umi = ("", "")
            if slave:  # has slave adapter
                if paired:
                    r = r1
                else:
                    r = r2

                slave_matches, slave_matches_rc = slave.locate_in(
                    r.seq, max_mm, search_rc=search_rc)
                slave_match, slave_is_rc = get_best_match(
                    slave_matches, slave_matches_rc)

                if not slave_match:  # No slave found
                    continue

                if slave.has_UMI():  # get umi
                    if slave_is_rc:
                        slave_umi_start = len(
                            r.seq) - (slave_match[1] + len(slave.seq))
                        slave_umi = slave.get_UMI(revcomp(r.seq),
                                                  r.qual_str[::-1],
                                                  slave_umi_start)
                    else:
                        slave_umi = slave.get_UMI(r.seq, r.qual_str,
                                                  slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0] or \
               (best_match[0][0] == master_match[0] and \
                slave_match and \
                (not best_match[1] or not best_match[1][0] or \
                 best_match[1][0] > slave_match[0])):
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, slave_match, sample_id, umi)

        if best_match:
            master_match, slave_match, sample_id, umi = best_match
            for (r, (out, typename)) in ((r1, outfiles[sample_id]["out1"]),
                                         (r2, outfiles[sample_id]["out2"])):
                if not out:
                    continue
                out.write("@%s UMI:%s:%s:%s\n" %
                          (r.id, typename, umi[0], umi[1]))
                out.write("%s\n+\n%s\n" % (r.seq, r.qual_str))
            n_accepted += 1

        frac = float(fq1_filepos()) / fq1_filesize
        if time() - prev_time > .5 or frac == 1.0:
            prev_time = time()
            stdout.write(term.EL(2) + term.CHA(0) + \
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)"%(i + 1,
                        frac*100, n_accepted,
                        (100*float(n_accepted)/(i+1))))
            stdout.flush()

    stdout.write('\n')
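
A hedged driver sketch for checkout(), mirroring how prog_checkout wires
things up above (file names and max_mm are illustrative; BarcodeFormat and
zopen come from the RTCR codebase):

adapters = list(BarcodeFormat.records_in(open("barcodes.txt", "r")))
checkout("reads_R1.fastq.gz", "reads_R2.fastq.gz", adapters,
         max_mm=2, search_rc=True, paired=False)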
Example #5
    def run(self):
        if self._alignments_fn is None or self._alignments_fn == "":
            output_alignments = False
        else:
            output_alignments = True

        if output_alignments and os.path.isfile(self._alignments_fn):
            logger.info("SKIPPING creation of %s" % self._alignments_fn)
            output_alignments = False
            alignment_file = zopen(self._alignments_fn, 'r')
            vj_recs = SAMFormat.records_in(alignment_file)
            # Pull alignments from vj_recs two rows at a time (V, then J)
            alns = ((rec, next(vj_recs)) for rec in vj_recs)
            self._listener.notify("Reading alignments from %s" %
                                  self._alignments_fn)
        else:
            alns = get_vj_alignments(self._ref,
                                     self._reads,
                                     self._cmd_build_index,
                                     self._args_build_index,
                                     self._cmd_align,
                                     self._args_align_v,
                                     self._args_align_j,
                                     phred_encoding=self._phred_encoding,
                                     n_threads=self._n_threads)
            self._listener.notify("Aligning reference sequences to reads")

        # Keep track of the quality scores of the bases that went into the
        # sequences of the clones.
        Q_counts = {}

        # Build clones and use alignments to count mismatches and indels
        cs = CloneSet()
        alnstats = {"V": {}, "J": {}}
        v_refpos_offset = -3
        j_refpos_offset = 3

        try:
            if output_alignments:
                out = zopen(self._alignments_fn, 'w')
                infile = self._reads
            else:
                infile = alignment_file

            prev_time = time()
            if isinstance(infile, gzip.GzipFile):
                infile_size = os.path.getsize(infile.name)
                infile_pos = infile.fileobj.tell
            else:
                infile_size = filesize(infile)
                infile_pos = infile.tell

            self._listener.notify(("PROGRESSBAR", "Alignments", "start"))

            for v_rec, j_rec in alns:
                if self.stopped():
                    logger.warning("Pipeline stopped")
                    return

                if time() - prev_time >= self._update_interval:
                    prev_time = time()
                    if not infile.closed:
                        pos = infile_pos()
                    else:
                        # assuming a closed infile means the entire infile has
                        # been processed.
                        pos = infile_size
                    frac = float(pos) / infile_size
                    self._listener.notify(("PROGRESSBAR", "Alignments", frac))

                if output_alignments:
                    out.write("\t".join(map(str, v_rec)) + "\n" + \
                            "\t".join(map(str, j_rec)) + "\n")

                clone = build_clone(self._ref, v_rec, j_rec,
                                    self._clone_classname)

                if clone is None:
                    continue

                seqlen = len(clone.seq)
                if seqlen < self._min_seqlen:
                    continue

                # Count base qualities in the clone (which is possible because
                # at this point the clone is based on a single read)
                lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
                for i in xrange(clone.v.end, clone.j.start):
                    lenfam_Q_counts[clone.qual[i]] += 1

                cs.add(clone, merge=True)

                v_allele = self._ref[v_rec.RNAME]
                j_allele = self._ref[j_rec.RNAME]
                # Count errors in the alignments
                for (rec, r_roi_start, r_roi_end) in \
                        ((v_rec, v_allele.refpos + v_refpos_offset, 0),
                        (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                    allele = self._ref[rec.RNAME]
                    lenfam_alnstats = alnstats[allele.region].setdefault(
                        seqlen, {
                            "n": 0,
                            "mm": 0,
                            "ins": 0,
                            "dels": 0,
                            "Q_mm": [0] * 42,
                            "Q_n": [0] * 42
                        })
                    n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(
                        rec, allele.seq, lenfam_alnstats["Q_mm"],
                        lenfam_alnstats["Q_n"], r_roi_start, r_roi_end)
                    lenfam_alnstats["n"] += n
                    lenfam_alnstats["mm"] += mm
                    lenfam_alnstats["ins"] += ins
                    lenfam_alnstats["dels"] += dels
        finally:
            if output_alignments:
                out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

        if len(cs) == 0:
            msg = "No clones found in alignments. \
Was the correct germline reference used?"

            logger.error(msg)
            raise Exception(msg)

        if self._alignment_stats_fn is not None and \
                self._alignment_stats_fn != "":
            logger.info("Writing alignment stats to \"%s\"" %
                        self._alignment_stats_fn)
            with zopen(self._alignment_stats_fn, 'w') as out:
                out.write("seqlen,region,n,mm,ins,dels\n")
                for region in alnstats:
                    for seqlen, lenfam_alnstats in \
                            alnstats[region].iteritems():
                        out.write(",".join(
                            map(str, [
                                seqlen, region, lenfam_alnstats["n"],
                                lenfam_alnstats["mm"], lenfam_alnstats["ins"],
                                lenfam_alnstats["dels"]
                            ])) + "\n")

        self._save_cloneset(cs, "r")

        # Sum all the counts in the V and J regions separately, and calculate
        # average error rates
        tot_err = {"V": {}, "J": {}}
        for region in ("V", "J"):
            region_stats = alnstats[region]
            x = tot_err[region]
            x["n"] = sum([y["n"] for y in region_stats.itervalues()])
            x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
            x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
            x["dels"] = sum([y["dels"] for y in region_stats.itervalues()])

            n = x["n"]
            if n > 0:
                x["mmr"] = float(x["mm"]) / n
                x["insr"] = float(x["ins"]) / n
                x["delsr"] = float(x["dels"]) / n
            else:
                x["mmr"] = 0
                x["insr"] = 0
                x["delsr"] = 0
        global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
        global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
        global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
        logger.info("global error rates: mmr: %(global_mmr)s, \
insr: %(global_insr)s, delsr: %(global_delsr)s" % locals())

        # Calculate observed error rates for Phred scores
        Q_mm_stats = {"V": {}, "J": {}}
        for region, region_stats in alnstats.iteritems():
            Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
            Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
            for lenfam_alnstats in region_stats.itervalues():
                for i in xrange(42):
                    Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                    Q_n[i] += lenfam_alnstats["Q_n"][i]

        if self._Q_mm_stats_fn is not None and self._Q_mm_stats_fn != "":
            with zopen(self._Q_mm_stats_fn, 'w') as out:
                out.write("region,Q,n,mm\n")
                for region in Q_mm_stats:
                    for Q, (mm, n) in enumerate(
                            izip(Q_mm_stats[region]["Q_mm"],
                                 Q_mm_stats[region]["Q_n"])):
                        out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

        # Calculate ratio between base quality score assigned by the sequencer
        # and observed base quality (based on alignments with germline
        # reference).
        sum_ratios = 0
        n_ratios = 0
        for region in Q_mm_stats:
            Q_mm = Q_mm_stats[region]["Q_mm"]
            Q_n = Q_mm_stats[region]["Q_n"]
            for q in xrange(42):
                mm = Q_mm[q]
                n = Q_n[q]
                if mm > 0 and n > 0:
                    q_obs = p2q(float(mm) / n)
                    if q_obs > 0:
                        sum_ratios += (q / q_obs) * n
                        n_ratios += n
        if n_ratios > 0:
            alpha = float(sum_ratios) / n_ratios
        else:
            logger.warning('No instances found of a Phred score associated ' +\
                    'with mismatches.')
            alpha = 1.0

        logger.info("Ratio between base quality and observed quality: %s" %
                    alpha)

        if self._Q_mm_stats_plot_fn is not None and \
                self._Q_mm_stats_plot_fn != "":
            plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

        # Get median quality score
        Q_n = [0] * 42  # count number of bases for every Q score
        for lenfam_Q_counts in Q_counts.itervalues():
            for q, count in enumerate(lenfam_Q_counts):
                Q_n[q] += count
        i = ((sum(Q_n) + 1) // 2) - 1  # index of median element in Q_n
        j = 0
        for max_Q, count in enumerate(Q_n):
            j += count
            if j > i:
                break
        logger.info("max_Q = %s" % max_Q)

        pool = ConnectedConsumerPool(n_consumers=self._n_threads)
        by_seqlen = lambda clone: len(clone.seq)
        confidence = self._confidence
        for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            cs2 = CloneSet(clones)
            # Calculate expected number of errors based on Q scores
            lenfam_Q_counts = Q_counts[seqlen]

            # get total number of bases between V and J region
            n_o = sum(lenfam_Q_counts)
            mm_o = 0
            for q, count in enumerate(lenfam_Q_counts):
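                # Rescale the sequencer-assigned Q by alpha before converting
                # it to an error probability (see the ratio computed above).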
                q /= alpha
                mm_o += q2p(q) * count

            mm_v = alnstats["V"][seqlen]["mm"]
            n_v = alnstats["V"][seqlen]["n"]

            mm_j = alnstats["J"][seqlen]["mm"]
            n_j = alnstats["J"][seqlen]["n"]

            mm_tot = mm_v + mm_o + mm_j
            n_tot = n_v + n_o + n_j
            logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s),\
mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)" %
                        (seqlen, mm_v, float(mm_v) / n_v if n_v > 0 else 0,
                         mm_o, float(mm_o) / n_o if n_o > 0 else 0, mm_j,
                         float(mm_j) / n_j if n_j > 0 else 0, mm_tot,
                         float(mm_tot) / n_tot if n_tot > 0 else 0))
            local_mmr = float(mm_tot) / n_tot
            mmr = max(local_mmr, global_mmr)
            logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: \
%(mmr)s, confidence: %(confidence)s, max_Q: %(max_Q)s" % locals())
            pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))

        self._listener.notify("Running QMerge and IMerge on bins.")
        self.run_pool(pool, desc='QMerge, IMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
        self._save_cloneset(cloneset, "rqi")

        self._listener.notify("Running LMerge")
        cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
                              confidence)
        self._save_cloneset(cloneset, "rqil")

        pool = ConnectedConsumerPool(n_consumers=self._n_threads)
        for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
                                      by_seqlen):
            cs2 = CloneSet(clones)
            pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2, ))
        self._listener.notify("Running NMerge on bins.")
        self.run_pool(pool, desc='NMerge')
        results = pool.results
        cloneset = CloneSet(chain.from_iterable(results))
        self._save_cloneset(cloneset, "rqiln")

        ########################
        # Write clones to file #
        ########################
        self._listener.notify("Writing clones")
        sequence_id = 0
        with open(self._output_fn, 'w') as res_ok:
            with open(self._output_not_ok_fn, 'w') as res_not_ok:
                header = '\t'.join(
                        clone2AIRRDict(clone=None, ref=None).keys()) + '\n'
                res_ok.write(header)
                res_not_ok.write(header)

                n_discarded = 0
                for clone in sorted(cloneset,
                                    key=lambda clone:
                                    (-clone.count, clone.seq)):
                    record = clone2AIRRDict(clone=clone, ref=self._ref)
                    min_phred = int(record['junction_minimum_quality_score'])
                    if min_phred < self._min_phred_threshold \
                            or record['stop_codon'] == 'T' or \
                            record['vj_in_frame'] == 'F':
                        n_discarded += 1
                        out = res_not_ok
                    else:
                        out = res_ok
                    sequence_id += 1
                    record['sequence_id'] = str(sequence_id)
                    out.write('\t'.join([v for k, v in record.iteritems()]) +\
                            '\n')
        self._listener.notify("Discarded %s clones" % n_discarded)