Example #1
def filter_bam(input_bam, pore_c_table, output_bam, clean_read_name):
    from pysam import AlignmentFile

    inbam = AlignmentFile(input_bam, "rb")
    outbam = AlignmentFile(output_bam, "wb", template=inbam)

    aligns = pd.read_parquet(pore_c_table,
                             engine=PQ_ENGINE,
                             columns=["align_idx",
                                      "pass_filter"]).set_index(["align_idx"])
    aligns = aligns[aligns["pass_filter"]]

    expected = len(aligns)
    counter = 0

    for align in inbam.fetch(until_eof=True):
        align_idx = int(align.query_name.rsplit(":")[2])
        if align_idx not in aligns.index:
            continue
        if clean_read_name:
            readname_only = align.query_name.split(":")[0]
            align.query_name = readname_only
        outbam.write(align)
        counter += 1
    if counter != expected:
        raise ValueError(
            f"Number of alignments doesn't match. Expected {expected} got {counter}"
        )
    logger.info(f"Wrote {counter} reads to {output_bam}")
Example #2
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from given
    cellular barcodes
    """
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)

    for count, aln in enumerate(track, start=1):
        if count and not count % 100000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln)
Example #3
class BAMWriter:
    def __init__(self, output, indexed_sequence_list, index_options):
        header = self.build_header(indexed_sequence_list, index_options)
        self.writer = AlignmentFile(output, 'wb', header=header)

    def __del__(self):
        self.close()

    def close(self):
        if hasattr(self, 'writer'):
            self.writer.close()
            del self.writer

    def build_header(self, indexed_sequence_list, index_options):
        return {
            'SQ': indexed_sequence_list,
            'PG': [{
                'ID': 'minimap2',
                'PN': 'minimap2',
                'CL': index_options,
                'DS': 'minimap2 invoked by poreplex'
            }]
        }

    def write(self, fields):
        line = '\t'.join(map(str, fields))
        segment = AlignedSegment.fromstring(line, self.writer.header)
        self.writer.write(segment)
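
The class above relies on module-level imports of AlignmentFile and AlignedSegment from pysam. A minimal usage sketch, with made-up inputs whose shape is what pysam expects (SQ entries with SN/LN keys) and a hypothetical minimap2 option string:

from pysam import AlignmentFile, AlignedSegment

seqs = [{'SN': 'chr1', 'LN': 248956422}]           # SQ entries in pysam header-dict form
writer = BAMWriter('out.bam', seqs, '-x map-ont')  # hypothetical index options string

# write() takes the raw SAM columns of a single record; this one is an unmapped read
writer.write(['read1', 4, '*', 0, 0, '*', '*', 0, 0, 'ACGT', 'IIII'])
writer.close()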
Example #4
def clean_sam(sam_fn, clean_sam_fn, orig_fq_fn, NB_MM):
    sam = AlignmentFile(sam_fn, 'r')
    clean_sam = AlignmentFile(clean_sam_fn, 'wb', template=sam)
    reads_kept = set()
    reads_deleted = set()
    for alignment in sam:
        if delete_alignment(alignment, NB_MM):
            reads_deleted.add(alignment.query_name)
        else:
            clean_sam.write(alignment)
            reads_kept.add(alignment.query_name)
    # only add alignment to fastq if read involved hasn't been kept on a cross-mapping loci
    keep_as_failed_list = []
    for read in reads_deleted:
        if read not in reads_kept:
            keep_as_failed_list.append(read)
    seqs_failed = alignments_to_seqs(keep_as_failed_list, orig_fq_fn)
    ## Print discarded alignments in fastq 
    fastq_output_fn = sam_fn[:-7] + ".cl.fq"
    SeqIO.write(seqs_failed, fastq_output_fn, 'fastq')
    ## Print in log file
    log_clean = open("{}.SamCleaner.log".format(sam_fn[:-7]), 'a')
    cpt_treated = len(reads_kept) + len(reads_deleted)
    log_clean.write('Number of alignments treated : {}\nNumber of alignment kept : {}\nNumber of alignments deleted : {}\n'.format(cpt_treated, len(reads_kept), len(reads_deleted)))
    log_clean.close()
    clean_sam.close()
Example #5
File: mapqto0.py Project: PySean/mutools
def umappedq2zero(bamdir):
    """
    Reads in a BAM file, setting the MAPQ value for an alignment segment
    to zero if it is unmapped.
    Opens up both infile and outfile and outputs these modified
    reads to outfile.
    """
    if not os.path.exists(bamdir):
        sys.stderr.write("Sorry, but the specified directory does not exist.")
        sys.exit(1)

    bamfiles = os.listdir(bamdir)
    bampaths = filter(lambda x: x.endswith(".bam"), bamfiles)
    bampaths = map(lambda x: os.path.join(bamdir, x), bampaths)
    for bam in bampaths:
        inbam = AlignmentFile(bam, "rb")
        # Template is specified to maintain the same header information.
        outbam = AlignmentFile("temp.bam", "wb", template=inbam)
        # Construct reads iterator using fetch.
        reads = inbam.fetch(until_eof=True)
        for read in reads:
            if read.is_unmapped:
                read.mapping_quality = 0
            outbam.write(read)  # Don't omit any reads!
        # Close both files so all buffered writes are flushed before replacing the original.
        inbam.close()
        outbam.close()
        # Overwrite the original with the new file with MAPQs set to zero.
        os.rename("temp.bam", bam)
Example #6
File: umis.py Project: roryk/umis
def bamtag(sam, umi_only):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    if umi_only:
        parser_re = re.compile('.*:UMI_(?P<MB>.*)')
    else:
        parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    start_time = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    out_file = AlignmentFile("-", "wh", template=sam_file)

    track = sam_file.fetch(until_eof=True)

    for count, aln in enumerate(track):
        if not count % 100000:
            logger.info("Processed %d alignments.")

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if not umi_only:
            aln.tags += [('XC', match.group('CB'))]

        aln.tags += [('XR', match.group('MB'))]
        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, int(60. * count / total_time)))
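
The regexes above expect fastqtransformed read names of the form READNAME:CELL_<barcode>:UMI_<umi> (or just ...:UMI_<umi> when umi_only is set). A quick, self-contained check with a made-up read name:

import re

parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')
match = parser_re.match('read1:CELL_ACGTACGT:UMI_TTTCCC')  # hypothetical read name
print(match.group('CB'), match.group('MB'))                # -> ACGTACGT TTTCCC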
Example #7
File: umis.py Project: vals/umis
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from given
    cellular barcodes
    """
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln)
Example #8
def main(
    sam: str,
    output: str,
    reference2taxid: str
) -> None:
    """Write row with taxid and classification status for each alignment."""
    aln_infile = AlignmentFile(sam, "r")
    aln_outfile = AlignmentFile('-', "w", template=aln_infile)
    ref2taxid_df = pd.read_csv(
        reference2taxid, sep='\t', names=['acc', 'taxid'], index_col=0)
    output_tsv = open(output, 'w+')

    for aln in aln_infile.fetch(until_eof=True):

        mapped = 'U' if aln.is_unmapped else 'C'
        queryid = aln.query_name
        querylen = aln.query_length

        taxid = 0
        if not aln.is_unmapped:
            taxid = ref2taxid_df.at[aln.reference_name, 'taxid']

        output_tsv.write(
            '{mapped}\t{queryid}\t{taxid}\t0|{querylen}\n'.format(
                mapped=mapped, queryid=queryid, taxid=taxid, querylen=querylen
            )
        )

        aln_outfile.write(aln)
Example #9
def main(args):
    if args.log:
        log = MappingStats(args.log)
        #if args.gff3:
        #    log.add_gff3(args.gff3)
    else:
        log = None
    infile = AlignmentFile(args.bam, "rb")
    outfile = AlignmentFile(args.output, "wb", template=infile)
    for alignment in infile:
        tags = {x[0]: x[1] for x in alignment.tags}
        # checking and counting special cases
        c1 = check_multi_location(alignment, tags, log)
        c2 = check_clipping(alignment, log)
        if log:
            log.count(alignment)
            check_barcode_is_off(alignment, tags, log)
            c3 = check_is_mapped(alignment, log)
            if c1 or c2 or c3:
                # if any of these checks fail (return true), the read will not be counted in mpileup
                # if they all pass count it as passing
                log.passing(alignment)
        # writing the filtered sam file
        outfile.write(alignment)
    if args.log:
        log.write()
Example #10
def remove_low_cellcount_reads(inbam, outbam, mincount, log):
    """
    This function takes a bam file with barcodes in the
    RG tag as input and outputs a bam file containing
    only barcodes that exceed the minimum number of aligments
    for a given barcode.

    """
    treatment = AlignmentFile(inbam, 'rb')
    header = treatment.header
    barcodecounts = {bc['ID']: 0 for bc in header['RG']}

    # first parse the file to determine the per barcode
    # alignment counts

    for aln in treatment.fetch(until_eof=True):
        if aln.is_proper_pair and aln.is_read1 or not aln.is_paired:
            rg = aln.get_tag('RG')
            barcodecounts[rg] += 1

    treatment.close()

    # make new header with the valid barcodes
    treatment = AlignmentFile(inbam, 'rb')
    header = treatment.header.to_dict().copy()
    rgheader = []
    for rg in header['RG']:
        if barcodecounts[rg['ID']] >= mincount:
            rgheader.append(rg)

    header['RG'] = rgheader

    #log summary
    log_content = {}
    log_content['below_minbarcodecounts'] = 0
    log_content['above_minbarcodecounts'] = 0
    log_content['total'] = 0
    for bc in barcodecounts:
        log_content['total'] += barcodecounts[bc]
        if barcodecounts[bc] >= mincount:
            log_content['above_minbarcodecounts'] += barcodecounts[bc]
        else:
            log_content['below_minbarcodecounts'] += barcodecounts[bc]

    bam_writer = AlignmentFile(outbam, 'wb', header=header)
    for aln in treatment.fetch(until_eof=True):
        if barcodecounts[aln.get_tag('RG')] >= mincount:
            bam_writer.write(aln)

    treatment.close()
    bam_writer.close()

    #write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
Example #11
def deduplicate_reads(bamin, bamout, report, by_rg=True):
    """This script deduplicates the original bamfile.
    Deduplication removes reads align to the same position.
    If the reads in the bamfile contain a RG tag and
    by_rg=True, deduplication is done for each group separately.
    Parameters
    ----------
    bamfile : str
        Sorted bamfile containing barcoded reads.
    output : str
        Output path to a bamfile that contains the deduplicated reads.
    by_rg : boolean
        If True, the reads will be split by group tag.
    """
    bamfile = AlignmentFile(bamin, 'rb')
    output = AlignmentFile(bamout, 'wb', template=bamfile)

    log_counts = {'total': 0, 'retained': 0, 'removed': 0}

    # grep all barcodes from the header
    #barcodes = set()
    last_barcode = {}

    for aln in bamfile.fetch():
        # if the previous position key matches the current one
        # skip the read
        val = (aln.reference_id, aln.reference_start, aln.is_reverse, aln.tlen)
        if aln.has_tag('RG') and by_rg:
            rg = aln.get_tag('RG')
        else:
            rg = 'dummy'
        log_counts['total'] += 1

        if rg not in last_barcode:
            # first read seen for this group: keep it
            output.write(aln)
            last_barcode[rg] = val
        elif val == last_barcode[rg]:
            # duplicate of the previous read in this group: drop it
            log_counts['removed'] += 1
            continue
        else:
            output.write(aln)
            last_barcode[rg] = val

        log_counts['retained'] += 1

        if (log_counts['retained'] % 1000000) == 0:
            print("Processed {}/{} total/removed reads".format(
                log_counts['total'], log_counts['removed']))

    #write log file
    with open(report, 'w') as f:
        f.write('\tcounts\n')
        for icnt in log_counts:
            f.write('{}\t{}\n'.format(icnt, log_counts[icnt]))
Example #12
def remove_chroms(bamin, bamout, rmchroms):
    """ Removes chromosomes from bam-file.

    The function searches for matching chromosomes
    using regular expressions.
    For example, rmchroms=['chrM', '_random']
    would remove 'chrM' as well as all random chromosomes.
    E.g. chr1_KI270706v1_random.

    Parameters
    ----------
    bamin : str
       Input bam file.
    bamout : str
       Output bam file.
    rmchroms : list(str)
       List of chromosome names or name patterns to be removed.

    Returns
    -------
    None

    """

    treatment = AlignmentFile(bamin, 'rb')

    header = copy(treatment.header.as_dict())
    newheader = []
    for seq in header['SQ']:

        if not any([x in seq['SN'] for x in rmchroms]):
            newheader.append(seq)

    header['SQ'] = newheader

    tidmap = {k['SN']: i for i, k in enumerate(header['SQ'])}

    bam_writer = AlignmentFile(bamout, 'wb', header=header)

    # write new bam files containing only valid chromosomes
    for aln in treatment.fetch(until_eof=True):
        if aln.is_unmapped:
            continue
        if aln.reference_name not in tidmap or aln.next_reference_name not in tidmap:
            continue

        refid = tidmap[aln.reference_name]
        refnextid = tidmap[aln.next_reference_name]

        aln.reference_id = refid
        aln.next_reference_id = refnextid
        bam_writer.write(aln)

    bam_writer.close()
    treatment.close()
Example #13
def filter_reads(a, barcodes, out_bam_filename):
    # a template (or header) is required when opening a BAM file for writing
    outstream = AlignmentFile(out_bam_filename, 'wb', template=a)

    count = 1
    for read in a:
        if (count % 100000) == 0:
            print(f'Processed {count} reads')
        if read.has_tag('CB') and read.get_tag('CB') in barcodes:
            outstream.write(read)
            count += 1

    print(f'Processed {count-1} reads')
    outstream.close()
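
The parameters of filter_reads are untyped in the snippet above; a minimal call sketch, assuming a is an already opened pysam AlignmentFile and barcodes is a set of CB tag values (file names and barcodes below are hypothetical):

from pysam import AlignmentFile

bam = AlignmentFile('possorted_genome_bam.bam', 'rb')
barcodes = {'AAACCTGAGAAACCAT-1', 'AAACCTGAGAAACCGC-1'}  # example 10x-style barcodes
filter_reads(bam, barcodes, 'subset.bam')
bam.close()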
Example #14
def extract_barcode(sam, barcode):

    parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')
    sam_file = AlignmentFile(sam, mode='r')
    filter_file = AlignmentFile("-", mode='wh', template=sam_file)
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        if aln.is_unmapped:
            continue
        match = parser_re.match(aln.qname)
        CB = match.group('CB')
        if CB == barcode:
            filter_file.write(aln)
Example #15
def extract_barcode(sam, barcode):

    parser_re = re.compile(".*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)")
    sam_file = AlignmentFile(sam, mode="r")
    filter_file = AlignmentFile("-", mode="wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)
    for i, aln in enumerate(track):
        if aln.is_unmapped:
            continue
        match = parser_re.match(aln.qname)
        CB = match.group("CB")
        if CB == barcode:
            filter_file.write(aln)
Example #16
def get_reads_with_bcs_sam(bc_file, sam_file, out_handle=stdout):
    bcs = parse_bc_list(bc_file)
    sam_file = open_samfile(sam_file)
    sam_out = SamFile(out_handle, "w", template=sam_file)
    for read in sam_file:
        try:
            bc = get_bc_sam(read)
            if bc in bcs:
                continue
            sam_out.write(read)
        except KeyError:
            pass
    sam_file.close()
    sam_out.close()
Example #17
def _sort(in_filename, out_filename):
    '''Custom sorts SAM file.'''
    sam_file = AlignmentFile(in_filename, 'r')
    out_file = AlignmentFile(out_filename,
                             'wh',
                             template=sam_file,
                             header=sam_file.header)

    for read in sorted([read for read in sam_file],
                       key=lambda x: (-x.query_length, x.reference_start)):
        out_file.write(read)

    out_file.close()

    return out_filename
Example #18
def pairs_to_telbam(af_pairs: AlignmentFile, af_telbam: AlignmentFile):
    read_iter = af_pairs.fetch(until_eof=True)
    while True:
        read_a = next(read_iter, None)
        if read_a is None:
            break
        read_b = next(read_iter)
        qseq = read_a.query_sequence
        if TEL_PATS[0] in qseq or TEL_PATS[1] in qseq:
            af_telbam.write(read_a)
            af_telbam.write(read_b)
        else:
            qseq = read_b.query_sequence
            if TEL_PATS[0] in qseq or TEL_PATS[1] in qseq:
                af_telbam.write(read_a)
                af_telbam.write(read_b)
    return
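
TEL_PATS is defined elsewhere in the source project; a minimal driver sketch, assuming the canonical telomeric repeat and its reverse complement as the two patterns and hypothetical file names (the input is expected to be collated so that mates are adjacent):

from pysam import AlignmentFile

TEL_PATS = ('TTAGGGTTAGGG', 'CCCTAACCCTAA')  # assumed telomeric repeat patterns

with AlignmentFile('name_sorted_pairs.bam', 'rb') as af_pairs, \
        AlignmentFile('telbam.bam', 'wb', template=af_pairs) as af_telbam:
    pairs_to_telbam(af_pairs, af_telbam)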
Example #19
def remove_idx_from_read_names(input_bam: Path):
    """ Replace READNAME:ALIGN_IDX with just READNAME

    Originally created because WhatsHap requires unique read names.
    """

    infile = AlignmentFile(input_bam, "rb")
    stdout = AlignmentFile("-", "wb", template=infile)
    align_iter = infile.fetch(until_eof=True)

    for read in align_iter:
        readname = read.query_name.split(":")[0]
        read.query_name = readname
        stdout.write(read)

    stdout.close()
    infile.close()
Example #20
def downgrade_read_edges(in_fpath,
                         out_fpath,
                         read_start_size,
                         read_end_size,
                         qual_to_substract=QUAL_TO_SUBSTRACT):
    in_sam = AlignmentFile(in_fpath)
    out_sam = AlignmentFile(out_fpath, 'wb', template=in_sam)
    for aligned_read in in_sam:
        if (aligned_read.has_tag(LEFT_DOWNGRADED_TAG)
                or aligned_read.has_tag(RIGTH_DOWNGRADED_TAG)):
            raise RuntimeError('Edge qualities already downgraded\n')

        _downgrade_edge_qualities(aligned_read,
                                  read_start_size,
                                  read_end_size,
                                  qual_to_substract=qual_to_substract)
        out_sam.write(aligned_read)
Example #21
def add_idx_to_read_name(input_bam: Path):
    """ Changes the readname to be READNAME:ALIGN_IDX to have 'unique' readnames

    WhatsHap requires unique read names.
    """

    infile = AlignmentFile(input_bam, "rb")
    stdout = AlignmentFile("-", "wb", template=infile)
    align_iter = infile.fetch(until_eof=True)

    i = 0
    for read in align_iter:
        read.query_name = read.query_name + ":" + str(i)
        stdout.write(read)
        i = i + 1

    stdout.close()
    infile.close()
Example #22
def bam(args, logger):
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")
        
    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')

    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)

    curr_chrom = ""
    for line in input_:

        if input_.references[line.reference_id] != curr_chrom:
            curr_chrom = input_.references[line.reference_id]
            positions, deltas = get_positions_and_deltas(chrom_mods,
                                                         curr_chrom,
                                                         logger)

        try:
            start_delta = find_delta(positions,
                                     deltas,
                                     int(line.reference_start))
            # end_delta = find_delta(positions,
            #                        deltas,
            #                        int(line.reference_end))
            mod_index = build_modification_index(positions,
                                                 deltas,
                                                 line,
                                                 start_delta)
            # new_cigar = update_cigar(mod_index, line.cigar)

            # if len(line.cigar) < len(new_cigar):
            #     line.cigar = new_cigar[-1*len(line.cigar):]
            # else:
            #     line.cigar = new_cigar
            line.reference_start = int(line.reference_start) + start_delta
            output.write(line)
        except IndexError:
            print("IndexError: ", line)
        except TypeError:
            print("TypeError:", line)
Example #23
File: umis.py Project: vals/umis
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    if is_python3():
        queryalignment = next(track)
    else:
        queryalignment = track.next()
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            aln.tags += [('XC', match.group('CB'))]
        if "molecular" in annotations:
            aln.tags += [('RX', match.group('MB'))]
        if "sample" in annotations:
            aln.tags += [('XS', match.group('SB'))]

        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments." % count)
Example #24
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    for count, aln in enumerate(track, start=1):
        if count and not count % 100000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            aln.tags += [('XC', match.group('CB'))]
        if "molecular" in annotations:
            aln.tags += [('RX', match.group('MB'))]
        if "sample" in annotations:
            aln.tags += [('XS', match.group('SB'))]

        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(
        total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments." % count)
Example #25
def deduplicate_reads(bamin, bamout, tag='CB'):
    """Performs deduplication within barcodes/cells.

    Parameters
    ----------
    bamin : str
        Position sorted input bamfile.
    bamout : str
        Output file containing deduplicated reads.
    tag : str or callable
        Indicates the barcode tag or custom function to extract the barcode. Default: 'CB'

    Returns
    -------
    None

    """
    bamfile = AlignmentFile(bamin, 'rb')
    output = AlignmentFile(bamout, 'wb', template=bamfile)

    last_barcode = {}
    barcoder = Barcoder(tag)

    for aln in bamfile.fetch():
        # if the previous position key matches the current one
        # skip the read
        val = (aln.reference_id, aln.reference_start, aln.is_reverse, aln.tlen)
        barcode = barcoder(aln)

        if barcode not in last_barcode:
            # first read seen for this barcode: keep it
            output.write(aln)
            last_barcode[barcode] = val
        elif val == last_barcode[barcode]:
            # duplicate of the previous read for this barcode: skip it
            continue
        else:
            output.write(aln)
            last_barcode[barcode] = val
Example #26
def atac(args, logger):
    """

    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')

    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)

    curr_chrom = ""
    for line in input_:

        if input_.references[line.reference_id] != curr_chrom:
            curr_chrom = input_.references[line.reference_id]
            positions, deltas = get_positions_and_deltas(
                chrom_mods, curr_chrom, logger)
        # if line.is_reverse and (line.reference_length != len(line.seq)):
        #     print line
        #     print line.reference_length
        #     print line.cigar
        #     print len(line.seq)
        #     print len(line.get_reference_positions())
#        try:
        if not line.is_reverse:
            start_delta = find_delta(positions, deltas,
                                     int(line.reference_start))
            line.reference_start = int(line.reference_start) + start_delta
        else:
            end_delta = find_delta(positions, deltas, int(line.reference_end))
            mapped_end = int(line.reference_end) + end_delta
            line.reference_start = mapped_end - len(
                line.seq)  # line.reference_length
        output.write(line)
Example #27
def run_project_alignments(args):
    """ Project mapped sam file"""

    sam = args.sam
    chromosomes = args.chromosomes.split(",")
    graph_dir = args.data_dir

    linear_ref_paths = {}
    haplotype_paths = {}

    out_sam = AlignmentFile(args.out_sam, "w", template=AlignmentFile(sam))

    logging.info("Reading linear paths")
    for chromosome in tqdm(chromosomes):
        linear_ref_paths[chromosome] = NumpyIndexedInterval.from_file(
            graph_dir + chromosome + "_linear_pathv2.interval")
        haplotype_paths[chromosome] = NumpyIndexedInterval.from_file(
            args.linear_paths_base_name + "_" + chromosome +
            ".intervalcollection.indexed")

    logging.info("Converting")
    n_unmapped = 0
    for sam_record in tqdm(read_sam(sam), total=number_of_lines_in_file(sam)):
        chromosome = sam_record.chromosome
        if chromosome is None:
            out_sam.write(sam_record.pysam_object)
            n_unmapped += 1
            continue
        length = len(sam_record.sequence)
        projected_start = convert_position_on_haplotype_to_position_on_linear_ref(
            linear_ref_paths[chromosome], haplotype_paths[chromosome],
            sam_record.start)
        sam_record.set_start(projected_start)
        out_sam.write(sam_record.pysam_object)

    logging.info("%d sam records missed chromosome (unmapped)" % n_unmapped)
Example #28
    print("Alignment of query %s (length %d) : [%d - %d) to %s (length %d) : %s - %s. Identity: %.3f. Length on query (reference): %d (%d). Reverse: %r" % \
            (r.query_name, init_length, query_alignment_start, query_alignment_end, \
                alignment.get_reference_name(r.reference_id), alignment.lengths[r.reference_id], \
                reference_start_str, reference_end_str, \
                idy * 100., r.query_alignment_length, r.reference_length, r.is_reverse), \
            file=sys.stderr)

    print(
        "%s\t%d\t%d\t%d\t%r\t%s\t%d\t%d\t%d\t%.3f" %
        (r.query_name, init_length, query_alignment_start, query_alignment_end,
         r.is_reverse, alignment.get_reference_name(
             r.reference_id), alignment.lengths[r.reference_id],
         r.reference_start, r.reference_end, idy * 100.))

    if args.filtered:
        out_alignment.write(r)

    if args.bed:
        if r.reference_length < 2 * args.bed_trim:
            print("WARN reference span %d to small for used trim %d" %
                  (r.reference_length, args.bed_trim),
                  file=sys.stderr)
        else:
            print("%s\t%d\t%d\t%s" %
                  (alignment.get_reference_name(
                      r.reference_id), r.reference_start + args.bed_trim,
                   r.reference_end - args.bed_trim, r.query_name),
                  file=out_bed)

if args.filtered:
    out_alignment.close()
Example #29
def remove_chroms(inbam, outbam, rmchroms, log):
    """
    This function takes a bam-file and outputs
    a bam-file in which the specified chromosomes
    have been removed.

    The function searches for matching chromosomes
    using regular expressions.
    For example, rmchroms=['chrM', '_random']
    would remove 'chrM' as well as all random chromosomes.
    E.g. chr1_KI270706v1_random.
    """

    treatment = AlignmentFile(inbam, 'rb')

    header = treatment.header
    new_chroms = []
    chrnames = []

    # tid_map is to reindex the chromosomes in the
    # new bam file.
    tid_map = [-1 for i in range(len(header['SQ']))]

    N = 0

    chr_to_remove_reason = {}
    # make new header with valid chromsomes
    for i, seq in enumerate(header['SQ']):
        keep = True
        for chrom in rmchroms:
            if chrom in seq['SN']:
                keep = False
                chr_to_remove_reason[seq['SN']] = chrom
                break
        if keep:
            tid_map[i] = N
            N += 1
            new_chroms.append(seq)
            chrnames.append(seq['SN'])

    new_header = {'SQ': new_chroms}

    bam_writer = AlignmentFile(outbam, 'wb', header=new_header)

    log_content = {chrom: 0 for chrom in rmchroms}
    log_content['remaining'] = 0
    log_content['unmapped'] = 0
    log_content['total'] = 0

    # write new bam files containing only valid chromosomes
    for aln in treatment.fetch(until_eof=True):
        log_content['total'] += 1
        if aln.is_unmapped:
            log_content['unmapped'] += 1
            continue
        if aln.reference_name in chrnames:
            aln.reference_id = tid_map[aln.reference_id]
            if aln.is_paired and aln.is_proper_pair:
                aln.next_reference_id = tid_map[aln.next_reference_id]
            bam_writer.write(aln)
            log_content['remaining'] += 1
        else:
            log_content[chr_to_remove_reason[aln.reference_name]] += 1

    bam_writer.close()
    treatment.close()

    #write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
Example #30
def remove_low_mapq_reads(inbam, outbam, minmapq, log):
    """
    This function takes a bam file and produces
    a new bam file containing only alignments with
    at least the given minimum mapping quality.

    For paired-end data, only alignments where
    both mates exceed the threshold are retained.
    """

    treatment = AlignmentFile(inbam, 'rb')
    bam_writer = AlignmentFile(outbam, 'wb', template=treatment)

    log_content = {}
    log_content['below_mapq'] = 0
    log_content['above_mapq'] = 0
    log_content['total'] = 0

    waiting_for_pair = {}
    for aln in treatment.fetch(until_eof=True):
        log_content['total'] += 1
        if aln.is_paired:
            # mates are paired up by read name
            if aln.query_name not in waiting_for_pair:
                # for the first mate that we encounter
                # we get here and keep the aln in the waiting list
                # if it is valid
                if aln.mapq >= minmapq and not aln.is_unmapped:
                    waiting_for_pair[aln.query_name] = aln
                else:
                    # None marks an invalid first mate
                    waiting_for_pair[aln.query_name] = None
                    log_content['below_mapq'] += 1
            else:
                # for the second mate that we encounter
                # we get here
                if aln.mapq >= minmapq and not aln.is_unmapped \
                   and waiting_for_pair[aln.query_name] is not None:
                    # both mates satisfy the min mapq threshold
                    # and are mapped.
                    # write them into the output file
                    bam_writer.write(waiting_for_pair[aln.query_name])
                    bam_writer.write(aln)
                    log_content['above_mapq'] += 2
                else:
                    # either the first mate was invalid
                    # or the second mate was below mapq
                    log_content['below_mapq'] += 1

                # finally clear the waiting list entry to save memory
                waiting_for_pair.pop(aln.query_name)

        else:
            # single end
            if aln.mapq >= minmapq:
                bam_writer.write(aln)
                log_content['above_mapq'] += 1
            else:
                log_content['below_mapq'] += 1

    treatment.close()
    bam_writer.close()

    #write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
Example #31
def _processs_bam_file(bam_fname, metrics, mapq_th, skipped, segmentation=None, gap_th=1000000):
    """
    Extract data from BAM file into chunks of genome.

    Parameters
    ----------
    bam_fname : str
        BAM file with mapped reads.
    metrics : iCount.Metrics
        Metrics object for storing analysis metadata.
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    skipped : str
        Output BAM file to store reads that do not map as expected by segmentation and
        reference genome sequence. If read's second start does not fall on any of
        segmentation borders, it is considered problematic. If segmentation is not provided,
        every read in two parts with gap longer than gap_th is not used (skipped).
        All such reads are reported to the user for further exploration.
    segmentation : str
        File with segmentation (obtained by ``iCount segment``).
    gap_th : int
        Reads with gaps less than gap_th are treated as if they have no gap.

    Returns
    -------
    dict
        Internal structure of BAM file, described in docstring.
    list
        BAM file with

    """
    metrics.all_recs = 0  # All records
    metrics.notmapped_recs = 0  # Not mapped records
    metrics.mapped_recs = 0  # Mapped records
    metrics.lowmapq_recs = 0  # Records with insufficient quality
    metrics.used_recs = 0  # Records used in analysis (all - unmapped - lowmapq)
    metrics.invalidrandomer_recs = 0  # Records with invalid randomer
    metrics.norandomer_recs = 0  # Records with no randomer
    metrics.bc_cn = {}  # Barcode counter
    metrics.strange_recs = 0  # Strange records (not expected by segmentation)

    def finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
        """Yield appropriate data."""
        reads_to_process_fwd = {}
        for pos in list(reads_pending_fwd):
            if pos < start:
                reads_to_process_fwd[pos] = reads_pending_fwd.pop(pos)
        if reads_to_process_fwd:
            yield ((chrom, '+'), progress, reads_to_process_fwd)

        reads_to_process_rev = {}
        for pos in list(reads_pending_rev):
            if pos < start:
                reads_to_process_rev[pos] = reads_pending_rev.pop(pos)
        if reads_to_process_rev:
            yield ((chrom, '-'), progress, reads_to_process_rev)

    genome_done = 0
    ann_data = None
    LOGGER.info('Detecting cross-links...')
    with AlignmentFile(bam_fname, 'rb') as bamfile:
        strange_bam = AlignmentFile(skipped, 'wb', header=bamfile.header)
        genome_size = sum([contig['LN'] for contig in bamfile.header['SQ']])
        for chrom in bamfile.references:
            chrom_len = bamfile.header['SQ'][bamfile.get_tid(chrom)]['LN']
            if segmentation:
                # pylint: disable=protected-access
                ann_data = iCount.genomes.segment._prepare_segmentation(segmentation, chrom)

            reads_pending_fwd = {}
            reads_pending_rev = {}
            read = None
            for read in bamfile.fetch(chrom):
                metrics.all_recs += 1
                if read.is_unmapped:
                    metrics.notmapped_recs += 1
                    continue
                metrics.mapped_recs += 1
                if read.mapping_quality < mapq_th:
                    metrics.lowmapq_recs += 1
                    continue
                metrics.used_recs += 1

                rdata = _get_read_data(
                    read, metrics, mapq_th, segmentation=ann_data, gap_th=gap_th)
                (xlink_pos, barcode, is_strange, strand), read_data = rdata[0:4], rdata[4:]

                if is_strange:
                    strange_bam.write(read)
                else:
                    if strand == '+':
                        reads_pending_fwd.setdefault(
                            xlink_pos, {}).setdefault(barcode, []).append(read_data)
                    else:
                        reads_pending_rev.setdefault(
                            xlink_pos, {}).setdefault(barcode, []).append(read_data)

            # Sliding window start (smaller coordinate)
            start = 0 if read is None else (0 if not read.positions else read.positions[0])
            progress = round(min((genome_done + start) / genome_size, 1.0), 4)

            for data in finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
                yield data

            start = chrom_len
            progress = round(min((genome_done + start) / genome_size, 1.0), 4)
            for data in finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
                yield data

            genome_done += chrom_len

    # Report:
    LOGGER.info('All records in BAM file: %d', metrics.all_recs)
    LOGGER.info('Reads not mapped: %d', metrics.notmapped_recs)
    LOGGER.info('Mapped reads records (hits): %d', metrics.mapped_recs)
    LOGGER.info('Hits ignored because of low MAPQ: %d', metrics.lowmapq_recs)
    LOGGER.info('Records used for quantification: %d', metrics.used_recs)
    LOGGER.info('Records with invalid randomer info in header: %d', metrics.invalidrandomer_recs)
    LOGGER.info('Records with no randomer info: %d', metrics.norandomer_recs)
    LOGGER.info('Ten most frequent randomers:')
    top10 = sorted(
        [(count, barcode) for barcode, count in metrics.bc_cn.items()], reverse=True)[:10]
    for count, barcode in top10:
        LOGGER.info('    %s: %d', barcode, count)
    LOGGER.info('There are %d reads with second-start not falling on segmentation. They are '
                'reported in file: %s', metrics.strange_recs, skipped)
Example #32
import sys
from pysam import AlignmentFile
from argparse import ArgumentParser

valid_spliced_reads=0
problem_reads=0

parser = ArgumentParser()
parser.add_argument('infile', nargs='?', default='-')
parser.add_argument('outfile', nargs='?', default='-')
args = parser.parse_args()
infile = AlignmentFile(args.infile, 'r')
outfile = AlignmentFile(args.outfile, 'wh', template=infile)

for read in infile:
    splice_len = 0
    min_edge = 1e6
    if read.mapping_quality < 10: continue
    for cig_op, cig_len in read.cigartuples:
        if cig_op == 3: # N
            splice_len += cig_len
        elif cig_op == 0:
            min_edge = min(min_edge, cig_len)
    if splice_len > 50 and min_edge >= 6:
        outfile.write(read)
        valid_spliced_reads += 1
        if valid_spliced_reads % 100000 == 0:
            sys.stderr.write("%d valid, %d problematic spliced reads\n" % (valid_spliced_reads, problem_reads) )
sys.stderr.write("%d valid, %d problematic spliced reads\n" % (valid_spliced_reads, problem_reads) )

Example #33
class Writer(Thread):

    def __init__(self, mode, iterator, aligner, fd=sys.stdout, duplex=False, ref_fn=None, groups=None, group_key=None):
        super().__init__()
        self.fd = fd
        self.log = []
        self.mode = mode
        self.duplex = duplex
        self.aligner = aligner
        self.iterator = iterator
        self.fastq = mode == 'wfq'
        self.group_key = group_key
        self.output = AlignmentFile(
            fd, 'w' if self.fastq else self.mode, add_sam_header=not self.fastq,
            reference_filename=ref_fn,
            header=AlignmentHeader.from_references(
                reference_names=aligner.seq_names if aligner else [],
                reference_lengths=[
                    len(aligner.seq(name)) for name in aligner.seq_names
                ] if aligner else [],
                text=sam_header(groups),
            )
        )

    def run(self):
        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, res in self.iterator:

                seq = res['sequence']
                qstring = res.get('qstring', '*')
                mean_qscore = res.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = res.get('mapping', False)
                mods_tags = res.get('mods', [])

                if self.duplex:
                    samples = len(read[0].signal) + len(read[1].signal)
                    read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
                else:
                    samples = len(read.signal)
                    read_id = read.read_id

                tags = [
                    f'RG:Z:{read.run_id}_{self.group_key}',
                    f'qs:i:{round(mean_qscore)}',
                    *read.tagdata(),
                    *mods_tags,
                ]

                if len(seq):
                    if self.mode == 'wfq':
                        write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
                    else:
                        self.output.write(
                            AlignedSegment.fromstring(
                                sam_record(read_id, seq, qstring, mapping, tags=tags),
                                self.output.header
                            )
                        )
                    if self.duplex:
                        summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping))
                    else:
                        summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                    self.log.append((read_id, samples))

                else:
                    logger.warn("> skipping empty sequence %s", read_id)
Example #34
def call_family_consensus(
    current_family_forward_size: int,
    current_family_reverse_size: int,
    current_family_size: int,
    debug: bool,
    debug_family_ids: List[str],
    debug_family_location: str,
    family_file_prefix: str,
    family_index: int,
    input_bam: pysam.AlignmentFile,
    output_bam: pysam.AlignmentFile,
    synthetic_read_prefix: str,
    temp_bam_filename_forward: str,
    temp_bam_filename_reverse: str,
    calling_method: str,
) -> None:
    """
    Call consensus read for family identifying if collapse is required and
    selecting if forward or reverse orientation should be used
    :param current_family_forward_size: number of reads in forward orientation for this family
    :param current_family_reverse_size: number of reads in reverse orientation for this family
    :param current_family_size: total family size
    :param debug: debug mode
    :param debug_family_ids: families for which to generate debug files
    :param debug_family_location: location where to save the debug files
    :param family_file_prefix: prefix of family files
    :param family_index: index for the family
    :param input_bam: input bam file (opened)
    :param output_bam: output bam file
    :param synthetic_read_prefix: prefix for synthetic reads
    :param temp_bam_filename_forward: temp filename of file with forward reads
    :param temp_bam_filename_reverse: temp filename of file with reverse reads
    :param calling_method: method for base calling
    :return: None
    """
    if current_family_size > 1:
        if current_family_forward_size >= current_family_reverse_size:
            new_read = call_consensus(
                temp_bam_filename_forward,
                new_read_name=f'{synthetic_read_prefix}{family_index}',
                temp_sorted_filename=f'{temp_bam_filename_forward}.sorted.bam',
                calling_method=calling_method)
            output_bam.write(new_read)
        else:
            new_read = call_consensus(
                temp_bam_filename_reverse,
                new_read_name=f'{synthetic_read_prefix}{family_index}',
                temp_sorted_filename=f'{temp_bam_filename_reverse}.sorted.bam',
                calling_method=calling_method)
            output_bam.write(new_read)
    else:
        if current_family_forward_size == 1:
            # copy read, could cache last read and avoid re-opening file
            with pysam.AlignmentFile(temp_bam_filename_forward,
                                     "rb") as family_file:
                first_read = family_file.__next__()
                output_bam.write(first_read)
        else:
            # copy read, could cache last read and avoid re-opening file
            with pysam.AlignmentFile(temp_bam_filename_reverse,
                                     "rb") as family_file:
                first_read = family_file.__next__()
                output_bam.write(first_read)
    if debug:
        # save information about specific families
        if family_index in debug_family_ids:
            save_family_debug(debug_family_location, family_file_prefix,
                              family_index, input_bam, new_read,
                              temp_bam_filename_forward,
                              temp_bam_filename_reverse)
Example #35
class CTCWriter(Thread):
    """
    CTC writer process that writes output numpy training data.
    """
    def __init__(
            self, mode, iterator, aligner, fd=sys.stdout, min_coverage=0.90,
            min_accuracy=0.99, ref_fn=None, groups=None, group_key=None,
    ):
        super().__init__()
        self.fd = fd
        self.log = []
        self.mode = mode
        self.aligner = aligner
        self.iterator = iterator
        self.group_key = group_key
        self.min_coverage = min_coverage
        self.min_accuracy = min_accuracy
        self.output = AlignmentFile(
            fd, 'w' if self.mode == 'wfq' else self.mode, add_sam_header=self.mode != 'wfq',
            reference_filename=ref_fn,
            header=AlignmentHeader.from_references(
                reference_names=aligner.seq_names,
                reference_lengths=[len(aligner.seq(name)) for name in aligner.seq_names],
                text=sam_header(groups),
            )
        )

    def run(self):

        chunks = []
        targets = []
        lengths = []

        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, ctc_data in self.iterator:

                seq = ctc_data['sequence']
                qstring = ctc_data['qstring']
                mean_qscore = ctc_data.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = ctc_data.get('mapping', False)

                self.log.append((read.read_id, len(read.signal)))

                if len(seq) == 0 or mapping is None:
                    continue

                cov = (mapping.q_en - mapping.q_st) / len(seq)
                acc = mapping.mlen / mapping.blen
                refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)

                if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                    continue

                self.output.write(
                    AlignedSegment.fromstring(
                        sam_record(read.read_id, seq, qstring, mapping),
                        self.output.header
                    )
                )
                summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                if mapping.strand == -1:
                    refseq = mappy.revcomp(refseq)

                target = [int(x) for x in refseq.translate({65: '1', 67: '2', 71: '3', 84: '4'})]
                targets.append(target)
                chunks.append(read.signal)
                lengths.append(len(target))

        if len(chunks) == 0:
            sys.stderr.write("> no suitable ctc data to write\n")
            return

        chunks = np.array(chunks, dtype=np.float16)
        targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
        for idx, target in enumerate(targets): targets_[idx, :len(target)] = target
        lengths = np.array(lengths, dtype=np.uint16)
        indices = np.random.permutation(typical_indices(lengths))

        chunks = chunks[indices]
        targets_ = targets_[indices]
        lengths = lengths[indices]

        summary = pd.read_csv(summary_file(), sep='\t')
        summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

        output_directory = '.' if sys.stdout.isatty() else dirname(realpath('/dev/fd/1'))
        np.save(os.path.join(output_directory, "chunks.npy"), chunks)
        np.save(os.path.join(output_directory, "references.npy"), targets_)
        np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths)

        sys.stderr.write("> written ctc training data\n")
        sys.stderr.write("  - chunks.npy with shape (%s)\n" % ','.join(map(str, chunks.shape)))
        sys.stderr.write("  - references.npy with shape (%s)\n" % ','.join(map(str, targets_.shape)))
        sys.stderr.write("  - reference_lengths.npy shape (%s)\n" % ','.join(map(str, lengths.shape)))

    def stop(self):
        self.join()
Example #36
    def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{dict} containing
            the significant offsets and the consensus sequence that best matches
            C{referenceId}.
        """
        analysis = self.initialReferenceIdAnalysis(referenceId, alignmentFile,
                                                   outputDir)

        if analysis:
            (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
             readsAtOffset, significantOffsets, samFilter,
             paddedSAM) = analysis
        else:
            return

        insignificantOffsets = set(
            range(genomeLength)) - set(significantOffsets)

        reference = self.referenceGenomes[referenceId]
        referenceSequence = reference.sequence

        consensus = []
        for base in referenceSequence:
            ob = OffsetBases()
            ob.incorporateBase(base)
            consensus.append(ob)

        readQueue = PriorityQueue()
        self.updatePriorityQueue(readQueue, alignedReads, consensus,
                                 significantOffsets)

        consensusFilename = join(outputDir, 'reference-consensus.sam')
        nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
        self.report('    Writing consensus SAM to', consensusFilename)
        self.report('    Writing non-consensus SAM to', nonConsensusFilename)

        with samfile(alignmentFile) as sam:
            consensusAlignment = AlignmentFile(consensusFilename,
                                               mode='w',
                                               template=sam)
            nonConsensusAlignment = AlignmentFile(nonConsensusFilename,
                                                  mode='w',
                                                  template=sam)

        # Reads with no significant offsets get written to both output files.
        readsWithNoSignificantOffsetsCount = 0
        for read in alignedReads:
            if not read.significantOffsets:
                readsWithNoSignificantOffsetsCount += 1
                consensusAlignment.write(read.alignment)
                nonConsensusAlignment.write(read.alignment)

                for offset in insignificantOffsets:
                    base = read.base(offset)
                    if base is not None:
                        consensus[offset].incorporateBase(base)

        self.report('    %d read%s did not overlap any significant offsets' %
                    (readsWithNoSignificantOffsetsCount,
                     s(readsWithNoSignificantOffsetsCount)))

        readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
        cutoff = self.cutoff
        while readQueue:
            mismatchFraction, _ = readQueue.lowestPriority()
            read = readQueue.pop()
            if mismatchFraction <= cutoff:
                # We want this read. Incorporate it into the consensus.
                readsMatchingConsensusCount += 1
                consensusAlignment.write(read.alignment)
                affectedReads = set()
                for offset in read.significantOffsets:
                    readBase = read.base(offset)
                    consensus[offset].incorporateBase(readBase)
                    for readAtOffset in readsAtOffset[offset]:
                        if readAtOffset in readQueue:
                            affectedReads.add(readAtOffset)
                self.updatePriorityQueue(readQueue, affectedReads, consensus,
                                         significantOffsets)
            else:
                readsNotMatchingConsensusCount += 1
                nonConsensusAlignment.write(read.alignment)

        consensusAlignment.close()
        nonConsensusAlignment.close()

        self.report(
            '    %d read%s matched the consensus, %d did not.' %
            (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
             readsNotMatchingConsensusCount))

        # Remove the reference bases from the consensus.
        for offset, base in enumerate(referenceSequence):
            consensus[offset].unincorporateBase(base)

        consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
        self.report('    Writing consensus info to', consensusInfoFilename)

        with open(consensusInfoFilename, 'w') as fp:
            consensusSequence = []
            for offset in range(genomeLength):
                # Take a copy of the commonest set because we may pop from
                # it below.
                commonest = set(consensus[offset].commonest)
                referenceBase = referenceSequence[offset]

                if len(commonest) > 1:
                    nucleotides = ' Nucleotides: %s' % (
                        consensus[offset].baseCountsToStr())
                else:
                    nucleotides = ''

                if referenceBase in commonest:
                    consensusBase = referenceBase
                else:
                    if len(commonest) == 1:
                        # Nothing in the included reads covers this offset.
                        consensusBase = '-'
                    elif len(commonest) > 1:
                        # Report a draw (in which the reference base is not
                        # included and so cannot be used to break the draw).
                        commonest.pop()
                    else:
                        consensusBase = commonest.pop()

                consensusSequence.append(consensusBase)

                mismatch = '' if referenceBase == consensusBase else (
                    ' Mismatch (reference has %s)' % referenceBase)

                print('%d: %s%s%s' %
                      (offset + 1, consensusBase, mismatch, nucleotides),
                      file=fp)

        consensusRead = Read('gready-consensus-%s' % referenceId,
                             ''.join(consensusSequence))
        consensusFilename = join(outputDir, 'reference-consensus.fasta')
        self.report('    Writing gready consensus info to', consensusFilename)
        Reads([consensusRead]).save(consensusFilename)

        return {
            'consensusRead': consensusRead,
            'significantOffsets': significantOffsets,
        }