Example #1
def frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, ref_cov=True, verbose=True):
    """ Calculate fragment coverage vectors on the forward and reverse strands.

    :param bam: Input bam file.
    :param chrom_lengths: Dictionary of chromosome names and lengths.
    :param region: Restrict parsing to the specified region.
    :param min_aqual: Minimum mapping quality.
    :param ref_cov: Also record per-alignment reference coverage fractions.
    :param verbose: Display progress bar.
    :returns: Forward and reverse fragment coverage vectors, plus per-alignment reference coverage fractions under 'ref_cov'.
    :rtype: dict
    """

    frags_fwd = defaultdict(lambda: defaultdict(int))
    frags_rev = defaultdict(lambda: defaultdict(int))

    aln_ref_cov = defaultdict(list)

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except ValueError:  # .mapped/.unmapped require a BAM index.
        total_reads = None
    if verbose and region is None:
        sys.stdout.write(
            "Gathering fragment statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    for r in bam_iter:
        # Skip unmapped reads:
        if r.is_unmapped:
            continue
        # Skip if mapping quality is too low:
        if r.mapq < min_aqual:
            continue
        pos = r.reference_start
        ref = r.reference_name
        if r.is_reverse:
            frags_rev[ref][pos] += 1
        else:
            frags_fwd[ref][pos] += 1

        if ref_cov:
            aln_ref_cov[ref].append(r.reference_length / float(chrom_lengths[ref]))

    frags_fwd = _frag_dict_to_array(frags_fwd, chrom_lengths)
    frags_rev = _frag_dict_to_array(frags_rev, chrom_lengths)

    res = {'frags_fwd': frags_fwd, 'frags_rev': frags_rev, 'ref_cov': aln_ref_cov}
    return res
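A minimal usage sketch for frag_coverage; the file name is a placeholder, and the chrom_lengths dictionary is assumed to come from the BAM header via pysam:

import pysam

# Build the chrom_lengths dictionary expected by frag_coverage from the
# BAM header (any indexed BAM works here):
with pysam.AlignmentFile("aln.bam", "rb") as af:
    chrom_lengths = dict(zip(af.references, af.lengths))

cov = frag_coverage("aln.bam", chrom_lengths, min_aqual=10, verbose=False)
fwd, rev = cov['frags_fwd'], cov['frags_rev']  # per-reference coverage vectors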
Example #2
def _process_bam(bam,
                 out_tsv,
                 chrom_lengths,
                 region=None,
                 min_aqual=0,
                 verbose=True):
    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except ValueError:  # .mapped/.unmapped require a BAM index.
        total_reads = None
    if verbose and region is None:
        sys.stdout.write(
            "Gathering fragment statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    tsv = open(out_tsv, "w")
    tsv.write(
        "Read\tRef\tStrand\tRefCov\tReadCov\tReadLength\tReadAlnLength\tRefLength\tRefAlnLength\tMapQual\n"
    )

    for r in bam_iter:
        # Skip unmapped reads:
        if r.is_unmapped:
            continue
        # Skip if mapping quality is too low:
        if r.mapq < min_aqual:
            continue
        strand = '-' if r.is_reverse else '+'
        ref = r.reference_name
        ref_cov = r.reference_length / float(chrom_lengths[ref])
        read = r.query_name
        read_length = r.infer_read_length()
        mapq = r.mapping_quality
        read_aln_len = r.query_alignment_length
        read_cov = read_aln_len / float(read_length)
        ref_aln_length = r.reference_length

        tsv.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            read, ref, strand, ref_cov, read_cov, read_length, read_aln_len,
            chrom_lengths[ref], ref_aln_length, mapq))

    tsv.flush()
    tsv.close()
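A usage sketch for _process_bam; the file names are placeholders, and the resulting TSV is read back with the standard csv module:

import csv
import pysam

with pysam.AlignmentFile("aln.bam", "rb") as af:
    chrom_lengths = dict(zip(af.references, af.lengths))

_process_bam("aln.bam", "read_stats.tsv", chrom_lengths, min_aqual=5)

# Each row holds one aligned read's coverage and length statistics:
with open("read_stats.tsv") as fh:
    for row in csv.DictReader(fh, delimiter="\t"):
        print(row["Read"], row["RefCov"])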
Example #3
def pileup_stats(bam, region=None, verbose=True, with_quals=True):
    """ Parse pileup columns and extract quality values.

    :param bam: Input BAM file.
    :param region: samtools region.
    :param verbose: Show progress bar.
    :param with_quals: Return quality values per position.
    :returns: Dictionaries per reference with per-base coverage and quality values.
    :rtype: dict
    """
    st = defaultdict(lambda: defaultdict(list))
    cst = defaultdict(lambda: defaultdict(int))
    samfile = bam_common.pysam_open(bam, in_format='BAM')

    pileup_iter = samfile.pileup(region=region, min_base_quality=0)
    start, end = None, None
    if region is not None:
        # Parse a samtools-style region string ("chrom:start-end", 1-based, inclusive):
        _, coords = region.split(":")
        start_str, end_str = coords.split("-")
        start, end = int(start_str) - 1, int(end_str)
    if verbose:
        sys.stdout.write(
            "Gathering pileup statistics from file: {}\n".format(bam))
        total_bases = sum(samfile.lengths) if region is None else end - start
        pileup_iter = tqdm.tqdm(pileup_iter, total=total_bases)

    for pileupcolumn in pileup_iter:
        if region is not None and (pileupcolumn.reference_pos < start
                                   or pileupcolumn.reference_pos >= end):
            continue
        cst[pileupcolumn.reference_name][
            pileupcolumn.reference_pos] = pileupcolumn.nsegments
        for pileupread in pileupcolumn.pileups:
            if not pileupread.is_del and not pileupread.is_refskip:
                if (pileupread.alignment.query_qualities
                        is not None) and with_quals:
                    st[pileupcolumn.reference_name][
                        pileupcolumn.reference_pos].append(
                            pileupread.alignment.query_qualities[
                                pileupread.query_position])
    samfile.close()
    return {'qualities': dict(st), 'coverage': dict(cst)}
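A usage sketch for pileup_stats (placeholder file and region), summarising the returned per-position dictionaries into a mean coverage and mean base quality per reference:

stats = pileup_stats("aln.bam", region="chr1:1-10000", verbose=False)

for ref, cov in stats['coverage'].items():
    mean_cov = sum(cov.values()) / float(len(cov)) if cov else 0.0
    print(ref, "mean coverage:", mean_cov)

for ref, quals in stats['qualities'].items():
    all_quals = [q for pos_quals in quals.values() for q in pos_quals]
    if all_quals:
        print(ref, "mean base quality:", sum(all_quals) / float(len(all_quals)))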
Example #4
parser.add_argument(
    '-f', metavar='format', type=str, help="Input/output format (SAM).", default='SAM')
parser.add_argument(
    '-s', metavar='strategy', type=str, help="Filtering strategy: top_per_query, query_coverage, ref_coverage (top_per_query).",
    default="top_per_query", choices=['top_per_query', 'query_coverage', 'ref_coverage'])
parser.add_argument(
    '-q', metavar='query_cover', type=float, help="Minimum query coverage fraction (0.8).", default=0.8)
parser.add_argument(
    'infile', metavar='input_file', type=str, help="Input file.")
parser.add_argument(
    'outfile', metavar='output_file', type=str, help="Output SAM file.")

if __name__ == '__main__':
    args = parser.parse_args()

    input_iter = bam_common.pysam_open(args.infile, args.f)

    if args.s == 'top_per_query':
        output_iter = bam_filter.filter_top_per_query(input_iter.fetch(until_eof=True))
    elif args.s == 'query_coverage':
        output_iter = bam_filter.filter_query_coverage(input_iter.fetch(until_eof=True), args.q)
    elif args.s == 'ref_coverage':
        output_iter = bam_filter.filter_ref_coverage(input_iter.fetch(until_eof=True), args.q, input_iter.header)
    else:
        raise Exception('Filtering strategy not implemented!')

    writer = pysam.AlignmentFile(args.outfile, "wh", template=input_iter)
    for record in output_iter:
        writer.write(record)

    writer.close()
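For illustration, one possible shape of a "top per query" filter; this is a sketch only, and the real bam_filter.filter_top_per_query may use a different criterion. It keeps, for each query name, the alignment with the highest alignment score ("AS" tag), falling back to mapping quality when the tag is absent:

from collections import OrderedDict

def _filter_top_per_query_sketch(records):
    # records: an iterable of pysam.AlignedSegment objects.
    best = OrderedDict()
    for rec in records:
        score = rec.get_tag("AS") if rec.has_tag("AS") else rec.mapping_quality
        kept = best.get(rec.query_name)
        if kept is None or score > kept[0]:
            best[rec.query_name] = (score, rec)
    for score, rec in best.values():
        yield rec

Note that this buffers one record per query name in memory, which is what lets it work on unsorted input.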
Example #5
parser.add_argument('-q',
                    metavar='fastq',
                    type=str,
                    help="Input fastq.",
                    required=True)
parser.add_argument('infile',
                    metavar='input_file',
                    type=str,
                    help="Input file.")
parser.add_argument('outfile',
                    metavar='output_file',
                    type=str,
                    help="Output SAM file.")

if __name__ == '__main__':
    args = parser.parse_args()

    input_iter = bam_common.pysam_open(args.infile,
                                       args.f).fetch(until_eof=True)

    # Get SAM record names (a set makes the membership tests below O(1)):
    sam_names = set(record.query_name for record in input_iter)

    writer = sam_writer.SamWriter(args.outfile)

    for read in seq_util.read_seq_records(args.q, 'fastq'):
        if read.id not in sam_names:
            qual = seq_util.quality_array_to_string(
                read.letter_annotations["phred_quality"])
            sam_record = writer.new_sam_record(qname=read.id,
                                               flag=4,
                                               rname="*",
                                               pos=0,
                                               mapq=0,
Example #6
parser.add_argument(
    '-x',
    action="store_true",
    help="Sort by number of read bases instead of number of aligned reference bases.",
    default=False)
parser.add_argument('-Q',
                    action="store_true",
                    help="Be quiet and do not print progress bar (False).",
                    default=False)
parser.add_argument('bam', metavar='bam', type=str, help="Input BAM file.")

if __name__ == '__main__':
    args = parser.parse_args()
    verbose = not args.Q

    bam_reader = bam_common.pysam_open(args.bam, in_format='BAM')

    if verbose:
        sys.stdout.write(
            "Gathering read and alignment lengths from file: {}\n".format(
                args.bam))
        try:
            total_reads = bam_reader.mapped + bam_reader.unmapped
        except ValueError:  # .mapped/.unmapped require a BAM index.
            total_reads = None
        bam_reader = tqdm.tqdm(bam_reader, total=total_reads)

    read_names = []
    ref_names = []
    ref_lengths = []
    read_lengths = []
Example #7
def error_and_read_stats(bam, refs, context_sizes=(1, 1), region=None, min_aqual=0, verbose=True):
    """Gather read statistics and context-dependend error statistics from BAM file.
    WARNING: context overstepping reference start/end boundaries are not registered.

    Definition of context: for substitutions the event is happening from the "central base", in the case of indels the events are located
    between the central base and the base before.

    :param bam: Input BAM file.
    :param refs: Dictionary of references.
    :param context_sizes: The size of the left and right contexts.
    :param region: samtools regions.
    :param min_qual: Minimum mappign quality.
    :param verbose: Show progress bar.
    :returns: Dictionary with read and error statistics.
    :rtype: dict
    """
    events = defaultdict(lambda: defaultdict(int))
    read_stats = {'unmapped': 0,
                  'mapped': 0,
                  'unaligned_quals': [],
                  'unaligned_lengths': [],
                  'aligned_quals': [],
                  'alignment_lengths': [],
                  'aligned_lengths': [],
                  'mqfail_aligned_quals': [],
                  'mqfail_alignment_lengths': [],
                  'mapping_quals': [],
                  }
    indel_dists = {'insertion_lengths': defaultdict(int),
                   'deletion_lengths': defaultdict(int),
                   'insertion_composition': defaultdict(int)}

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    base_stats = {'match': 0, 'mismatch': 0, 'deletion': 0, 'insertion': 0}

    read_iter = bam_reader.fetch(region=region, until_eof=True)
    if verbose:
        sys.stdout.write(
            "Gathering read and error statistics from file: {}\n".format(bam))
        try:
            total_reads = bam_reader.mapped + bam_reader.unmapped
        except ValueError:  # .mapped/.unmapped require a BAM index.
            total_reads = None
        read_iter = tqdm.tqdm(read_iter, total=total_reads)

    for r in read_iter:
        _update_read_stats(r, read_stats, min_aqual)
        if r.is_unmapped:
            continue
        if r.query_sequence is None:
            continue
        if r.mapping_quality < min_aqual:
            continue
        ref = refs[r.reference_name]
        _update_events(r, ref, events, indel_dists, context_sizes, base_stats)

    base_stats['aln_length'] = base_stats['match'] + base_stats['mismatch'] + \
        base_stats['insertion'] + base_stats['deletion']
    if base_stats['match'] + base_stats['mismatch'] == 0:
        base_stats['identity'] = 0
    else:
        base_stats['identity'] = float(
            base_stats['match']) / (base_stats['match'] + base_stats['mismatch'])
    if base_stats['aln_length'] == 0:
        base_stats['accuracy'] = 0
    else:
        base_stats['accuracy'] = 1.0 - \
            float(base_stats['mismatch'] + base_stats['insertion'] + base_stats['deletion']) / \
            base_stats['aln_length']

    res = {'events': dict(events), 'read_stats': dict(
        read_stats), 'indel_dists': dict(indel_dists), 'base_stats': base_stats}
    return res
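A usage sketch for error_and_read_stats; refs is assumed to map reference names to their sequences, built here with pysam.FastaFile (placeholder file names):

import pysam

with pysam.FastaFile("ref.fa") as fa:
    refs = {name: fa.fetch(name) for name in fa.references}

stats = error_and_read_stats("aln.bam", refs, context_sizes=(1, 1), verbose=False)
print("accuracy:", stats['base_stats']['accuracy'])
print("identity:", stats['base_stats']['identity'])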
Example #8
def read_stats(bam, min_aqual=0, region=None, with_clipps=False, verbose=True):
    """ Parse reads in BAM file and record various statistics.

    :param bam: BAM file.
    :param min_aqual: Minimum mapping quality, skip read if mapping quality is lower.
    :param region: samtools region.
    :param with_clipps: Take clipped bases into account when calculating accuracy.
    :param verbose: Show progress bar.
    :returns: A dictionary with various global and per-read statistics.
    :rtype: dict
    """
    res = {'unmapped': 0,
           'mapped': 0,
           'unaligned_quals': [],
           'unaligned_lengths': [],
           'aligned_quals': [],
           'alignment_lengths': [],
           'aligned_lengths': [],
           'mqfail_aligned_quals': [],
           'mqfail_alignment_lengths': [],
           'mapping_quals': [],
           }
    base_stats = {'aln_length': 0, 'match': 0, 'mismatch': 0,
                  'deletion': 0, 'insertion': 0, 'clipps': 0}
    read_stats = OrderedDict([
        ("name", []),
        ("ref", []),
        ("coverage", []),
        ("direction", []),
        ("aln_length", []),
        ("insertion", []),
        ("deletion", []),
        ("mismatch", []),
        ("match", []),
        ("identity", []),
        ("accuracy", []),
        ("clipps", [])
    ])

    bam_reader = bam_common.pysam_open(bam, in_format='BAM')
    ue = True
    if region is not None:
        ue = False
    bam_iter = bam_reader.fetch(region=region, until_eof=ue)

    try:
        total_reads = bam_reader.mapped + bam_reader.unmapped
    except ValueError:  # .mapped/.unmapped require a BAM index.
        total_reads = None
    if verbose and region is None:
        sys.stdout.write(
            "Gathering read statistics from file: {}\n".format(bam))
        bam_iter = tqdm.tqdm(bam_iter, total=total_reads)

    for r in bam_iter:
        # Update basic read statistics:
        _update_read_stats(r, res, min_aqual)

        # Get detailed statistics from aligned read and
        # updated global stats:
        bs = stats_from_aligned_read(r, with_clipps)
        # bs is None for unaligned reads.
        if bs is not None:
            for k in six.iterkeys(base_stats):
                base_stats[k] += bs[k]
            for stat, value in six.iteritems(bs):
                read_stats[stat].append(value)

    # Calculate global identity and accuracy, guarding against empty alignments:
    matched = base_stats['match'] + base_stats['mismatch']
    base_stats['identity'] = float(base_stats['match']) / matched if matched > 0 else 0.0

    clipps = base_stats['clipps'] if with_clipps else 0

    if base_stats['aln_length'] > 0:
        errors = base_stats['mismatch'] + base_stats['insertion'] + base_stats['deletion'] + clipps
        base_stats['accuracy'] = 1.0 - float(errors) / base_stats['aln_length']
    else:
        base_stats['accuracy'] = 0.0
    res['base_stats'] = base_stats
    res['read_stats'] = read_stats
    bam_reader.close()
    return res
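A usage sketch for read_stats; since res['read_stats'] is an OrderedDict of equal-length lists, it converts directly into a pandas DataFrame (pandas here is an assumption; any column-wise consumer works):

import pandas as pd

res = read_stats("aln.bam", min_aqual=5, verbose=False)  # placeholder path
per_read = pd.DataFrame(res['read_stats'])

print("global accuracy:", res['base_stats']['accuracy'])
print(per_read[['name', 'identity', 'accuracy']].head())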
Example #9
def bam_compare(aln_one,
                aln_two,
                coarse_tolerance=50,
                strict_flags=False,
                in_format='BAM',
                verbose=False):
    """Count reads mapping to references in a BAM file.

    :param alignment_file: BAM file.
    :param min_aln_qual: Minimum mapping quality.
    :param verbose: Show progress bar.
    :returns: Dictionary with read counts per reference.
    :rtype: dict
    """

    aln_iter_one = bam_common.pysam_open(aln_one, in_format)
    aln_iter_two = bam_common.pysam_open(aln_two, in_format)

    total = None
    if in_format == "BAM":
        total_one = aln_iter_one.mapped + aln_iter_one.unmapped
        total_two = aln_iter_two.mapped + aln_iter_two.unmapped

        if total_one != total_two:
            raise Exception(
                "The two input files ({} {}) have a different number of records!"
                .format(aln_one, aln_two))
        total = total_one

    # Comparison summary structure:
    stats = OrderedDict([
        ('BamFiles', [aln_one, aln_two]),
        ('TotalQueries', 0),
        ('DirectionMismatch', 0),
        ('RefMismatch', 0),
        ('StrictFlagMismatch', 0),
        ('SeqMismatch', 0),
        ('CoarseMatches', 0),
        ('CommonAlignedBases', 0),
        ('CommonMatchingBases', 0),
        ('PerQueryBaseSim', []),
        ('PerQueryBaseSimClipped', []),
        (aln_one, {
            'HardClippedBases': 0,
            'SoftClippedBases': 0,
            'AlignedBases': 0,
            'UnalignedQueries': 0,
            'AlignedQueries': 0
        }),
        (aln_two, {
            'HardClippedBases': 0,
            'SoftClippedBases': 0,
            'AlignedBases': 0,
            'UnalignedQueries': 0,
            'AlignedQueries': 0
        }),
        ('AlignedSimilarity', 0.0),
    ])

    records_iter = zip(aln_iter_one.fetch(until_eof=True),
                       aln_iter_two.fetch(until_eof=True))

    if verbose and in_format == "BAM":
        records_iter = tqdm.tqdm(records_iter, total=total)

    for segments in records_iter:
        aln_diff = compare_alignments(segments[0], segments[1], strict_flags)
        stats['TotalQueries'] += 1

        # Register hard and soft clipped bases:
        stats[aln_one]['HardClippedBases'] += aln_diff['hard_clipped'][0]
        stats[aln_two]['HardClippedBases'] += aln_diff['hard_clipped'][1]

        stats[aln_one]['SoftClippedBases'] += aln_diff['soft_clipped'][0]
        stats[aln_two]['SoftClippedBases'] += aln_diff['soft_clipped'][1]

        # Both reads are aligned:
        if aln_diff['mapped'] == (True, True):
            stats[aln_one]['AlignedQueries'] += 1
            stats[aln_two]['AlignedQueries'] += 1

            # Reference mismatch:
            if aln_diff['ref_match'] is False:
                stats['RefMismatch'] += 1
                continue

            # Orientation mismatch:
            if aln_diff['dir_match'] is False:
                stats['DirectionMismatch'] += 1
                continue

            # Flag mismatch:
            if aln_diff['flag_match'] is False:
                stats['StrictFlagMismatch'] += 1
                continue

            # Sequence mismatch:
            if aln_diff['seq_match'] is False:
                stats['SeqMismatch'] += 1

            stats['CommonAlignedBases'] += aln_diff['bases']
            stats['CommonMatchingBases'] += aln_diff['cons_score']
            stats['PerQueryBaseSim'].append(aln_diff['cons_score'] /
                                            float(aln_diff['bases']))
            stats['PerQueryBaseSimClipped'].append(
                float(aln_diff['cons_score']) /
                min(segments[0].infer_query_length(),
                    segments[1].infer_query_length()))

            if is_coarse_match(aln_diff, coarse_tolerance):
                stats['CoarseMatches'] += 1

            stats[aln_one]['AlignedBases'] += aln_diff['bases']
            stats[aln_two]['AlignedBases'] += aln_diff['bases']

        # Read from first BAM is aligned:
        elif aln_diff['mapped'] == (True, False):
            stats[aln_one]['AlignedQueries'] += 1
            stats[aln_one]['AlignedBases'] += aln_diff['bases_one']
            stats[aln_two]['UnalignedQueries'] += 1
        # Read from second BAM is aligned:
        elif aln_diff['mapped'] == (False, True):
            stats[aln_two]['AlignedQueries'] += 1
            stats[aln_two]['AlignedBases'] += aln_diff['bases_two']
            stats[aln_one]['UnalignedQueries'] += 1
        # Both unaligned:
        elif aln_diff['mapped'] == (False, False):
            stats[aln_one]['UnalignedQueries'] += 1
            stats[aln_two]['UnalignedQueries'] += 1

    if stats['CommonAlignedBases'] > 0:
        stats['AlignedSimilarity'] = stats['CommonMatchingBases'] / \
            float(stats['CommonAlignedBases'])
    else:
        stats['AlignedSimilarity'] = 0.0
    return stats
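A usage sketch for bam_compare (placeholder paths); as noted in the docstring, the two files must contain the same queries in the same order, e.g. two aligners run on the same name-sorted read set:

stats = bam_compare("aln_a.bam", "aln_b.bam", coarse_tolerance=50, verbose=True)

print("aligned similarity:", stats['AlignedSimilarity'])
print("reference mismatches:", stats['RefMismatch'])
print("direction mismatches:", stats['DirectionMismatch'])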