Example #1
def extract_approximate_library_stats(opts, bam, rough_insert_median):
    reads_per_chunk = int(
        np.floor(opts['approx_stats_nreads'] / opts['approx_stats_nchunks']))

    # lib_patterns, lib_stats = parse_library_stats(meta)
    # maps read groups matching lib_patterns to indices in lib_stats
    # lib_dict = {}
    # MULTILIB
    nlib = opts['nlib']
    insert_len = [[] for i in range(nlib)]
    read_len_shorter = [[] for i in range(nlib)]
    read_len_longer = [[] for i in range(nlib)]

    chrom_name = opts['chromosome']
    chrom_size = get_chrom_size_from_bam(chrom_name, bam)
    chunk_size = 10 * opts['insert_max_mu_multiple'] * rough_insert_median

    rough_insert_max = opts['insert_max_mu_multiple'] * rough_insert_median
    reads_processed = [0 for i in range(nlib)]
    chunks_processed = 0
    # MINOR: reads_per_chunk should count completed pairs rather than reads seen
    while min(reads_processed) < opts['approx_stats_nreads']:
        # extract random chunk
        start = np.random.randint(0, chrom_size - chunk_size)
        end = start + chunk_size
        # parse reads
        seen_aln = {}
        chunk_reads_seen = 0
        alns = list(bam.fetch_unsorted(chrom_name, start, end))
        if bam.num_bam > 1:
            # a merged fetch over multiple BAMs is not coordinate-sorted
            alns.sort(key=lambda a: a.pos)
        for aln in alns:
            # conditioning on mate position introduces slight bias,
            # but insignificant if chunk_size >> insert size
            if not_primary(aln) or aln.is_duplicate or aln.is_unmapped or \
               aln.mpos < start or aln.mpos >= end or aln.mate_is_unmapped:
                continue
            if aln.qname not in seen_aln:
                # first read of a pair: store it, up to the per-chunk quota
                if chunk_reads_seen < reads_per_chunk:
                    seen_aln[aln.qname] = aln
                    chunk_reads_seen += 1
                continue
            # pair completed
            mate = seen_aln[aln.qname]
            pair = (aln, mate)
            del seen_aln[aln.qname]

            lib_idx = 0  # get_lib_idx(aln.get_tag('RG'), lib_dict, lib_patterns)
            process_insert_len(pair,
                               insert_len[lib_idx],
                               opts['min_mapq_reads'],
                               opts['read_len'],
                               maximum_insert_size=rough_insert_max)
            process_read_len(pair, read_len_shorter[lib_idx],
                             read_len_longer[lib_idx])
            reads_processed[lib_idx] += 1
            if min(reads_processed) % 200000 == 0 and opts['verbosity'] > 0:
                print(
                    '[library_stats] processed {0} reads ({1} chunks) for each lib'
                    .format(min(reads_processed), chunks_processed))
        chunks_processed += 1

    # medians and robust SDs as outlier-resistant moment estimates;
    # the 0.15/99.85 percentiles match mu +/- 3 sigma under normality
    insert_mean = [np.median(il) for il in insert_len]
    insert_sd = [robust_sd(il) for il in insert_len]
    insert_lower = [np.percentile(il, 0.15) for il in insert_len]
    insert_upper = [np.percentile(il, 99.85) for il in insert_len]
    insert_pmf = [
        pmf_kernel_smooth(il, 0, opts['insert_max_mu_multiple'] * mu,
                          opts['max_kde_samples'])
        for (il, mu) in zip(insert_len, insert_mean)
    ]
    rlen_short = [round(np.median(rl)) for rl in read_len_shorter]
    rlen_long = [round(np.median(rl)) for rl in read_len_longer]
    rlen_medians = list(zip(rlen_short, rlen_long))
    return insert_mean, insert_sd, insert_pmf, insert_lower, insert_upper, rlen_medians
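
The helpers robust_sd and pmf_kernel_smooth are called above but not defined in this excerpt. Below is a minimal sketch of plausible implementations, assuming a MAD-based robust standard deviation and a Gaussian-KDE pmf discretized over integer insert sizes; the names and signatures come from the call sites above, but the internals are illustrative, not the project's actual code.

import numpy as np
from scipy.stats import gaussian_kde


def robust_sd(x):
    # robust standard deviation via the median absolute deviation,
    # scaled by 1.4826 (~ 1/Phi^-1(0.75)) for consistency with the SD
    # of a normal distribution
    x = np.asarray(x)
    return 1.4826 * np.median(np.abs(x - np.median(x)))


def pmf_kernel_smooth(x, lower, upper, max_samples):
    # discretized pmf on the integers [lower, upper) from a Gaussian KDE,
    # subsampling to at most max_samples observations to bound the KDE cost
    x = np.asarray(x, dtype=float)
    if len(x) > max_samples:
        x = np.random.choice(x, max_samples, replace=False)
    kde = gaussian_kde(x)
    grid = np.arange(lower, int(np.ceil(upper)))
    dens = kde(grid)
    return dens / dens.sum()  # normalize so the pmf sums to 1
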
Example #2
def parse_bam(opts, reference_files, bamfiles):
    chrom_name = opts['chromosome']
    start, end = opts['region_start'], opts['region_end']
    outdir = opts['outdir']
    min_mapq_reads = opts['min_mapq_reads']
    nlib = opts['nlib']  # MULTILIB
    # lib_patterns, lib_stats = parse_library_stats(meta)
    # lib_dict = {}

    bam = BamGroup(bamfiles)
    opts['read_len'] = bam_read_len(bam)
    # bam_has_unmapped = has_unmapped_records(bam)
    # if opts['verbosity'] > 0:
    #     if bam_has_unmapped:
    #         print('[parse_bam] bam file DOES contain unmapped records')
    #     else:
    #         print('[parse_bam] bam file DOES NOT contain unmapped records')

    if opts['verbosity'] > 0:
        print('\n[parse_bam] extracting approximate library stats')
    rough_insert_median = get_rough_insert_median(opts, bam)
    if opts['verbosity'] > 0:
        print('[parse_bam] read_len: {0}; rough_insert_median: {1}'.format(
            opts['read_len'], rough_insert_median))
    als = extract_approximate_library_stats(opts, bam, rough_insert_median)
    mean_approx, sd_approx, pmf_approx, qlower, qupper, rlen_medians = als
    for i in range(len(pmf_approx)):
        with open(
                os.path.join(
                    outdir, 'logging',
                    '{0}_insert_pmf.txt'.format(opts['library_names'][i])),
                'w') as f:
            for j in range(len(pmf_approx[i])):
                f.write('{0}\t{1}\n'.format(j, pmf_approx[i][j]))
    if opts['verbosity'] > 0:
        print('[parse_bam] library stats:\n\tmu = {0}\n\tsigma = {1}'.format(
            mean_approx, sd_approx))
        add_time_checkpoint(opts, 'lib. stats')

    def get_lr_cutoff(opts, pmf, do_min=False):
        # likelihood-ratio threshold equivalent to cutting a normal at
        # insert_cutoff SDs from its mode; the comparison below is on the
        # log scale, so normpdf is assumed to return the log density
        cutoff_normal_equivalent = opts['insert_cutoff']
        lr_cutoff = normpdf(0) - normpdf(cutoff_normal_equivalent)
        mode = max(pmf)
        logmode = np.log(mode)
        which_mode = [i for i in range(len(pmf)) if pmf[i] == mode]
        cutoff = None
        if do_min:
            for i in range(1, len(pmf)):
                if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff:
                    cutoff = i - 1
                    break
        else:
            for i in range(len(pmf) - 2, -1, -1):
                if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff:
                    cutoff = i + 1
                    break
        if opts['verbosity'] > 0:
            print('[insert_cutoff] lr_cutoff is {0}'.format(lr_cutoff))
            print('[insert_cutoff] mode (log) {0} at {1}'.format(
                logmode, which_mode))
            if cutoff is not None:  # guard: the scan may find no cutoff
                print('[insert_cutoff] cutoff ratio (log) {0} at {1}'.format(
                    logmode - np.log(pmf[i]), cutoff))
        return cutoff

    min_concordant_insert = [
        get_lr_cutoff(opts, pmf, do_min=True) for pmf in pmf_approx
    ]
    max_concordant_insert = [get_lr_cutoff(opts, pmf) for pmf in pmf_approx]
    if opts['verbosity'] > 0:
        print('[parse_bam] insert size cutoffs:')
        print('\n'.join([
            '[parse_bam] {0}-{1}'.format(min_concordant_insert[i],
                                         max_concordant_insert[i])
            for i in range(len(mean_approx))
        ]))
        print(
            '[parse_bam] equivalent to mu +/- 3 sigma in normal:\n\t{0}\n\t{1}\n'
            .format(qlower, qupper))

    seen_aln = {}
    nreads, npairs = 0, 0
    num_read_through = 0
    insert_len = [[] for i in range(nlib)]
    softclips = [(defaultdict(list), defaultdict(list)) for i in range(nlib)]
    splits = [[] for i in range(nlib)]
    if opts['do_pecluster']:
        discordant_pairs = [OrderedDict() for i in range(nlib)]
    if not opts['use_mate_tags']:  # need to estimate mappability proportions
        mapstats = [defaultdict(int) for i in range(nlib)]
    else:
        mapstats = None

    if opts['verbosity'] > 0:
        print('[parse_bam] starting alignment parsing. . .')
    alignments = bam.fetch_unsorted(chrom_name, start, end)
    for aln in alignments:
        if not_primary(aln) or aln.is_unmapped or aln.is_duplicate:
            continue
        nreads += 1
        if opts['verbosity'] > 0 and nreads % 1000000 == 0:
            print('[parse_bam] %d reads processed' % nreads)

        # TODO this can be done cleaner -- check for is_unmapped above
        #    and use handle_unpaired for everything with mate_is_unmapped
        if aln.qname not in seen_aln:
            # read is not going to pair, so handle now
            if aln.mate_is_unmapped or aln.rname != aln.mrnm:
                handle_unpaired_read(opts, aln, softclips, splits, bam,
                                     mapstats)
            # waiting for this read's pair
            else:
                seen_aln[aln.qname] = aln
            continue

        # Completed a pair!
        npairs += 1
        mate = seen_aln[aln.qname]
        pair = (aln, mate)
        del seen_aln[aln.qname]

        if opts['filter_read_through'] and is_read_through(opts, pair):
            num_read_through += 1
            continue

        # MULTILIB
        lib_idx = 0

        # handle softclip information, insert len, mapping stats, splits/discordants
        if not opts['use_mate_tags']:
            process_aggregate_mapstats(pair, mapstats[lib_idx], min_mapq_reads,
                                       opts['max_pair_distance'])
        ilen = process_insert_len(pair, insert_len[lib_idx],
                                  min_mapq_reads, opts['read_len'])
        if opts['do_pecluster']:
            process_discordant_pair(pair[0], pair[1], chrom_name,
                                    discordant_pairs[lib_idx], min_mapq_reads,
                                    ilen, min_concordant_insert[lib_idx],
                                    max_concordant_insert[lib_idx],
                                    opts['library_is_rf'])
        if any(op == CIGAR_SOFT_CLIP for (op, oplen) in
               itertools.chain(aln.cigartuples, mate.cigartuples)):
            if opts['do_splits']:
                a1_split = process_splits(pair[0],
                                          splits[lib_idx],
                                          bam,
                                          min_mapq=min_mapq_reads,
                                          mate=pair[1])
                a2_split = process_splits(pair[1],
                                          splits[lib_idx],
                                          bam,
                                          min_mapq=min_mapq_reads,
                                          mate=pair[0])
            else:
                a1_split, a2_split = False, False
            # if we found the same breakpoint in both reads,
            # it's quite likely that the reads were overlapping due to a short insert
            if a1_split and a2_split and splits_are_mirrored(
                    splits[lib_idx][-1], splits[lib_idx][-2]):
                if opts['verbosity'] > 1:
                    print('[bamparser] mirrored split: {0} {1} {2}'.format(
                        chrom_name, splits[lib_idx][-1].bp2, pair[0].qname))
                del splits[lib_idx][-1]

            process_softclip(opts, pair, (a1_split, a2_split),
                             softclips[lib_idx], lib_idx)

    # handle unpaired reads
    if opts['verbosity'] > 0:
        print('[parse_bam] handling unpaired reads')
    for aln in seen_aln.values():
        handle_unpaired_read(opts, aln, softclips, splits, bam, mapstats)

    # MULTILIB: with multiple libraries this should arguably be fatal
    # only if all libraries are empty
    if any(len(ins) == 0 for ins in insert_len):
        print('Error: the specified region contains no reads!')
        sys.exit(1)

    # report stats
    if opts['verbosity'] > 0:
        print('[parse_bam] processed a total of {0} reads'.format(nreads))
        if opts['filter_read_through']:
            print('[parse_bam] found {0} read-through pairs out of {1} total'.
                  format(num_read_through, npairs))
    add_time_checkpoint(opts, 'parse bam')

    # compute insert length distributions and save plots
    if opts['verbosity'] > 1:
        print('[parse_bam] observed insert size minima by library:')
        print('\n'.join([str(min(insert_len[i])) for i in range(nlib)]))
        print('[parse_bam] observed insert size counts by library:')
        print('\n'.join(
            [str(Counter(sorted(insert_len[i]))) for i in range(nlib)]))
        print('[parse_bam] insert 25-50-75 percentiles by library:')
        percentiles = [np.percentile(ins, (25, 50, 75)) for ins in insert_len]
        print(''.join([
            '{0}: {1}\n'.format(opts['library_names'][l],
                                tuple(percentiles[l])) for l in range(nlib)
        ]))
    if opts['verbosity'] > 0:
        print('[parse_bam] computing insert length pmfs')
    insert_mean = [np.median(il) for il in insert_len]
    insert_sd = [robust_sd(il) for il in insert_len]
    max_mult = opts['insert_max_mu_multiple']
    insert_len_dist = [
        pmf_kernel_smooth(il, 0, max_mult * mu, opts['max_kde_samples'])
        for (il, mu) in zip(insert_len, insert_mean)
    ]

    if opts['verbosity'] > 1:
        for i in range(nlib):
            print('[parse_bam] lib {0} mu {1} sigma {2}'.format(
                i, insert_mean[i], insert_sd[i]))

    # insert dist plots
    plot_insert_dist(opts, insert_len_dist, outdir)

    # compute average coverage
    # MULTILIB this needs adjusting -- keeping track of nreads from each bamgroup
    region_len = len_without_gaps(chrom_name, start, end,
                                  reference_files['gap'])
    opts['seq_coverage'] = [
        nreads * opts['read_len'] / (nlib * region_len) for _ in range(nlib)
    ]
    opts['phys_coverage'] = [npairs * m / region_len for m in insert_mean]
    opts['max_pecluster_size'] = [
        pc * opts['pecluster_size_coverage_ratio']
        for pc in opts['phys_coverage']
    ]

    if opts['verbosity'] > 0:
        print('[parse_bam] average sequence coverage: %.1fx' %
              opts['seq_coverage'][0])
        print('[parse_bam] average physical coverage: %.1fx' %
              opts['phys_coverage'][0])

    if opts['do_pecluster']:
        return (softclips, splits, mapstats, rlen_medians, insert_len_dist,
                insert_mean, insert_sd, discordant_pairs,
                min_concordant_insert, max_concordant_insert)
    else:
        return (softclips, splits, mapstats, rlen_medians, insert_len_dist,
                insert_mean, insert_sd, None, None, None)
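
For intuition about get_lr_cutoff, the likelihood-ratio scan can be exercised standalone on a synthetic pmf. The sketch below reimplements both tail scans, assuming (as the comparison against a log ratio suggests) that normpdf returns the log of the standard normal density; with insert_cutoff = 3, the cutoffs on a discretized N(300, 30) insert distribution should land near 300 +/- 3 * 30.

import numpy as np
from scipy.stats import norm


def lr_cutoffs(pmf, normal_equiv=3.0):
    # find the outermost bins whose log likelihood ratio against the
    # mode falls within the threshold (= normal_equiv**2 / 2 for a normal)
    lr_cutoff = norm.logpdf(0) - norm.logpdf(normal_equiv)
    logmode = np.log(max(pmf))
    lo = next(i - 1 for i in range(1, len(pmf))
              if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff)
    hi = next(i + 1 for i in range(len(pmf) - 2, -1, -1)
              if pmf[i] != 0 and logmode - np.log(pmf[i]) < lr_cutoff)
    return lo, hi


grid = np.arange(600)
pmf = norm.pdf(grid, loc=300, scale=30)
pmf /= pmf.sum()
print(lr_cutoffs(pmf))  # approximately (210, 390)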