Example #1
def get_allele_counts_insertions_from_file_unfiltered(bamfilename,
                                                      length,
                                                      qual_min=30,
                                                      match_len_min=10,
                                                      maxreads=-1,
                                                      VERBOSE=0):
    '''Get the allele counts and insertions
    
    Parameters:
       - maxreads: limit the counts to a random subset of the reads of this size
    '''
    # Prepare output structures
    counts = np.zeros((len(read_types), len(alpha), length), int)
    # Note: the data structure for inserts is a nested dict with:
    # position --> string --> read type --> count
    #  (dict)      (dict)       (list)      (int)
    inserts = defaultdict(
        lambda: defaultdict(lambda: np.zeros(len(read_types), int)))
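    # For example (hypothetical values): inserts[391]['AAC'][2] would be the
    # number of forward read-2 reads supporting an 'AAC' insertion right
    # after reference position 391.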

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if maxreads != -1:
            from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open
            read_iter = extract_mapped_reads_subsample_open(bamfile,
                                                            maxreads,
                                                            VERBOSE=VERBOSE,
                                                            pairs=False)
        else:
            read_iter = bamfile

        # Iterate over single reads
        for i, read in enumerate(read_iter):

            # Max number of reads
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print(i + 1)

            # NOTE: since we change the consensus all the time, mapping is never
            # safe, and we have to filter the results thoroughly.

            # If unmapped/unpaired, trash
            if read.is_unmapped or (not read.is_proper_pair) or (read.isize
                                                                 == 0):
                if VERBOSE >= 3:
                    print 'Read ' + read.qname + ': unmapped/unpaired/no isize'
                continue

            # Get good CIGARs
            (good_cigars, first_good_cigar, last_good_cigar) = \
                    get_ind_good_cigars(read.cigar, match_len_min=match_len_min,
                                        full_output=True)

            # If no good CIGARs, give up
            if not good_cigars.any():
                continue

            # Divide by read 1/2 and forward/reverse
            js = 2 * read.is_read2 + read.is_reverse
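            # i.e. js = 0: read1 fwd, 1: read1 rev, 2: read2 fwd, 3: read2 rev,
            # matching the assumed ordering of the read_types global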

            # Read CIGARs
            seq = np.fromstring(read.seq, 'S1')
            qual = np.fromstring(read.qual, np.int8) - 33
            pos = read.pos
            cigar = read.cigar
            len_cig = len(cigar)

            # Iterate over CIGARs
            for ic, (block_type, block_len) in enumerate(cigar):

                # Check for pos: it should never exceed the length of the fragment
                if (block_type in [0, 1, 2]) and (pos > length):
                    raise ValueError('Pos exceeded the length of the fragment')

                # Inline block
                if block_type == 0:
                    # Keep only stuff from good CIGARs
                    if first_good_cigar <= ic <= last_good_cigar:
                        seqb = seq[:block_len]
                        qualb = qual[:block_len]
                        # Increment counts
                        for j, a in enumerate(alpha):
                            posa = ((seqb == a) &
                                    (qualb >= qual_min)).nonzero()[0]
                            if len(posa):
                                counts[js, j, pos + posa] += 1

                    # Chop off this block
                    if ic != len_cig - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
                        pos += block_len

                # Deletion
                elif block_type == 2:
                    # Keep only stuff from good CIGARs
                    if first_good_cigar <= ic <= last_good_cigar:

                        # Increment gap counts
                        counts[js, 4, pos:pos + block_len] += 1

                    # Chop off pos, but not sequence
                    pos += block_len

                # Insertion
                # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                # THEN the insert, FINALLY comes seq[391:]
                elif block_type == 1:
                    # Keep only stuff from good CIGARs
                    if first_good_cigar <= ic <= last_good_cigar:
                        seqb = seq[:block_len]
                        qualb = qual[:block_len]
                        # Accept only high-quality inserts
                        if (qualb >= qual_min).all():
                            inserts[pos][seqb.tostring()][js] += 1

                    # Chop off seq, but not pos
                    if ic != len_cig - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]

                # Other types of cigar?
                else:
                    raise ValueError('CIGAR type ' + str(block_type) +
                                     ' not recognized')

    return (counts, inserts)
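A minimal usage sketch for the function above. The BAM file name is hypothetical, and the module-level globals alpha and read_types are assumptions inferred from how the function indexes them (in particular, alpha[4] must be the gap character):

import numpy as np

# Assumed globals (not shown in the snippet above)
read_types = ['read1 f', 'read1 r', 'read2 f', 'read2 r']
alpha = np.array(list('ACGT-N'), 'S1')

counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
    'sample_F1_mapped_filtered.bam',   # hypothetical input file
    length=1500,                       # length of the mapped fragment
    maxreads=10000,
    VERBOSE=2)

# Pool the four read types and derive per-position allele frequencies
counts_pooled = counts.sum(axis=0)              # shape (len(alpha), length)
coverage = counts_pooled.sum(axis=0)
freqs = 1.0 * counts_pooled / np.maximum(coverage, 1)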
Example #2
def get_allele_counts_insertions_from_file_unfiltered(bamfilename, length, qual_min=30,
                                                      match_len_min=10,
                                                      maxreads=-1, VERBOSE=0):
    '''Get the allele counts and insertions
    
    Parameters:
       - maxreads: limit the counts to a random subset of the reads of this size
    '''
    # Prepare output structures
    counts = np.zeros((len(read_types), len(alpha), length), int)
    # Note: the data structure for inserts is a nested dict with:
    # position --> string --> read type --> count
    #  (dict)      (dict)       (list)      (int)
    inserts = defaultdict(lambda: defaultdict(lambda: np.zeros(len(read_types), int)))

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if maxreads != -1:
            from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open
            read_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                            VERBOSE=VERBOSE,
                                                            pairs=False)
        else:
            read_iter = bamfile

        # Iterate over single reads
        for i, read in enumerate(read_iter):

            # Max number of reads
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break
        
            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print(i + 1)

            # NOTE: since we change the consensus all the time, mapping is never
            # safe, and we have to filter the results thoroughly.

            # If unmapped/unpaired, trash
            if read.is_unmapped or (not read.is_proper_pair) or (read.isize == 0):
                if VERBOSE >= 3:
                    print 'Read ' + read.qname + ': unmapped/unpaired/no isize'
                continue

            # Get good CIGARs
            (good_cigars, first_good_cigar, last_good_cigar) = \
                    get_ind_good_cigars(read.cigar, match_len_min=match_len_min,
                                        full_output=True)

            # If no good CIGARs, give up
            if not good_cigars.any():
                continue
                    
            # Divide by read 1/2 and forward/reverse
            js = 2 * read.is_read2 + read.is_reverse
        
            # Read CIGARs
            seq = np.fromstring(read.seq, 'S1')
            qual = np.fromstring(read.qual, np.int8) - 33
            pos = read.pos
            cigar = read.cigar
            len_cig = len(cigar)            

            # Iterate over CIGARs
            for ic, (block_type, block_len) in enumerate(cigar):

                # Check for pos: it should never exceed the length of the fragment
                if (block_type in [0, 1, 2]) and (pos > length):
                    raise ValueError('Pos exceeded the length of the fragment')
            
                # Inline block
                if block_type == 0:
                    # Keep only stuff from good CIGARs
                    if first_good_cigar <= ic <= last_good_cigar:
                        seqb = seq[:block_len]
                        qualb = qual[:block_len]
                        # Increment counts
                        for j, a in enumerate(alpha):
                            posa = ((seqb == a) & (qualb >= qual_min)).nonzero()[0]
                            if len(posa):
                                counts[js, j, pos + posa] += 1
            
                    # Chop off this block
                    if ic != len_cig - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
                        pos += block_len
            
                # Deletion
                elif block_type == 2:
                    # Keep only stuff from good CIGARs
                    if first_good_cigar <= ic <= last_good_cigar:

                        # Increment gap counts
                        counts[js, 4, pos:pos + block_len] += 1
            
                    # Chop off pos, but not sequence
                    pos += block_len
            
                # Insertion
                # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                # THEN the insert, FINALLY comes seq[391:]
                elif block_type == 1:
                    # Keep only stuff from good CIGARs
                    if first_good_cigar <= ic <= last_good_cigar:
                        seqb = seq[:block_len]
                        qualb = qual[:block_len]
                        # Accept only high-quality inserts
                        if (qualb >= qual_min).all():
                            inserts[pos][seqb.tostring()][js] += 1
            
                    # Chop off seq, but not pos
                    if ic != len_cig - 1:
                        seq = seq[block_len:]
                        qual = qual[block_len:]
            
                # Other types of cigar?
                else:
                    raise ValueError('CIGAR type '+str(block_type)+' not recognized')

    return (counts, inserts)
Example #3
                if sample[fragment] not in ['ok', 'low']:
                    if VERBOSE >= 1:
                        print 'not "ok". skipping'
                    continue

                if VERBOSE >= 1:
                    print 'ok'

                bamfilename = sample.get_mapped_filtered_filename(
                    fragment, decontaminated=True)
                with pysam.Samfile(bamfilename, 'rb') as bamfile:
                    if maxreads == -1:
                        reads = pair_generator(bamfile)
                    else:
                        reads = extract_mapped_reads_subsample_open(
                            bamfile, maxreads, VERBOSE=VERBOSE)

                    dists[sample.name] = get_distance_reads_sequence(
                        refseq,
                        reads,
                        VERBOSE=VERBOSE,
                        score_match=3,
                        score_mismatch=-3)

            hs = {}
            binmax = max(map(max, dists.itervalues()))
            bins = np.arange(0, binmax, 6)
            bincenters = 0.5 * (bins[1:] + bins[:-1])
            for samplename, dist in dists.iteritems():
                hs[samplename] = np.histogram(dist, bins=bins, density=True)[0]
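A minimal sketch of how the per-sample histograms computed above might then be plotted (toy values stand in for hs and bincenters; the sample names are hypothetical):

import numpy as np
import matplotlib.pyplot as plt

bincenters = np.arange(3, 60, 6)
hs = {'sample_1': np.random.rand(len(bincenters)),
      'sample_2': np.random.rand(len(bincenters))}

fig, ax = plt.subplots(figsize=(10, 5))
for samplename, h in hs.iteritems():
    ax.plot(bincenters, h, lw=2, label=samplename)
ax.set_xlabel('Distance of reads from reference [score units]')
ax.set_ylabel('Density')
ax.legend(loc='best', fontsize=10)
plt.show()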
Example #4
def get_local_block(bamfilename,
                    start,
                    end,
                    VERBOSE=0,
                    maxreads=-1,
                    refroi=None):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        block = []

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile,
                                                             maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(
                bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(
                bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            if start_fwd > start:
                continue
            if end_rev < end:
                continue
            if (end_fwd < end) and (start_rev > start) and (start_rev >
                                                            end_fwd):
                continue

            if VERBOSE >= 3:
                print ' '.join(
                    map('{:>4d}'.format,
                        [start_fwd, end_fwd, start_rev, end_rev]))

            # Gather info from both reads, merge by putting ambiguous nucleotides
            seqs = []
            st_ens = [[start_fwd, end_fwd], [start_rev, end_rev]]
            for ir, read in enumerate(reads):
                (start_read, end_read) = st_ens[ir]
                if (end_read <= start) or (start_read >= end):
                    seqs.append(None)
                    continue

                seq = []
                pos_ref = start_read
                pos_read = 0
                start_block = max(start, start_read) - start
                end_block = min(end, end_read) - start
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl

                    elif bt == 2:
                        if pos_ref + bl > start:
                            st = max(0, start - pos_ref)
                            en = min(bl, end - pos_ref)
                            seq.append('-' * (en - st))
                            if pos_ref + bl >= end:
                                break
                        pos_ref += bl

                    elif bt == 0:
                        if pos_ref + bl > start:
                            st = max(0, start - pos_ref)
                            en = min(bl, end - pos_ref)
                            seq.append(read.seq[pos_read + st:pos_read + en])
                            if pos_ref + bl >= end:
                                break
                        pos_ref += bl
                        pos_read += bl
                seq = ''.join(seq)
                seqs.append((start_block, end_block, seq))

            # Merge sequences if both fwd and rev cover part of it
            if seqs[0] is None:
                seq = seqs[1][2]
            elif seqs[1] is None:
                seq = seqs[0][2]
            else:
                # The fwd read starts before the rev, because of our insert sizes
                end_block_fwd = seqs[0][1]
                start_block_rev = seqs[1][0]
                overlap = [
                    seqs[0][2][start_block_rev:],
                    seqs[1][2][:end_block_fwd - start_block_rev]
                ]

                # The two reads in a pair should have the same length in the overlap
                if len(overlap[0]) != len(overlap[1]):
                    if VERBOSE >= 3:
                        print 'WARNING:', reads[0].qname, 'not same length in overlap!'
                    continue

                ol_fwd = np.fromstring(overlap[0], 'S1')
                ol_rev = np.fromstring(overlap[1], 'S1')

                ol_fwd[ol_fwd != ol_rev] = 'N'
                seq = seqs[0][2][:start_block_rev] + \
                      ol_fwd.tostring() + \
                      seqs[1][2][end_block_fwd - start_block_rev:]

            block.append(seq)

        if VERBOSE >= 2:
            print ''

    return block
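A minimal sketch of how the returned block could be used, e.g. to call a simple column-wise majority consensus over the region (file name and coordinates are hypothetical; each returned string should span the full [start, end) region):

import numpy as np

block = get_local_block('sample_F1_mapped_filtered.bam', 2000, 2200,
                        VERBOSE=1, maxreads=1000)

# Stack the reads covering the region into a character matrix
mat = np.array([np.fromstring(seq, 'S1') for seq in block])

# Majority call per column ('-' and 'N' compete like any other character)
consensus = []
for col in mat.T:
    chars, cts = np.unique(col, return_counts=True)
    consensus.append(chars[cts.argmax()])
consensus = ''.join(consensus)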
Example #5
                    print sample.name,

                if sample[fragment] not in ['ok', 'low']:
                    if VERBOSE >= 1:
                        print 'not "ok". skipping'
                    continue

                if VERBOSE >= 1:
                    print 'ok'
                               
                bamfilename = sample.get_mapped_filtered_filename(fragment, decontaminated=True)
                with pysam.Samfile(bamfilename, 'rb') as bamfile:
                    if maxreads == -1:
                        reads = pair_generator(bamfile)
                    else:
                        reads = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                                    VERBOSE=VERBOSE)

                    dists[sample.name] = get_distance_reads_sequence(refseq, reads,
                                                                     VERBOSE=VERBOSE,
                                                                     score_match=3,
                                                                     score_mismatch=-3)


            hs = {}
            binmax = max(map(max, dists.itervalues()))
            bins = np.arange(0, binmax, 6)
            bincenters = 0.5 * (bins[1:] + bins[:-1])
            for samplename, dist in dists.iteritems():
                hs[samplename] = np.histogram(dist, bins=bins, density=True)[0]

            fig, ax = plt.subplots(figsize=(10, 5))
Example #6
def get_local_haplotypes(bamfilename, start, end, VERBOSE=0, maxreads=-1,
                         label=''):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    from collections import Counter
    haplotypes = Counter()

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    if label:
                        sys.stdout.write(label+'\t')
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            overlap_len = max(0, end_fwd - start_rev)

            # Various scenarios possible
            if start_fwd > start:
                continue

            if end_rev < end:
                continue

            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(reads[0], start, end)

            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(reads[1], start, end)

            else:
                seqs = [trim_read_roi(read, start, end) for read in reads]
                seq = merge_read_pair(*seqs)

            haplotypes[seq] += 1
            if VERBOSE >= 4:
                import ipdb; ipdb.set_trace()

    if VERBOSE >= 2:
        if irp >= 10000:
            sys.stdout.write('\n')
            sys.stdout.flush()

    return haplotypes
Example #7
def get_local_haplotypes(bamfilename,
                         start,
                         end,
                         VERBOSE=0,
                         maxreads=-1,
                         label=''):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    from collections import Counter
    haplotypes = Counter()

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile,
                                                             maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    if label:
                        sys.stdout.write(label + '\t')
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(
                bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(
                bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            overlap_len = max(0, end_fwd - start_rev)

            # Various scenarios possible
            if start_fwd > start:
                continue

            if end_rev < end:
                continue

            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(reads[0], start, end)

            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(reads[1], start, end)

            else:
                seqs = [trim_read_roi(read, start, end) for read in reads]
                seq = merge_read_pair(*seqs)

            haplotypes[seq] += 1
            if VERBOSE >= 4:
                import ipdb
                ipdb.set_trace()

    if VERBOSE >= 2:
        if irp >= 10000:
            sys.stdout.write('\n')
            sys.stdout.flush()

    return haplotypes
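A minimal usage sketch for get_local_haplotypes (the file name and coordinates are hypothetical), turning the returned Counter into haplotype frequencies:

haplotypes = get_local_haplotypes('sample_F1_mapped_filtered.bam', 2000, 2200,
                                  VERBOSE=1, maxreads=10000)

# Most abundant local haplotypes and their frequencies
total = sum(haplotypes.itervalues())
for seq, abundance in haplotypes.most_common(5):
    print abundance, 1.0 * abundance / total, seq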
Example #8
def quality_score_along_reads_mapped(read_len, bamfilename,
                                     insertsize_range=[400, 1000],
                                     skipreads=0,
                                     maxreads=-1,
                                     randomreads=True,
                                     VERBOSE=0):
    '''Calculate the quality score along the reads'''
    from itertools import izip
    from hivwholeseq.utils.mapping import trim_read_pair_crossoverhangs as trim_coh
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    quality = [[[] for j in xrange(read_len)] for i in xrange(2)]

    # Precompute conversion table
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = dict()
    for letter in range(0, 255):
        q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET
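    # e.g. the ASCII character '?' (ordinal 63) decodes to Phred quality 63 - 33 = 30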

    # Iterate over all reads (using fast iterators)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if not randomreads:
            reads_all = []
            for i, read_pair in enumerate(pair_generator(bamfile)):
                if i < skipreads:
                    continue
    
                if i == skipreads + maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break
    
                if VERBOSE and (not ((i + 1) % 10000)):
                    print i + 1

                reads_all.append(read_pair)

        else:
            reads_all = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                            VERBOSE=VERBOSE)

        print len(reads_all)

        for reads in reads_all:

            # Check insert size
            read = reads[reads[0].is_reverse]
            if (read.is_unmapped or (not read.is_proper_pair) or \
                (read.isize < insertsize_range[0]) or \
                (read.isize >= insertsize_range[1])):
                continue

            trim_coh(reads, trim=5, include_tests=False)

            for read in reads:
                ip = read.is_read2
                pos_read = 0    # position within this read's sequence/quality string
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl
                    elif bt == 2:
                        pass
                    elif bt == 0:
                        qualb = read.qual[pos_read: pos_read + bl]
                        poss_read = np.arange(pos_read, pos_read + bl)
                        if read.is_reverse:
                            poss_read = len(read.seq) - 1 - poss_read

                        for j, qletter in izip(poss_read, qualb):
                            quality[ip][j].append(q_mapping[qletter])

    for qual in quality:
        for qpos in qual:
            qpos.sort()

    return quality
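A sketch of how the returned structure might be summarized, e.g. as the median Phred quality per sequencing cycle for read 1 and read 2 (the inputs are hypothetical):

import numpy as np

quality = quality_score_along_reads_mapped(250, 'sample_F1_mapped_filtered.bam',
                                           maxreads=5000, VERBOSE=1)

for ip, qual in enumerate(quality):
    # qual[j] holds the sorted Phred scores observed at cycle j
    medians = [np.median(qpos) if len(qpos) else None for qpos in qual]
    print 'read', ip + 1, 'median quality, first 10 cycles:', medians[:10]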