def get_allele_counts_insertions_from_file_unfiltered(bamfilename, length, qual_min=30, match_len_min=10, maxreads=-1, VERBOSE=0): '''Get the allele counts and insertions Parameters: - maxreads: limit the counts to a random subset of the reads of this size ''' # Prepare output structures counts = np.zeros((len(read_types), len(alpha), length), int) # Note: the data structure for inserts is a nested dict with: # position --> string --> read type --> count # (dict) (dict) (list) (int) inserts = defaultdict( lambda: defaultdict(lambda: np.zeros(len(read_types), int))) # Open BAM file # Note: the reads should already be filtered of unmapped stuff at this point with pysam.Samfile(bamfilename, 'rb') as bamfile: if maxreads != -1: from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open read_iter = extract_mapped_reads_subsample_open(bamfile, maxreads, VERBOSE=VERBOSE, pairs=False) else: read_iter = bamfile # Iterate over single reads for i, read in enumerate(read_iter): # Max number of reads if i == maxreads: if VERBOSE >= 2: print 'Max reads reached:', maxreads break # Print output if (VERBOSE >= 3) and (not ((i + 1) % 10000)): print(i + 1) # NOTE: since we change the consensus all the time, mapping is never # safe, and we have to filter the results thoroughly. 
# If unmapped/unpaired, trash if read.is_unmapped or (not read.is_proper_pair) or (read.isize == 0): if VERBOSE >= 3: print 'Read ' + read.qname + ': unmapped/unpaired/no isize' continue # Get good CIGARs (good_cigars, first_good_cigar, last_good_cigar) = \ get_ind_good_cigars(read.cigar, match_len_min=match_len_min, full_output=True) # If no good CIGARs, give up if not good_cigars.any(): continue # Divide by read 1/2 and forward/reverse js = 2 * read.is_read2 + read.is_reverse # Read CIGARs seq = np.fromstring(read.seq, 'S1') qual = np.fromstring(read.qual, np.int8) - 33 pos = read.pos cigar = read.cigar len_cig = len(cigar) # Iterate over CIGARs for ic, (block_type, block_len) in enumerate(cigar): # Check for pos: it should never exceed the length of the fragment if (block_type in [0, 1, 2]) and (pos > length): raise ValueError('Pos exceeded the length of the fragment') # Inline block if block_type == 0: # Keep only stuff from good CIGARs if first_good_cigar <= ic <= last_good_cigar: seqb = seq[:block_len] qualb = qual[:block_len] # Increment counts for j, a in enumerate(alpha): posa = ((seqb == a) & (qualb >= qual_min)).nonzero()[0] if len(posa): counts[js, j, pos + posa] += 1 # Chop off this block if ic != len_cig - 1: seq = seq[block_len:] qual = qual[block_len:] pos += block_len # Deletion elif block_type == 2: # Keep only stuff from good CIGARs if first_good_cigar <= ic <= last_good_cigar: # Increment gap counts counts[js, 4, pos:pos + block_len] += 1 # Chop off pos, but not sequence pos += block_len # Insertion # an insert @ pos 391 means that seq[:391] is BEFORE the insert, # THEN the insert, FINALLY comes seq[391:] elif block_type == 1: # Keep only stuff from good CIGARs if first_good_cigar <= ic <= last_good_cigar: seqb = seq[:block_len] qualb = qual[:block_len] # Accept only high-quality inserts if (qualb >= qual_min).all(): inserts[pos][seqb.tostring()][js] += 1 # Chop off seq, but not pos if ic != len_cig - 1: seq = seq[block_len:] qual = 
qual[block_len:] # Other types of cigar? else: raise ValueError('CIGAR type ' + str(block_type) + ' not recognized') return (counts, inserts)
def get_allele_counts_insertions_from_file_unfiltered(bamfilename, length, qual_min=30, match_len_min=10, maxreads=-1, VERBOSE=0): '''Get the allele counts and insertions Parameters: - maxreads: limit the counts to a random subset of the reads of this size ''' # Prepare output structures counts = np.zeros((len(read_types), len(alpha), length), int) # Note: the data structure for inserts is a nested dict with: # position --> string --> read type --> count # (dict) (dict) (list) (int) inserts = defaultdict(lambda: defaultdict(lambda: np.zeros(len(read_types), int))) # Open BAM file # Note: the reads should already be filtered of unmapped stuff at this point with pysam.Samfile(bamfilename, 'rb') as bamfile: if maxreads != -1: from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open read_iter = extract_mapped_reads_subsample_open(bamfile, maxreads, VERBOSE=VERBOSE, pairs=False) else: read_iter = bamfile # Iterate over single reads for i, read in enumerate(read_iter): # Max number of reads if i == maxreads: if VERBOSE >= 2: print 'Max reads reached:', maxreads break # Print output if (VERBOSE >= 3) and (not ((i +1) % 10000)): print (i+1) # NOTE: since we change the consensus all the time, mapping is never # safe, and we have to filter the results thoroughly. 
# If unmapped/unpaired, trash if read.is_unmapped or (not read.is_proper_pair) or (read.isize == 0): if VERBOSE >= 3: print 'Read '+read.qname+': unmapped/unpaired/no isize' continue # Get good CIGARs (good_cigars, first_good_cigar, last_good_cigar) = \ get_ind_good_cigars(read.cigar, match_len_min=match_len_min, full_output=True) # If no good CIGARs, give up if not good_cigars.any(): continue # Divide by read 1/2 and forward/reverse js = 2 * read.is_read2 + read.is_reverse # Read CIGARs seq = np.fromstring(read.seq, 'S1') qual = np.fromstring(read.qual, np.int8) - 33 pos = read.pos cigar = read.cigar len_cig = len(cigar) # Iterate over CIGARs for ic, (block_type, block_len) in enumerate(cigar): # Check for pos: it should never exceed the length of the fragment if (block_type in [0, 1, 2]) and (pos > length): raise ValueError('Pos exceeded the length of the fragment') # Inline block if block_type == 0: # Keep only stuff from good CIGARs if first_good_cigar <= ic <= last_good_cigar: seqb = seq[:block_len] qualb = qual[:block_len] # Increment counts for j, a in enumerate(alpha): posa = ((seqb == a) & (qualb >= qual_min)).nonzero()[0] if len(posa): counts[js, j, pos + posa] += 1 # Chop off this block if ic != len_cig - 1: seq = seq[block_len:] qual = qual[block_len:] pos += block_len # Deletion elif block_type == 2: # Keep only stuff from good CIGARs if first_good_cigar <= ic <= last_good_cigar: # Increment gap counts counts[js, 4, pos:pos + block_len] += 1 # Chop off pos, but not sequence pos += block_len # Insertion # an insert @ pos 391 means that seq[:391] is BEFORE the insert, # THEN the insert, FINALLY comes seq[391:] elif block_type == 1: # Keep only stuff from good CIGARs if first_good_cigar <= ic <= last_good_cigar: seqb = seq[:block_len] qualb = qual[:block_len] # Accept only high-quality inserts if (qualb >= qual_min).all(): inserts[pos][seqb.tostring()][js] += 1 # Chop off seq, but not pos if ic != len_cig - 1: seq = seq[block_len:] qual = qual[block_len:] 
# Other types of cigar? else: raise ValueError('CIGAR type '+str(block_type)+' not recognized') return (counts, inserts)
# NOTE(review): fragment of a script. The statements down to the assignment of
# dists[sample.name] use `continue` and `sample`, so they presumably form the
# body of a per-sample loop whose header is outside this view; the original
# indentation was lost -- confirm nesting before editing.

# Skip samples whose status for this fragment is neither 'ok' nor 'low'
if sample[fragment] not in ['ok', 'low']:
    if VERBOSE >= 1:
        print 'not "ok". skipping'
    continue

if VERBOSE >= 1:
    print 'ok'

bamfilename = sample.get_mapped_filtered_filename(
    fragment, decontaminated=True)
with pysam.Samfile(bamfilename, 'rb') as bamfile:
    # maxreads == -1 means all read pairs, otherwise a subsample
    if maxreads == -1:
        reads = pair_generator(bamfile)
    else:
        reads = extract_mapped_reads_subsample_open(
            bamfile, maxreads, VERBOSE=VERBOSE)

    # Kept inside the with block: reads may be consumed lazily from bamfile
    dists[sample.name] = get_distance_reads_sequence(
        refseq, reads, VERBOSE=VERBOSE, score_match=3, score_mismatch=-3)

# NOTE(review): from here on presumably runs once, after all samples
hs = {}
# Largest value found in any sample, used as upper limit for the bins
binmax = max(map(max, dists.itervalues()))
# Bin edges in steps of 6, shared by all samples
bins = np.arange(0, binmax, 6)
bincenters = 0.5 * (bins[1:] + bins[:-1])
for samplename, dist in dists.iteritems():
    # [0] keeps the normalized heights and drops the bin edges
    hs[samplename] = np.histogram(dist, bins=bins, density=True)[0]
def get_local_block(bamfilename, start, end, VERBOSE=0, maxreads=-1, refroi=None): '''Extract reads fully covering the region, discarding insertions''' import sys import pysam from hivwholeseq.utils.mapping import pair_generator from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open with pysam.Samfile(bamfilename, 'rb') as bamfile: block = [] if maxreads == -1: reads_iter = pair_generator(bamfile) else: reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads, VERBOSE=VERBOSE, pairs=True) for irp, reads in enumerate(reads_iter): if VERBOSE >= 2: if not ((irp + 1) % 10000): if irp + 1 != 10000: sys.stdout.write("\x1b[1A\n") sys.stdout.write(str(irp + 1)) sys.stdout.flush() # Sort fwd read first is_fwd = reads[0].is_reverse reads = [reads[is_fwd], reads[not is_fwd]] # Check for coverage of the region start_fwd = reads[0].pos end_fwd = start_fwd + sum( bl for (bt, bl) in reads[0].cigar if bt in (0, 2)) start_rev = reads[1].pos end_rev = start_rev + sum( bl for (bt, bl) in reads[1].cigar if bt in (0, 2)) if start_fwd > start: continue if end_rev < end: continue if (end_fwd < end) and (start_rev > start) and (start_rev > end_fwd): continue if VERBOSE >= 3: print ' '.join( map('{:>4d}'.format, [start_fwd, end_fwd, start_rev, end_rev])) # Gather info from both reads, merge by putting ambiguous nucleotides seqs = [] st_ens = [[start_fwd, end_fwd], [start_rev, end_rev]] for ir, read in enumerate(reads): (start_read, end_read) = st_ens[ir] if (end_read <= start) or (start_read >= end): seqs.append(None) continue seq = [] pos_ref = start_read pos_read = 0 start_block = max(start, start_read) - start end_block = min(end, end_read) - start for (bt, bl) in read.cigar: if bt == 1: pos_read += bl elif bt == 2: if pos_ref + bl > start: st = max(0, start - pos_ref) en = min(bl, end - pos_ref) seq.append('-' * (en - st)) if pos_ref + bl >= end: break pos_ref += bl elif bt == 0: if pos_ref + bl > start: st = max(0, start - pos_ref) en = min(bl, end - 
pos_ref) seq.append(read.seq[pos_read + st:pos_read + en]) if pos_ref + bl >= end: break pos_ref += bl pos_read += bl seq = ''.join(seq) seqs.append((start_block, end_block, seq)) # Merge sequences if both fwd and rev cover part of it if seqs[0] is None: seq = seqs[1][2] elif seqs[1] is None: seq = seqs[0][2] else: # The fwd read starts before the rev, because of our insert sizes end_block_fwd = seqs[0][1] start_block_rev = seqs[1][0] overlap = [ seqs[0][2][start_block_rev:], seqs[1][2][:end_block_fwd - start_block_rev] ] # The two reads in a pair should have the same length in the overlap if len(overlap[0]) != len(overlap[1]): if VERBOSE >= 3: print 'WARNING:', reads[ 0].qname, 'not same length in overlap!' continue ol_fwd = np.fromstring(overlap[0], 'S1') ol_rev = np.fromstring(overlap[1], 'S1') ol_fwd[ol_fwd != ol_rev] = 'N' seq = seqs[0][2][:start_block_rev] + \ ol_fwd.tostring() + \ seqs[1][2][end_block_fwd - start_block_rev:] block.append(seq) if VERBOSE >= 2: print '' return block
# NOTE(review): fragment of a script. The statements down to the assignment of
# dists[sample.name] use `continue` and `sample`, so they presumably form the
# body of a per-sample loop whose header is outside this view; the original
# indentation was lost -- confirm nesting before editing.

# Trailing comma: the status printed below continues on the same line
print sample.name,

# Skip samples whose status for this fragment is neither 'ok' nor 'low'
if sample[fragment] not in ['ok', 'low']:
    if VERBOSE >= 1:
        print 'not "ok". skipping'
    continue

if VERBOSE >= 1:
    print 'ok'

bamfilename = sample.get_mapped_filtered_filename(fragment, decontaminated=True)
with pysam.Samfile(bamfilename, 'rb') as bamfile:
    # maxreads == -1 means all read pairs, otherwise a subsample
    if maxreads == -1:
        reads = pair_generator(bamfile)
    else:
        reads = extract_mapped_reads_subsample_open(bamfile, maxreads, VERBOSE=VERBOSE)

    # Kept inside the with block: reads may be consumed lazily from bamfile
    dists[sample.name] = get_distance_reads_sequence(refseq, reads, VERBOSE=VERBOSE, score_match=3, score_mismatch=-3)

# NOTE(review): from here on presumably runs once, after all samples
hs = {}
# Largest value found in any sample, used as upper limit for the bins
binmax = max(map(max, dists.itervalues()))
# Bin edges in steps of 6, shared by all samples
bins = np.arange(0, binmax, 6)
bincenters = 0.5 * (bins[1:] + bins[:-1])
for samplename, dist in dists.iteritems():
    # [0] keeps the normalized heights and drops the bin edges
    hs[samplename] = np.histogram(dist, bins=bins, density=True)[0]

fig, ax = plt.subplots(figsize=(10, 5))
def get_local_haplotypes(bamfilename, start, end, VERBOSE=0, maxreads=-1, label=''):
    '''Count the haplotypes of the read pairs covering a region

    Parameters:
       - bamfilename: BAM file to open with pysam.Samfile
       - start, end: region of interest, in the coordinates of read.pos
       - VERBOSE: verbosity level (>= 2 shows progress, >= 4 drops into ipdb
         after every accepted pair)
       - maxreads: -1 to use all pairs, otherwise size of the subsample
       - label: optional prefix for the progress line

    Returns:
       - a collections.Counter mapping each haplotype, as returned by
         trim_read_roi / merge_read_pair, to the number of pairs showing it

    NOTE(review): the original summary says insertions are discarded; that
    would happen inside trim_read_roi, which is not visible here -- confirm.
    '''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open
    from collections import Counter

    haplotypes = Counter()

    # Bound before the loop so that the final report also works when the
    # iterator yields no pairs at all
    irp = -1

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    # Overwrite the previous progress line, if there is one
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    if label:
                        sys.stdout.write(label+'\t')
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            rev_first = reads[0].is_reverse
            reads = [reads[rev_first], reads[not rev_first]]

            # Span of each read on the reference (matches + deletions)
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(bl for (bt, bl) in reads[0].cigar if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(bl for (bt, bl) in reads[1].cigar if bt in (0, 2))
            overlap_len = max(0, end_fwd - start_rev)

            # The pair as a whole must span the region
            if start_fwd > start:
                continue

            if end_rev < end:
                continue

            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases: one read is enough, or the two get merged
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(reads[0], start, end)
            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(reads[1], start, end)
            else:
                seqs = [trim_read_roi(read, start, end) for read in reads]
                seq = merge_read_pair(*seqs)

            haplotypes[seq] += 1

            # NOTE(review): interactive debugging hook, kept as in the original
            if VERBOSE >= 4:
                import ipdb
                ipdb.set_trace()

    if VERBOSE >= 2:
        # A progress line exists as soon as 10000 pairs have been seen, i.e.
        # from irp == 9999 on: terminate it in exactly those cases
        if irp + 1 >= 10000:
            sys.stdout.write('\n')
            sys.stdout.flush()

    return haplotypes
def get_local_haplotypes(bamfilename, start, end, VERBOSE=0, maxreads=-1, label=''):
    '''Extract the haplotypes of read pairs covering the region, with counts

    Parameters:
       - bamfilename: BAM file to open with pysam.Samfile
       - start, end: boundaries of the region, in the coordinates of read.pos
       - VERBOSE: verbosity level; progress is shown from 2, and from 4 an
         ipdb session is opened after each accepted pair
       - maxreads: number of pairs to subsample, or -1 for all of them
       - label: if not empty, written in front of the progress counter

    Returns:
       - collections.Counter: haplotype (the value returned by trim_read_roi
         or merge_read_pair) --> number of read pairs

    NOTE(review): whether insertions are discarded, as the original summary
    states, is decided by trim_read_roi, defined elsewhere -- confirm.
    '''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open
    from collections import Counter

    def ref_end(read):
        '''End of the read on the reference: matches and deletions count'''
        return read.pos + sum(bl for (bt, bl) in read.cigar if bt in (0, 2))

    haplotypes = Counter()

    # Number of pairs seen so far; stays 0 if the iterator is empty, so the
    # final report never touches an unbound loop variable
    n_pairs = 0

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for n_pairs, reads in enumerate(reads_iter, 1):
            if (VERBOSE >= 2) and (n_pairs % 10000 == 0):
                # From the second report on, go back up to overwrite the line
                if n_pairs != 10000:
                    sys.stdout.write("\x1b[1A\n")
                if label:
                    sys.stdout.write(label + '\t')
                sys.stdout.write(str(n_pairs))
                sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            if reads[0].is_reverse:
                read_fwd, read_rev = reads[1], reads[0]
            else:
                read_fwd, read_rev = reads[0], reads[1]

            start_fwd = read_fwd.pos
            end_fwd = ref_end(read_fwd)
            start_rev = read_rev.pos
            end_rev = ref_end(read_rev)
            overlap_len = max(0, end_fwd - start_rev)

            # The pair must reach both ends of the region
            if (start_fwd > start) or (end_rev < end):
                continue

            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(read_fwd, start, end)
            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(read_rev, start, end)
            else:
                seq = merge_read_pair(trim_read_roi(read_fwd, start, end),
                                      trim_read_roi(read_rev, start, end))

            haplotypes[seq] += 1

            # NOTE(review): interactive debugging hook, kept as in the original
            if VERBOSE >= 4:
                import ipdb
                ipdb.set_trace()

    # Terminate the progress line whenever one has been written, which is the
    # case as soon as 10000 pairs have been seen (the original test missed the
    # case of exactly 10000 pairs)
    if (VERBOSE >= 2) and (n_pairs >= 10000):
        sys.stdout.write('\n')
        sys.stdout.flush()

    return haplotypes
def quality_score_along_reads_mapped(read_len, bamfilename, insertsize_range=[400, 1000], skipreads=0, maxreads=-1, randomreads=True, VERBOSE=0): '''Calculate the quality score along the reads''' from hivwholeseq.utils.mapping import trim_read_pair_crossoverhangs as trim_coh from hivwholeseq.utils.mapping import pair_generator quality = [[[] for j in xrange(read_len)] for i in xrange(2)] # Precompute conversion table SANGER_SCORE_OFFSET = ord("!") q_mapping = dict() for letter in range(0, 255): q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET # Iterate over all reads (using fast iterators) with pysam.Samfile(bamfilename, 'rb') as bamfile: if not randomreads: reads_all = [] for i, read_pair in enumerate(pair_generator(bamfile)): if i < skipreads: continue if i == skipreads + maxreads: if VERBOSE: print 'Maximal number of read pairs reached:', maxreads break if VERBOSE and (not ((i + 1) % 10000)): print i + 1 reads_all.append(read_pair) else: reads_all = extract_mapped_reads_subsample_open(bamfile, maxreads, VERBOSE=VERBOSE) print len(reads_all) for reads in reads_all: # Check insert size read = reads[reads[0].is_reverse] if (read.is_unmapped or (not read.is_proper_pair) or \ (read.isize < insertsize_range[0]) or \ (read.isize >= insertsize_range[1])): continue trim_coh(reads, trim=5, include_tests=False) pos_read = 0 for read in reads: ip = read.is_read2 for (bt, bl) in read.cigar: if bt == 1: pos_read += bl elif bt == 2: pass elif bt == 0: qualb = read.qual[pos_read: pos_read + bl] poss_read = np.arange(pos_read, pos_read + bl) if read.is_reverse: poss_read = len(read.seq) - 1 - poss_read for j, qletter in izip(poss_read, qualb): quality[ip][j].append(q_mapping[qletter]) for qual in quality: for qpos in qual: qpos.sort() return quality