def split_reads(data_folder, adaID, fragment, chunk_size=10000, maxreads=-1, VERBOSE=0):
    '''Split reads into chunks for mapping.

    Read pairs are taken from the BAM file named by
    get_divided_filename(data_folder, adaID, fragment, type='bam') and written,
    in order, to consecutive chunk files named by the same function with
    chunk=1, 2, ... Each chunk file uses the input file as header template.

    Parameters:
       data_folder, adaID, fragment: forwarded to get_divided_filename to
          locate the input file and name the chunk files.
       chunk_size: number of read pairs per chunk file (must be >= 1).
       maxreads: stop after this many read pairs; -1 means no limit.
       VERBOSE: 0 is silent, >= 1 prints a summary, >= 2 prints progress.

    Returns:
       None. The chunk files are the only output.

    Raises:
       ValueError: if chunk_size is smaller than 1.
    '''
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

    # NOTE: print is always called with a single parenthesised argument, which
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        if VERBOSE:
            if maxreads == -1:
                # Two reads per pair
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads

            # Ceiling division: an exact multiple of chunk_size must not be
            # counted as one extra (empty) chunk.
            print('Expected number of chunks: %s' % (-(-n_reads // chunk_size),))

        chunk_number = 0
        chunkfile = None
        try:
            for irp, read_pair in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2 and not ((irp + 1) % 10000):
                    print(irp + 1)

                # Every chunk_size pairs, rotate to a new chunk file
                if not (irp % chunk_size):
                    if chunkfile is not None:
                        chunkfile.close()
                        # Forget the closed handle so that a failure while
                        # opening the next file does not close it twice.
                        chunkfile = None

                    chunk_number += 1
                    chunk_filename = get_divided_filename(data_folder, adaID, fragment,
                                                          type='bam',
                                                          chunk=chunk_number)
                    chunkfile = pysam.Samfile(chunk_filename, 'wb', template=bamfile)

                    if VERBOSE >= 2:
                        print('Chunk n %s started' % (chunk_number,))

                chunkfile.write(read_pair[0])
                chunkfile.write(read_pair[1])

        finally:
            # Close the last chunk even if reading or writing failed midway
            if chunkfile is not None:
                chunkfile.close()

    if VERBOSE:
        print('Chunking finished')
def split_reads(data_folder, adaID, fragment, chunk_size=10000, maxreads=-1, VERBOSE=0):
    '''Split reads into chunks for mapping.

    Read pairs are taken from the BAM file named by
    get_divided_filename(data_folder, adaID, fragment, type='bam') and written,
    in order, to consecutive chunk files named by the same function with
    chunk=1, 2, ... Each chunk file uses the input file as header template.

    Parameters:
       data_folder, adaID, fragment: forwarded to get_divided_filename to
          locate the input file and name the chunk files.
       chunk_size: number of read pairs per chunk file (must be >= 1).
       maxreads: stop after this many read pairs; -1 means no limit.
       VERBOSE: 0 is silent, >= 1 prints a summary, >= 2 prints progress.

    Returns:
       None. The chunk files are the only output.

    Raises:
       ValueError: if chunk_size is smaller than 1.
    '''
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

    # NOTE: print is always called with a single parenthesised argument, which
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    with pysam.Samfile(input_filename, 'rb') as bamfile:

        if VERBOSE:
            if maxreads == -1:
                # Two reads per pair
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads

            # Ceiling division: an exact multiple of chunk_size must not be
            # counted as one extra (empty) chunk.
            print('Expected number of chunks: %s' % (-(-n_reads // chunk_size),))

        chunk_number = 0
        chunkfile = None
        try:
            for irp, read_pair in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2 and not ((irp + 1) % 10000):
                    print(irp + 1)

                # Every chunk_size pairs, rotate to a new chunk file
                if not (irp % chunk_size):
                    if chunkfile is not None:
                        chunkfile.close()
                        # Forget the closed handle so that a failure while
                        # opening the next file does not close it twice.
                        chunkfile = None

                    chunk_number += 1
                    chunk_filename = get_divided_filename(data_folder, adaID, fragment,
                                                          type='bam',
                                                          chunk=chunk_number)
                    chunkfile = pysam.Samfile(chunk_filename, 'wb', template=bamfile)

                    if VERBOSE >= 2:
                        print('Chunk n %s started' % (chunk_number,))

                chunkfile.write(read_pair[0])
                chunkfile.write(read_pair[1])

        finally:
            # Close the last chunk even if reading or writing failed midway
            if chunkfile is not None:
                chunkfile.close()

    if VERBOSE:
        print('Chunking finished')
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads.

    The consensus is assembled block by block:
      1. a first block is seeded from the reads that start earliest,
      2. further blocks are built from reads that fully cover them and are
         joined to the growing consensus,
      3. if a block has no fully covering reads before len_reference is
         reached, one final block is built from the reads covering its start.

    Parameters:
       bamfilename: path of the BAM file, opened with pysam.Samfile.
       len_reference: position at which the block-wise walk stops
          (presumably the length of the mapping reference - confirm with caller).
       VERBOSE: verbosity level (>= 1 title, >= 2 per-block messages,
          >= 3 also the number of reads in the file).
       block_len: length of each block.
       reads_per_alignment: maximal number of reads used for each local
          consensus; if more are available, a random subset is used.
       deltamax: forwarded to join_block_to_consensus.

    Returns:
       The consensus as produced by build_local_consensus and extended by
       join_block_to_consensus.

    Raises:
       ValueError: if no read can seed the first block (no reads in the file,
          or block_len too small for the search window to advance).
    '''
    # NOTE: print is always called with a single parenthesised argument, which
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    if VERBOSE:
        print('Build consensus')

    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.sequence import build_local_consensus

    def make_seqrecs(strings):
        '''Wrap plain strings into SeqRecords named by their index.'''
        return [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                for i, s in enumerate(strings)]

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print('The bamfile has %s reads.' % (get_number_reads_open(bamfile),))

        # Get first block covered, even if partially, and record where each
        # read started. The accepted start window grows by half a block per
        # pass over the file until at least one read is found.
        if VERBOSE >= 2:
            print('First block')
        step = block_len // 2
        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * step
            n_reads_seen = 0
            for read in bamfile:
                n_reads_seen += 1
                if read.pos <= start_block:
                    # NOTE(review): if read.pos > block_len the slice end is
                    # negative and most of the read is kept - confirm intent.
                    seqs.append((read.pos,
                                 ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1

            # Without reads, or with a window that cannot grow, another pass
            # can never succeed: fail instead of looping forever.
            if (not seqs) and ((n_reads_seen == 0) or (step < 1)):
                raise ValueError('No reads found to seed the first block of the consensus')

        # If there are too many reads, take the reads that start earliest
        # (the shuffle randomizes the order among reads with equal start)
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = make_seqrecs(s for (pos, s) in seqs)
        consensus = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)

        # Block by block, make local alignment and join to previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a
        #    final block
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print('block n. %s region: %s' % (n_block, edges))

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If we do not find reads that fully cover, consider it the end of
            # the consensus, only the final block is missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment of the
            # reads trimmed to the block
            seqrecs = make_seqrecs(seqs)
            cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=True)

            # Join to the rest of the consensus; consecutive blocks overlap
            # because the start advances by two thirds of a block only
            consensus = join_block_to_consensus(consensus, cons_block,
                                                VERBOSE=VERBOSE, deltamax=deltamax)

            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        else:
            return consensus

        if VERBOSE >= 2:
            print('final block')

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile, start_block, VERBOSE=VERBOSE)

        # No read reaches this block at all: nothing left to append
        if not seqs:
            return consensus

        # Too many reads: keep the longest ones
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

        # Complete with N, approximately. Pad to the longest read: seqs[0] is
        # the longest only when the list has been sorted above.
        sl = max(len(s) for s in seqs)
        seqs = [s + ('N' * (sl - len(s))) for s in seqs]
        seqrecs = make_seqrecs(seqs)
        cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)
        consensus = join_block_to_consensus(consensus, cons_block,
                                            VERBOSE=VERBOSE, deltamax=deltamax)

    return consensus
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads.

    The consensus is assembled block by block:
      1. a first block is seeded from the reads that start earliest,
      2. further blocks are built from reads that fully cover them and are
         joined to the growing consensus,
      3. if a block has no fully covering reads before len_reference is
         reached, one final block is built from the reads covering its start.

    Parameters:
       bamfilename: path of the BAM file, opened with pysam.Samfile.
       len_reference: position at which the block-wise walk stops
          (presumably the length of the mapping reference - confirm with caller).
       VERBOSE: verbosity level (>= 1 title, >= 2 per-block messages,
          >= 3 also the number of reads in the file).
       block_len: length of each block.
       reads_per_alignment: maximal number of reads used for each local
          consensus; if more are available, a random subset is used.
       deltamax: forwarded to join_block_to_consensus.

    Returns:
       The consensus as produced by build_local_consensus and extended by
       join_block_to_consensus.

    Raises:
       ValueError: if no read can seed the first block (no reads in the file,
          or block_len too small for the search window to advance).
    '''
    # NOTE: print is always called with a single parenthesised argument, which
    # produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    if VERBOSE:
        print('Build consensus')

    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.sequence import build_local_consensus

    def make_seqrecs(strings):
        '''Wrap plain strings into SeqRecords named by their index.'''
        return [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                for i, s in enumerate(strings)]

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print('The bamfile has %s reads.' % (get_number_reads_open(bamfile),))

        # Get first block covered, even if partially, and record where each
        # read started. The accepted start window grows by half a block per
        # pass over the file until at least one read is found.
        if VERBOSE >= 2:
            print('First block')
        step = block_len // 2
        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * step
            n_reads_seen = 0
            for read in bamfile:
                n_reads_seen += 1
                if read.pos <= start_block:
                    # NOTE(review): if read.pos > block_len the slice end is
                    # negative and most of the read is kept - confirm intent.
                    seqs.append((read.pos,
                                 ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1

            # Without reads, or with a window that cannot grow, another pass
            # can never succeed: fail instead of looping forever.
            if (not seqs) and ((n_reads_seen == 0) or (step < 1)):
                raise ValueError('No reads found to seed the first block of the consensus')

        # If there are too many reads, take the reads that start earliest
        # (the shuffle randomizes the order among reads with equal start)
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = make_seqrecs(s for (pos, s) in seqs)
        consensus = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)

        # Block by block, make local alignment and join to previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a
        #    final block
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print('block n. %s region: %s' % (n_block, edges))

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If we do not find reads that fully cover, consider it the end of
            # the consensus, only the final block is missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment of the
            # reads trimmed to the block
            seqrecs = make_seqrecs(seqs)
            cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=True)

            # Join to the rest of the consensus; consecutive blocks overlap
            # because the start advances by two thirds of a block only
            consensus = join_block_to_consensus(consensus, cons_block,
                                                VERBOSE=VERBOSE, deltamax=deltamax)

            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        else:
            return consensus

        if VERBOSE >= 2:
            print('final block')

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile, start_block, VERBOSE=VERBOSE)

        # No read reaches this block at all: nothing left to append
        if not seqs:
            return consensus

        # Too many reads: keep the longest ones
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

        # Complete with N, approximately. Pad to the longest read: seqs[0] is
        # the longest only when the list has been sorted above.
        sl = max(len(s) for s in seqs)
        seqs = [s + ('N' * (sl - len(s))) for s in seqs]
        seqrecs = make_seqrecs(seqs)
        cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)
        consensus = join_block_to_consensus(consensus, cons_block,
                                            VERBOSE=VERBOSE, deltamax=deltamax)

    return consensus