def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0, rescue=False,
                   minor_allele=False):
    '''Check division into fragments: coverage, etc.'''
    ref_fn = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(ref_fn, 'fasta')

    input_filename = get_mapped_filename(data_folder, adaID, fragment,
                                         type='bam', rescue=rescue)

    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename, len(refseq), maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    title = ', '.join([' '.join([label, str(value)])
                       for (label, value) in [('run', seq_run),
                                              ('adaID', adaID),
                                              ('fragment', fragment),
                                              ('maxreads', maxreads)]])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def get_distance_histogram(data_folder, adaID, fragment, maxreads=1000,
                           VERBOSE=0, filtered=False):
    '''Get the distance of reads from their consensus'''
    reffilename = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                      type='bam', filtered=filtered)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        n_pairs = 0
        read_pairs = []
        for (i, rp) in enumerate(pair_generator(bamfile)):
            if n_pairs >= maxreads:
                break

            r1 = rp[0]
            if not r1.is_proper_pair:
                continue

            read_pairs.append(rp)
            n_pairs += 1

        ds = get_distance_from_reference(ref, read_pairs, threshold=30)

    h = np.bincount(ds)
    return h
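# A minimal usage sketch for the function above. The data_folder and adaID
# values are hypothetical placeholders, and matplotlib is assumed available;
# h[d] counts read pairs at total distance d from the consensus.
def _example_distance_histogram():
    '''Sketch: plot the distance histogram of the first 500 read pairs'''
    import matplotlib.pyplot as plt

    h = get_distance_histogram('/path/to/run/', 'TS2', 'F1', maxreads=500)

    plt.bar(np.arange(len(h)), h)
    plt.xlabel('Distance from consensus [mismatches]')
    plt.ylabel('Number of read pairs')
    plt.show()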
def get_allele_counts(data_folder, adaID, fragment, qual_min=30,
                      VERBOSE=0, maxreads=1e10):
    '''Extract allele and insert counts from a bamfile'''
    # NOTE: qual_min was referenced but undefined here; it is exposed as a
    # keyword argument (the default of 30 is an assumption)

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Call lower-level function
    return get_allele_counts_insertions_from_file(bamfilename, len(refseq),
                                                  qual_min=qual_min,
                                                  maxreads=maxreads,
                                                  VERBOSE=VERBOSE)
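# A hypothetical call of the wrapper above (paths/adaID are placeholders;
# assumes the lower-level function returns a (counts, inserts) pair, as its
# unfiltered counterpart does in check_coverage, with counts indexed by
# read type, allele, and position).
def _example_allele_counts():
    '''Sketch: per-position coverage from the allele counts'''
    counts, inserts = get_allele_counts('/path/to/run/', 'TS2', 'F1',
                                        maxreads=10000)
    coverage = counts.sum(axis=0).sum(axis=0)  # sum over read types and alleles
    print 'mean coverage:', coverage.mean()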
def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''
    if maxreads <= 0:
        maxreads = int(1e6)

    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          type='bam', filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs (the insert size is a property of the pair)
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print i + 1

            # Discard unmapped or improperly paired reads
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue

            # Store insert size, taken from the forward read (positive isize)
            # Note: store at n_written, not i, so skipped pairs leave no zeros
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
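# A minimal usage sketch for the function above (paths/adaID are hypothetical
# placeholders): explicit 10 bp bins up to 700 bp, raw counts rather than a
# density.
def _example_insert_sizes():
    '''Sketch: histogram of insert sizes with explicit 10 bp bins'''
    bins = np.arange(0, 700, 10)
    isz, (counts, edges) = get_insert_size_distribution(
        '/path/to/run/', 'TS2', 'F1', bins=bins, maxreads=10000, density=False)
    print 'median insert size:', isz[len(isz) // 2]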
def get_coverage_tuples(data_folder, adaID, fragment, mtuples,
                        maxreads=-1, VERBOSE=0):
    '''Get the joint coverage of a list of positions'''
    # Prepare data structures
    mtuples = [np.asarray(tup, int) for tup in mtuples]
    coverage = np.zeros(len(mtuples), int)

    # TODO: what to do if it is covered multiple times? or only some sites?
    covs_pair = [np.zeros(len(tup), bool) for tup in mtuples]

    # Open BAM
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over all pairs
        for irp, reads in enumerate(pair_generator(bamfile)):
            # Limit to the first reads
            if irp == maxreads:
                if VERBOSE:
                    print 'Max reads reached:', maxreads
                break

            if VERBOSE >= 3:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Reinitialize temporary structure
            for cov_pair in covs_pair:
                cov_pair[:] = False

            # Look in both reads
            for read in reads:
                # NOTE: deletions count as covered, because in principle
                # we see that part of the reference
                cigar = read.cigar
                ref_start = read.pos
                ref_end = ref_start + sum(bl for (bt, bl) in cigar
                                          if bt in (0, 2))

                # NOTE: a numba-accelerated version (add_read) was tried and
                # disabled; the plain numpy masking below is fast enough
                for cov_pair, mtuple in izip(covs_pair, mtuples):
                    cov_pair[(mtuple >= ref_start) & (mtuple < ref_end)] = True

            # Check which tuples are fully covered
            for i, cov_pair in enumerate(covs_pair):
                if cov_pair.all():
                    coverage[i] += 1

    return coverage
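# A minimal usage sketch for the function above (paths/adaID are hypothetical
# placeholders). Each tuple lists reference positions that must ALL fall
# within the same read pair for that pair to count towards joint coverage.
def _example_coverage_tuples():
    '''Sketch: joint coverage of two position triplets'''
    mtuples = [(100, 200, 300), (150, 450, 460)]
    cov = get_coverage_tuples('/path/to/run/', 'TS2', 'F1', mtuples,
                              maxreads=10000)
    for tup, n in zip(mtuples, cov):
        print tup, 'covered jointly by', n, 'read pairs'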
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make the output folders if necessary for hash and map'''
    hash_foldername = os.path.dirname(get_hash_file(data_folder, adaID, 'F0'))
    map_foldername = os.path.dirname(get_mapped_filename(data_folder, adaID, 'F0'))
    foldernames = [hash_foldername, map_foldername]

    # Make the folders
    for dirname in foldernames:
        if not os.path.isdir(dirname):
            os.mkdir(dirname)
            if VERBOSE:
                print 'Folder created:', dirname
def get_input_filename(data_folder, adaID, frag_spec, type='bam',
                       only_chunk=None, filtered=True):
    '''Get filename of input for mapping to initial reference'''
    # We should take reads filtered after mapping to the auto-consensus
    if filtered:
        from hivwholeseq.sequencing.filenames import get_mapped_filename
        frag_gen = frag_spec[:2]
        fn = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                 filtered=True)
    else:
        from hivwholeseq.sequencing.filenames import get_divided_filename
        fn = get_divided_filename(data_folder, adaID, frag_spec, type='bam',
                                  chunk=only_chunk)
    return fn
def remove_mapped_tempfiles(data_folder, adaID, fragment='F', VERBOSE=0,
                            rescue=False):
    '''Remove the part files of multi-threaded mapping'''
    from hivwholeseq.sequencing.filenames import get_mapped_filename

    dirname = os.path.dirname(get_mapped_filename(data_folder, adaID, 'F1')) + '/'
    fns = glob.glob(dirname + fragment + '*_part*') + \
          glob.glob(dirname + fragment + '*_unsorted*') + \
          glob.glob(dirname + fragment + '*.00*.bam')
    fns.append(dirname + fragment + '.sam')
    if rescue:
        fns.append(dirname + fragment + '_rescue.sam')

    for fn in fns:
        os.remove(fn)
        if VERBOSE >= 3:
            print 'File removed:', fn
def get_read_lengths(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1):
    '''Get the read lengths'''
    # Lengths from 1 to 250
    lengths = np.zeros((len(read_types), 250), int)

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over single reads (no linkage info needed)
        for i, read in enumerate(bamfile):
            # Max number of reads
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print i + 1

            # Divide by read 1/2 and forward/reverse
            js = 2 * read.is_read2 + read.is_reverse

            # Increment counter
            lengths[js, read.rlen - 1] += 1

    # Note: we do not delve into CIGARs because the reads are trimmed
    return lengths
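# A sketch of the read-type indexing used above, plus a hypothetical call
# (paths/adaID are placeholders; read_types is assumed to be the usual
# 4-element list). js = 2 * is_read2 + is_reverse maps to:
# 0 = read1 fwd, 1 = read1 rev, 2 = read2 fwd, 3 = read2 rev.
def _example_read_lengths():
    '''Sketch: total reads and modal length per read type'''
    lengths = get_read_lengths('/path/to/run/', 'TS2', 'F1', maxreads=100000)
    for js, row in enumerate(lengths):
        # Index 0 of the row corresponds to length 1
        print 'read type', js, 'n =', row.sum(), \
              'modal length =', row.argmax() + 1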
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID,
                                                    frag_gen, rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment,
                                              type='bam')
        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped',
                                              type='bam')

    # Check existence of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(adaID+', fragment '+fragment+': input file not found.')

    # Parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
            return

    else:
        # Submit map scripts to the cluster
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        for j in xrange(threads):
            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1),
                                                  rescue=rescue)
            # Map
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1),
                                                 rescue=rescue)
                             for j in xrange(threads)]
        time_wait = 10  # secs
        while not jobs_done.all():
            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                              adaID+', fragment '+frag_gen+', part '+str(j+1)+\
                              ' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='bam', unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam', unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without going BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE,
                            rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
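# A hypothetical invocation of the mapper above (paths/adaID are placeholders
# and the divided BAM for the fragment is assumed to exist). A dry run only
# assembles and prints the stampy command without executing it.
def _example_map_stampy():
    '''Sketch: single-threaded dry run of the stampy mapping'''
    map_stampy('/path/to/run/', 'TS2', 'F1', VERBOSE=2, threads=1,
               summary=False, dry=True)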
    # Script excerpt: inside the per-sample loop, collect distance histograms
    # for the requested fragments of each sample
            if VERBOSE >= 1:
                print 'PCR type not found, skipping'
            continue

        if not fragments:
            fragments_sample = sample.regions_generic
        else:
            fragments_sample = [fr for fr in fragments
                                if fr in sample.regions_generic]

        if VERBOSE >= 3:
            print 'adaID '+adaID+': fragments '+' '.join(fragments_sample)

        for fragment in fragments_sample:
            if VERBOSE >= 1:
                print fragment,

            bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                              type='bam', filtered=use_filtered)
            if not os.path.isfile(bamfilename):
                if VERBOSE >= 1:
                    print 'missing mapped file, skipping'
                continue

            dist_hist = get_distance_histogram(data_folder, adaID, fragment,
                                               VERBOSE=VERBOSE,
                                               maxreads=maxreads,
                                               filtered=use_filtered)
            label = [seq_run, adaID, samplename, fragment, dist_hist.sum()]
            hists.append((dist_hist, label))
            if VERBOSE >= 1:
                print ''

    if len(hists) == 1:
    # Script excerpt: inside the per-fragment loop, fish reads that are far
    # from the consensus and inspect them
        if not found:
            if VERBOSE >= 1:
                print 'not filtered (probably no HIV reads)'
            continue

        frac_dist = 1.0 * n_distant / n_good
        if frac_dist < 0.01:
            if VERBOSE >= 1:
                print '< 1% of reads are distant'
        else:
            if VERBOSE >= 1:
                print '{:3.0%}'.format(frac_dist), 'of reads are distant'

        consrec = sample.get_consensus(fragment)
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          filtered=False)
        (ds, edges, seqs) = fish_distant_reads(bamfilename, consrec,
                                               VERBOSE=VERBOSE,
                                               min_mismatches=min_mismatches,
                                               max_mismatches=max_mismatches,
                                               maxseqs=maxseqs)

        # Shuffle the distant pairs to look at an unbiased sample
        indrandom = np.arange(len(ds))
        np.random.shuffle(indrandom)
        ds = ds[indrandom]
        edges = np.array(edges)[indrandom]
        seqs = [seqs[i] for i in indrandom]

        for irp, (dpair, edgepair, seqpair) in enumerate(izip(ds, edges, seqs)):
            # NOTE: Take only the most distant read of a pair
            print irp, dpair
def get_coallele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    '''Extract counts of allele pairs (cocounts) from a bamfile'''
    # NOTE: maxreads was referenced but undefined here; it is exposed as a
    # keyword argument (the default is an assumption)

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Allele counts and inserts (TODO: compress this data?)
    # Note: the pair is of 2 types only, while the single reads usually are of 4
    counts = np.zeros((len(read_pair_types),
                       len(alpha), len(alpha),
                       len(refseq), len(refseq)), int)
    # Buffers for positions/alleles of one read pair (reads are <= 250 bp each)
    positions = np.zeros(501, int)
    ais = np.zeros_like(positions)
    # TODO: no inserts for now

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs
        for i, reads in enumerate(pair_generator(bamfile)):
            # Limit to some reads for testing
            if i > maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10)):
                print i + 1

            # Divide by pair orientation (2 read pair types)
            js = reads[0].is_reverse
            count = counts[js]

            # List of mutations
            positions[:] = -1
            ais[:] = -1
            imut = 0

            # Collect from the pair of reads
            for read in reads:
                # Sequence and position
                # Note: stampy takes the reverse complement already
                seq = read.seq
                pos = read.pos

                # Iterate over CIGARs
                len_cig = len(read.cigar)
                for ic, (block_type, block_len) in enumerate(read.cigar):
                    # Check for pos: it should never exceed the length of the fragment
                    if (block_type in [0, 1, 2]) and (pos > len(refseq)):
                        raise ValueError('Pos exceeded the length of the fragment')

                    # Inline block
                    if block_type == 0:
                        # Get the alleles of this block and add them
                        # (only the first block_len bases belong to this block)
                        indb = map(alphal.index, seq[:block_len])
                        positions[imut: imut + len(indb)] = pos + np.arange(len(indb))
                        ais[imut: imut + len(indb)] = indb
                        imut += len(indb)

                        # Chop off this block
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                            pos += block_len

                    # Deletion
                    elif block_type == 2:
                        # Chop off pos, but not sequence
                        pos += block_len

                    # Insertion
                    # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                    # THEN the insert, FINALLY comes seq[391:]
                    elif block_type == 1:
                        # Chop off seq, but not pos
                        if ic != len_cig - 1:
                            seq = seq[block_len:]

                    # Other types of cigar?
                    else:
                        raise ValueError('CIGAR type '+str(block_type)+' not recognized')

            if VERBOSE >= 4:
                for pos, ai in izip(positions, ais):
                    if pos == -1:
                        break
                    print pos, ai

            # Put the mutations into the matrix
            for ai1 in xrange(len(alpha)):
                for ai2 in xrange(len(alpha)):
                    coun = count[ai1, ai2]
                    pos1 = positions[ais == ai1]
                    if ai1 == ai2:
                        pos2 = pos1
                    else:
                        pos2 = positions[ais == ai2]
                    coords = np.meshgrid(pos1, pos2)
                    ind = coords[0].ravel() * coun.shape[0] + coords[1].ravel()
                    coun.ravel()[ind] += 1

    return counts
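# A sketch of how to read the cocounts tensor returned above: axis 0 is the
# read-pair type (fwd/rev), axes 1-2 the two alleles, axes 3-4 the two
# reference positions. Paths/adaID are hypothetical placeholders.
def _example_coallele_counts():
    '''Sketch: joint counts of A at position 100 with G at position 150'''
    counts = get_coallele_counts('/path/to/run/', 'TS2', 'F1', maxreads=1000)
    ia, ig = alphal.index('A'), alphal.index('G')
    n_AG = counts[:, ia, ig, 100, 150].sum()  # sum over pair orientations
    print 'pairs with A@100 and G@150:', n_AG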
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600, max_mismatches=30,
                 susp_mismatches=20, summary=True, plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                          type='sam', filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: '+adaID+', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID,
                                                        frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):
                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting
                # on different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip
                # reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if dc.sum() > max_mismatches:
                    if VERBOSE >= 2:
                        print n_mutator+1, irp,\
                              '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                              'Read pair '+read1.qname+': too many mismatches '+\
                              '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage, we can only
                # give clues about cross-contamination; the rest is done in a
                # script downstream (here we could TAG suspicious reads for
                # contamination)
                elif dc.sum() > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID,
                                                              fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)
        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
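# A hypothetical invocation of the filter above (paths/adaID are placeholders
# and the unfiltered mapped BAM is assumed to exist). Good pairs go to the
# filtered BAM, suspect pairs to the suspicious BAM, the rest to the trash BAM.
def _example_filter_reads():
    '''Sketch: filter the first 1000 read pairs of one fragment'''
    filter_reads('/path/to/run/', 'TS2', 'F5a', VERBOSE=1, maxreads=1000,
                 summary=False, plot=False)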