def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600, max_mismatches=30,
                 susp_mismatches=20, summary=True, plot=False):
    '''Filter the mapped read pairs into good, suspicious and trashed ones.

    The unfiltered BAM file of the sample/fragment is read pair by pair and
    every pair is written to exactly one of three BAM files: the filtered
    output, the "suspicious" file or the "_trashed" file.

    Parameters:
       data_folder, adaID, fragment: forwarded to the filename helpers; only
          the first two characters of fragment are used for the consensus and
          the mapped files, the full name for the summary file.
       VERBOSE: verbosity level (>= 1 prints totals, >= 2 prints per-pair
          messages).
       maxreads: stop after this many read pairs; the default -1 never
          matches a pair index, so all pairs are processed.
       contaminants: forwarded to check_suspect; if None, every pair above
          susp_mismatches is treated as suspicious.
       n_cycles: the mismatch histograms have n_cycles + 1 bins.
       max_mismatches: pairs with more mismatches than this are trashed.
       susp_mismatches: pairs with more mismatches than this (but not more
          than max_mismatches) are checked for contamination.
       summary: if True, append the counts to the filter summary file.
       plot: if True, plot the mismatch histograms.

    Returns:
       None. If neither the BAM nor the SAM input file exists, the function
       returns early without writing anything.

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                      type='bam', filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                          type='sam', filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print('ERROR: ' + adaID + ', mapped file not found.')
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                      type='bam', filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID,
                                                        frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    # Counters (in units of read pairs)
    n_pairs = 0
    n_good = 0
    n_unmapped = 0
    n_unpaired = 0
    n_mutator = 0
    n_suspect = 0
    n_mismapped_edge = 0
    n_badcigar = 0

    # Histograms of the number of mismatches per pair, overall and binned
    # by position along the reference
    histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
    binsize = 200
    histogram_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                    int)

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Count the pairs explicitly: the loop index is unbound if
                # the file is empty and is one too large after the break
                n_pairs += 1

                (read1, read2) = reads
                # is_reverse is used as an index: 0 if the first read is the
                # forward one, 1 otherwise
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname + ': unmapped')
                    n_unmapped += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Ignore not properly paired reads (this includes mates
                # sitting on different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname +
                              ': not properly paired')
                    n_unpaired += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip
                # reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                n_mismatches = dc.sum()
                histogram_distance_from_consensus[n_mismatches] += 1
                # Explicit floor division, same result as the integer '/'
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize // 2) // binsize
                histogram_dist_along[hbin, n_mismatches] += 1

                if n_mismatches > max_mismatches:
                    if VERBOSE >= 2:
                        percentage = '{:2.1f}'.format(
                            100.0 * (n_mutator + 1) / (irp + 1)) + '%'
                        message = ('Read pair ' + read1.qname +
                                   ': too many mismatches ' +
                                   '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')')
                        print(' '.join([str(n_mutator + 1), str(irp),
                                        percentage, message]))
                    n_mutator += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage, we can only
                # give clues about cross-contamination, the rest will be done
                # in a script downstream (here we could TAG suspicious reads
                # for contamination)
                elif n_mismatches > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants,
                                             VERBOSE=VERBOSE)
                    else:
                        skip = True

                    if skip:
                        n_suspect += 1
                        for read in reads:
                            suspfile.write(read)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good
                # ones. NOTE(review): match_len_min and trim_bad_cigars are
                # not defined in this function, presumably module-level
                # settings - confirm
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                for read in reads:
                    outfile.write(read)

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(n_good))
        print('Unmapped: ' + str(n_unmapped))
        print('Unpaired: ' + str(n_unpaired))
        print('Mispapped at edge: ' + str(n_mismapped_edge))
        print('Many-mutations: ' + str(n_mutator))
        print('Suspect contaminations: ' + str(n_suspect))
        print('Bad CIGARs: ' + str(n_badcigar))

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder,
                                                              adaID, fragment)
        # The summary is appended, not overwritten
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_pairs) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
alis = {fr: AlignIO.read(get_consensi_alignment_filename('all', fr), 'fasta') for fr in fragments} for samplename, sample in samples.iterrows(): sample = SampleSeq(sample) data_folder = sample.seqrun_folder adaID = sample.adapter pname = sample.patientname for fragment in fragments: if VERBOSE >= 1: print sample['seq run'], adaID, fragment, samplename, # Read the summary filename of the filter_mapped, and find out whether # there are many distant reads (a few are normal) fn = get_filter_mapped_summary_filename(data_folder, adaID, fragment) if os.path.isfile(fn): found = False with open(fn, 'r') as f: for line in f: line = line.rstrip('\n') if line[:4] == 'Good': n_good = int(line.split()[-1]) elif line[:14] == 'Many-mutations': n_distant = int(line.split()[-1]) found = True break if not found: if VERBOSE >= 1:
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600, max_mismatches=30,
                 susp_mismatches=20, summary=True, plot=False):
    '''Filter the mapped read pairs into good, suspicious and trashed ones.

    The unfiltered BAM file of the sample/fragment is read pair by pair and
    every pair is written to exactly one of three BAM files: the filtered
    output, the "suspicious" file or the "_trashed" file.

    Parameters:
       data_folder, adaID, fragment: forwarded to the filename helpers; only
          the first two characters of fragment are used for the consensus and
          the mapped files, the full name for the summary file.
       VERBOSE: verbosity level (>= 1 prints totals, >= 2 prints per-pair
          messages).
       maxreads: stop after this many read pairs; the default -1 never
          matches a pair index, so all pairs are processed.
       contaminants: forwarded to check_suspect; if None, every pair above
          susp_mismatches is treated as suspicious.
       n_cycles: the mismatch histograms have n_cycles + 1 bins.
       max_mismatches: pairs with more mismatches than this are trashed.
       susp_mismatches: pairs with more mismatches than this (but not more
          than max_mismatches) are checked for contamination.
       summary: if True, append the counts to the filter summary file.
       plot: if True, plot the mismatch histograms.

    Returns:
       None. If neither the BAM nor the SAM input file exists, the function
       returns early without writing anything.

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                      type='bam', filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                          type='sam', filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print('ERROR: ' + adaID + ', mapped file not found.')
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                      type='bam', filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID,
                                                        frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    # Counters (in units of read pairs)
    n_pairs = 0
    n_good = 0
    n_unmapped = 0
    n_unpaired = 0
    n_mutator = 0
    n_suspect = 0
    n_mismapped_edge = 0
    n_badcigar = 0

    # Histograms of the number of mismatches per pair, overall and binned
    # by position along the reference
    histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
    binsize = 200
    histogram_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                    int)

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Count the pairs explicitly: the loop index is unbound if
                # the file is empty and is one too large after the break
                n_pairs += 1

                (read1, read2) = reads
                # is_reverse is used as an index: 0 if the first read is the
                # forward one, 1 otherwise
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname + ': unmapped')
                    n_unmapped += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Ignore not properly paired reads (this includes mates
                # sitting on different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print('Read pair ' + read1.qname +
                              ': not properly paired')
                    n_unpaired += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip
                # reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                n_mismatches = dc.sum()
                histogram_distance_from_consensus[n_mismatches] += 1
                # Explicit floor division, same result as the integer '/'
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize // 2) // binsize
                histogram_dist_along[hbin, n_mismatches] += 1

                if n_mismatches > max_mismatches:
                    if VERBOSE >= 2:
                        percentage = '{:2.1f}'.format(
                            100.0 * (n_mutator + 1) / (irp + 1)) + '%'
                        message = ('Read pair ' + read1.qname +
                                   ': too many mismatches ' +
                                   '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')')
                        print(' '.join([str(n_mutator + 1), str(irp),
                                        percentage, message]))
                    n_mutator += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage, we can only
                # give clues about cross-contamination, the rest will be done
                # in a script downstream (here we could TAG suspicious reads
                # for contamination)
                elif n_mismatches > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants,
                                             VERBOSE=VERBOSE)
                    else:
                        skip = True

                    if skip:
                        n_suspect += 1
                        for read in reads:
                            suspfile.write(read)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good
                # ones. NOTE(review): match_len_min and trim_bad_cigars are
                # not defined in this function, presumably module-level
                # settings - confirm
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    for read in reads:
                        trashfile.write(read)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                for read in reads:
                    outfile.write(read)

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(n_good))
        print('Unmapped: ' + str(n_unmapped))
        print('Unpaired: ' + str(n_unpaired))
        print('Mispapped at edge: ' + str(n_mismapped_edge))
        print('Many-mutations: ' + str(n_mutator))
        print('Suspect contaminations: ' + str(n_suspect))
        print('Bad CIGARs: ' + str(n_badcigar))

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder,
                                                              adaID, fragment)
        # The summary is appended, not overwritten
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_pairs) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)

        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)