def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0): """Find the overlap coordinates for the two fragments""" from hivwholeseq.utils.mapping import align_muscle seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), "fasta") seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), "fasta") sm1 = np.array(seq1) sm2 = np.array(seq2) # Find the beginning of s2 in s1 seed_len = 20 matches_min = 16 seed = sm2[:seed_len] found = False trials = 0 while (not found) and (trials < 3): for pos in xrange(len(seq1) - 700, len(seq1) - seed_len): if (sm1[pos : pos + seed_len] == seed).sum() >= matches_min - trials: found = True start_s2 = pos break if not found: trials += 1 if not found: return None if VERBOSE >= 3: print "Beginning of " + frag2 + " found in " + frag1 # In an ideal world, the overlap is a holy place in which no indels happen. # We cannot assume that, sadly. However, we can search from the other side # and align: find the end of s1 in s2 found = False seed = sm1[-seed_len:] trials = 0 while (not found) and (trials < 3): for pos in xrange(700): if (sm2[pos : pos + seed_len] == seed).sum() >= matches_min - trials: found = True end_s1 = pos + seed_len break if not found: trials += 1 if not found: return None if VERBOSE >= 3: print "End of " + frag1 + " found in " + frag2 # Align ali = align_muscle(seq1[start_s2:], seq2[:end_s1]) return (start_s2, end_s1, ali)
def get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=0): '''Find the overlap coordinates for the two fragments''' from hivwholeseq.utils.mapping import align_muscle seq1 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag1), 'fasta') seq2 = SeqIO.read(get_consensus_filename(data_folder, adaID, frag2), 'fasta') sm1 = np.array(seq1) sm2 = np.array(seq2) # Find the beginning of s2 in s1 seed_len = 20 matches_min = 16 seed = sm2[:seed_len] found = False trials = 0 while (not found) and (trials < 3): for pos in xrange(len(seq1) - 700, len(seq1) - seed_len): if (sm1[pos: pos + seed_len] == seed).sum() >= matches_min - trials: found = True start_s2 = pos break if not found: trials += 1 if not found: return None if VERBOSE >= 3: print 'Beginning of '+frag2+' found in '+frag1 # In an ideal world, the overlap is a holy place in which no indels happen. # We cannot assume that, sadly. However, we can search from the other side # and align: find the end of s1 in s2 found = False seed = sm1[-seed_len:] trials = 0 while (not found) and (trials < 3): for pos in xrange(700): if (sm2[pos: pos + seed_len] == seed).sum() >= matches_min - trials: found = True end_s1 = pos + seed_len break if not found: trials += 1 if not found: return None if VERBOSE >= 3: print 'End of '+frag1+' found in '+frag2 # Align ali = align_muscle(seq1[start_s2:], seq2[:end_s1]) return (start_s2, end_s1, ali)
def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    '''Count alleles and insertions in the filtered reads of one fragment.

    Parameters:
       data_folder, adaID, fragment: identify the consensus and the BAM file
       VERBOSE: verbosity level, forwarded to the counting function
       maxreads: maximal number of reads, forwarded to the counting function

    Returns:
       whatever get_allele_counts_insertions_from_file returns for the
       filtered BAM file of this fragment.
    '''
    # Only the length of the primer-trimmed consensus is needed
    ref_fn = get_consensus_filename(data_folder, adaID, fragment,
                                    trim_primers=True)
    ref_len = len(SeqIO.read(ref_fn, 'fasta'))

    # The filtered BAM file is produced from the SAM file if it is missing
    bam_fn = get_mapped_filename(data_folder, adaID, fragment,
                                 type='bam', filtered=True)
    if not os.path.isfile(bam_fn):
        convert_sam_to_bam(bam_fn)

    # NOTE(review): qual_min is not a parameter of this function; presumably
    # it is a module-level name -- confirm, otherwise this is a NameError.
    return get_allele_counts_insertions_from_file(bam_fn, ref_len,
                                                  qual_min=qual_min,
                                                  maxreads=maxreads,
                                                  VERBOSE=VERBOSE)
def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0,
                   rescue=False, minor_allele=False):
    '''Plot the coverage of the reads mapped onto one fragment.

    Allele counts are taken from the (unfiltered) mapped BAM file and passed
    to plot_coverage with a title listing run, adaID, fragment and maxreads.
    qual_min and reference are accepted but not used in this function.
    '''
    ref_len = len(SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                             'fasta'))
    bam_fn = get_mapped_filename(data_folder, adaID, fragment,
                                 type='bam', rescue=rescue)

    # The insertions returned alongside the counts are not needed for the plot
    counts, _ = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn, ref_len, maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    labels = (('run', seq_run),
              ('adaID', adaID),
              ('fragment', fragment),
              ('maxreads', maxreads))
    title = ', '.join(key + ' ' + str(value) for (key, value) in labels)
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=0): '''Complement consensus from PCR2 with wings from later PCR1 sample''' from hivwholeseq.utils.sequence import find_seed_imperfect, rfind_seed_imperfect found = False for _, sampletmp in patient.samples.iloc[samplen + 1:].iterrows(): for _, sampleseqtmp in sampletmp['samples seq'].iterrows(): sampleseqtmp = SampleSeq(sampleseqtmp) if int(sampleseqtmp.PCR) == 1: sampleseq_later = sampleseqtmp found = True break if found: break adaID_later = sampleseq_later['adapter'] data_folder_later = sampleseq_later.sequencing_run.folder cons_rec_later = SeqIO.read(get_consensus_filename(data_folder_later, adaID_later, fragment), 'fasta') conss_later = str(cons_rec_later.seq) start = find_seed_imperfect(cons_rec_later, cons_rec[:20]) end = rfind_seed_imperfect(cons_rec_later, cons_rec[-20:]) + 20 if VERBOSE >= 1: print 'Complementing PCR2 consensus with later PCR1:', print sampleseq_later.name, sampleseq_later['seq run'], sampleseq_later.adapter frag_spec = sampleseq_later.regions_complete[sampleseq_later.regions_generic.index(fragment)] return (frag_spec, conss_later[:start]+cons_rec+conss_later[end:])
def get_distance_histogram(data_folder, adaID, fragment, maxreads=1000,
                           VERBOSE=0, filtered=False):
    '''Histogram of the distance of read pairs from their consensus.

    Up to maxreads properly paired read pairs are collected from the mapped
    BAM file (filtered or not) and their distances, as computed by
    get_distance_from_reference with threshold=30, are binned with
    np.bincount. VERBOSE is accepted but not used here.
    '''
    ref = np.array(SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                              'fasta'))
    bam_fn = get_mapped_filename(data_folder, adaID, fragment,
                                 type='bam', filtered=filtered)

    proper_pairs = []
    with pysam.Samfile(bam_fn, 'rb') as bamfile:
        for pair in pair_generator(bamfile):
            if len(proper_pairs) >= maxreads:
                break
            # Only the first read of the pair is checked for proper pairing
            if pair[0].is_proper_pair:
                proper_pairs.append(pair)

        distances = get_distance_from_reference(ref, proper_pairs, threshold=30)

    return np.bincount(distances)
def make_index_and_hash(data_folder, adaID, fragment, VERBOSE=0, summary=True): '''Make index and hash files for consensus''' frag_gen = fragment[:2] # NOTE: we can use --overwrite here, because there is no concurrency (every # job has its own hash) # 1. Make genome index file sp.call([stampy_bin, '--species="HIV fragment '+frag_gen+'"', '--overwrite', '-G', get_index_file(data_folder, adaID, frag_gen, ext=False), get_consensus_filename(data_folder, adaID, frag_gen, trim_primers=True), ]) if VERBOSE: print 'Built index: '+adaID+' '+frag_gen # 2. Build a hash file sp.call([stampy_bin, '--overwrite', '-g', get_index_file(data_folder, adaID, frag_gen, ext=False), '-H', get_hash_file(data_folder, adaID, frag_gen, ext=False), ]) if VERBOSE: print 'Built hash: '+adaID+' '+frag_gen if summary: with open(get_map_summary_filename(data_folder, adaID, frag_gen), 'a') as f: f.write('\n') f.write('Stampy index and hash written.') f.write('\n')
def complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=0): '''Complement consensus from PCR2 with wings from later PCR1 sample''' from hivwholeseq.utils.sequence import find_seed_imperfect, rfind_seed_imperfect found = False for _, sampletmp in patient.samples.iloc[samplen + 1:].iterrows(): for _, sampleseqtmp in sampletmp['samples seq'].iterrows(): sampleseqtmp = SampleSeq(sampleseqtmp) if int(sampleseqtmp.PCR) == 1: sampleseq_later = sampleseqtmp found = True break if found: break adaID_later = sampleseq_later['adapter'] data_folder_later = sampleseq_later.sequencing_run.folder cons_rec_later = SeqIO.read( get_consensus_filename(data_folder_later, adaID_later, fragment), 'fasta') conss_later = str(cons_rec_later.seq) start = find_seed_imperfect(cons_rec_later, cons_rec[:20]) end = rfind_seed_imperfect(cons_rec_later, cons_rec[-20:]) + 20 if VERBOSE >= 1: print 'Complementing PCR2 consensus with later PCR1:', print sampleseq_later.name, sampleseq_later[ 'seq run'], sampleseq_later.adapter frag_spec = sampleseq_later.regions_complete[ sampleseq_later.regions_generic.index(fragment)] return (frag_spec, conss_later[:start] + cons_rec + conss_later[end:])
def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Write the final consensus (fragments are now called F5 instead of F5ai).

    The consensus string is saved as FASTA under the two-character fragment
    name; afterwards the sequences in the "reference all" file of the
    fragment are aligned with muscle and that file is overwritten with the
    alignment.
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    samplename = dataset['samples'][dataset['adapters'].index(adaID)]
    frag_out = fragment[:2]

    # Record name: <sample>_seqrun_<run>_adaID_<adaID>_<fragment>_consensus
    name = '_'.join([samplename, 'seqrun', seq_run,
                     'adaID', adaID, frag_out, 'consensus'])
    record = SeqRecord(Seq(consensus), id=name, name=name)
    SeqIO.write(record,
                get_consensus_filename(data_folder, adaID, frag_out,
                                       trim_primers=True),
                'fasta')

    # Align all consensi via muscle and store
    ref_all_fn = get_reference_all_filename(data_folder, adaID, fragment)
    ali = align_muscle(*SeqIO.parse(ref_all_fn, 'fasta'))
    AlignIO.write(ali, ref_all_fn, 'fasta')
def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    """Count alleles and insertions in the filtered reads of one fragment.

    Parameters:
       data_folder, adaID, fragment: identify the consensus and the BAM file
       VERBOSE: verbosity level, forwarded to the counting function
       maxreads: maximal number of reads, forwarded to the counting function

    Returns:
       whatever get_allele_counts_insertions_from_file returns for the
       filtered BAM file of this fragment.
    """
    # Only the length of the primer-trimmed consensus is needed
    ref_fn = get_consensus_filename(data_folder, adaID, fragment, trim_primers=True)
    ref_len = len(SeqIO.read(ref_fn, "fasta"))

    # The filtered BAM file is produced from the SAM file if it is missing
    bam_fn = get_mapped_filename(data_folder, adaID, fragment, type="bam", filtered=True)
    if not os.path.isfile(bam_fn):
        convert_sam_to_bam(bam_fn)

    # NOTE(review): qual_min is not a parameter of this function; presumably
    # it is a module-level name -- confirm, otherwise this is a NameError.
    return get_allele_counts_insertions_from_file(
        bam_fn, ref_len, qual_min=qual_min, maxreads=maxreads, VERBOSE=VERBOSE
    )
def merge_consensi(data_folder, adaID, fragments, VERBOSE=0):
    '''Join the consensi of consecutive fragments into longer sequences.

    Fragments are processed in sorted order. A fragment extends the current
    chunk when the pair (previous, current) is listed by
    get_overlapping_fragments: if get_overlap located the overlap, only the
    part after the overlap is appended, otherwise ten N followed by the whole
    fragment. Any other fragment starts a new chunk. A RuntimeWarning is
    issued when check_overlap_consensus reports a difference.

    Returns:
       list of (fragment names, SeqRecord) tuples, one per chunk.
    '''
    import warnings

    consensi = {frag: SeqIO.read(get_consensus_filename(data_folder, adaID, frag,
                                                        trim_primers=True),
                                 'fasta')
                for frag in fragments}

    # Overlap coordinates (or None) for every overlapping pair
    overlaps = {}
    for (frag1, frag2) in get_overlapping_fragments(fragments):
        overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)
        if check_overlap_consensus(data_folder, adaID, frag1, frag2, overlap,
                                   VERBOSE=VERBOSE):
            warnings.warn(frag1+' and '+frag2+' have different consensi.',
                          RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    # Each chunk is a mutable pair [fragment names, sequence string]
    chunks = []
    ordered = sorted(fragments)
    for i, frag in enumerate(ordered):
        key = (ordered[i - 1], frag)
        if (i != 0) and (key in overlaps):
            chunk = chunks[-1]
            chunk[0].append(frag)
            overlap = overlaps[key]
            if overlap is None:
                # Overlap not located: bridge the gap with a stretch of N
                chunk[1] += ('N' * 10)+str(consensi[frag].seq)
            else:
                # Copy from the end of the overlap on
                start = overlap[1]
                chunk[1] += str(consensi[frag][start:].seq)
        else:
            chunks.append([[frag], str(consensi[frag].seq)])

    # Make SeqRecords out of consensi
    merged = []
    for (frags, seq_str) in chunks:
        name = 'adaID_'+str(adaID)+'_'+'-'.join(frags)
        rec = SeqRecord(Seq(seq_str, IUPAC.ambiguous_dna), id=name, name=name)
        merged.append((frags, rec))
    return merged
def merge_allele_frequencies(data_folder, adaID, fragments, VERBOSE=0):
    '''Merge allele frequencies at overlapping pairs.

    Fragments are processed in sorted order. A fragment extends the current
    chunk when the pair (previous, current) is listed by
    get_overlapping_fragments: if get_overlap located the overlap, only the
    columns after the overlap are appended, otherwise ten padding columns
    followed by the whole fragment. Any other fragment starts a new chunk.
    A RuntimeWarning is issued when check_overlap_consensus reports a
    difference.

    Returns:
       list of [fragment names, frequency matrix] pairs, one per chunk; the
       matrices are concatenated along axis 1.
    '''
    import warnings
    import numpy as np

    # NOTE(review): consensi is not used below; the read is kept so that a
    # missing consensus file still fails here as it did before.
    consensi = {frag: SeqIO.read(get_consensus_filename(data_folder, adaID, frag,
                                                        trim_primers=True),
                                 'fasta')
                for frag in fragments}
    nus = {frag: np.load(get_allele_frequencies_filename(data_folder, adaID, frag))
           for frag in fragments}

    pairs = get_overlapping_fragments(fragments)
    overlaps = {}
    for (frag1, frag2) in pairs:
        overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)
        is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2,
                                          overlap, VERBOSE=VERBOSE)
        if is_diff:
            warnings.warn(frag1+' and '+frag2+' have different consensi.',
                          RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    nu = []
    fragments = sorted(fragments)
    for i, frag in enumerate(fragments):
        # If the start is not an overlap, start a new chunk and copy all
        if (i == 0) or (fragments[i-1], frag) not in overlaps:
            nuf = [[frag], nus[frag]]
            nu.append(nuf)

        # else, copy from the end of the overlap on
        # FIXME: we could average the consensus zone out of indels...
        else:
            nuf = nu[-1]
            nuf[0].append(frag)
            tmp = overlaps[(fragments[i-1], frag)]
            if tmp is not None:
                (_, start, _) = tmp
                nuf[1] = np.concatenate([nuf[1], nus[frag][:, start:]], axis=1)
            else:
                # Overlap not located: pad with ten columns whose last row is
                # 1 (presumably the N allele -- confirm against alpha) and
                # append the WHOLE fragment. Slicing with `start` here was a
                # bug: it is undefined, or stale from a previous pair.
                pad = np.zeros((nuf[1].shape[0], 10), float)
                pad[-1] = 1
                nuf[1] = np.concatenate([nuf[1], pad, nus[frag]], axis=1)

    return nu
def align_consensi_dataset(dataset, adaIDs, fragments, VERBOSE=0): '''Align consensi from different samples in a dataset''' data_folder = dataset['folder'] # Collect consensi if VERBOSE >= 1: print 'Collecting consensi...', consensi = defaultdict(dict) for adaID in adaIDs: samplename = dataset['samples'][dataset['adapters'].index(adaID)] fragments_sample = samples[samplename]['fragments'] for frag in fragments_sample: frag_gen = frag[:2] if frag_gen not in fragments: continue con_fn = get_consensus_filename(data_folder, adaID, frag_gen) if os.path.isfile(con_fn): con = SeqIO.read(con_fn, 'fasta') consensi[frag_gen][adaID] = con if 'genomewide' in fragments: frag_gens = [frag[:2] for frag in fragments_sample] con_gw_fn = get_merged_consensus_filename(data_folder, adaID, frag_gens) if os.path.isfile(con_gw_fn): con = SeqIO.read(con_gw_fn, 'fasta') consensi['genomewide'][adaID] = con if VERBOSE >= 1: print 'done.' print 'Aligning...', # Align alis = {} for (frag, con_dict) in consensi.iteritems(): if VERBOSE >= 2: print frag, ali_frag = align_muscle(*(con_dict.values())) alis[frag] = ali_frag if VERBOSE >= 1: print 'done.' return alis
def score_consensus(sample, VERBOSE=0):
    '''Score a consensus based on completeness and length.

    Returns:
       (ok, label) where label is one of
       ''         the fragment is not among the complete regions of the sample
       'MISS'     the consensus file does not exist
       'MISSREF'  the premap reference file does not exist
       'SHORT'    the consensus is more than 200 bases shorter than the reference
       'LONG'     the consensus is more than 200 bases longer than the reference
       'OK'       none of the above
       ok is True only for '' and 'OK'. VERBOSE is accepted but not used here.
    '''
    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # NOTE(review): `fragment` is not a parameter of this function; presumably
    # it comes from an enclosing/module scope -- confirm.
    frag_specs = filter(lambda spec: fragment in spec, sample.regions_complete)
    if len(frag_specs) == 0:
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    frag_spec = frag_specs[0]
    fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    if not os.path.isfile(fn_ref):
        # Only for names starting with F3a a second attempt is made, with
        # every 'a' removed from the name
        if frag_spec[:3] != 'F3a':
            return (False, 'MISSREF')
        frag_spec = frag_spec.replace('a', '')
        fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
        if not os.path.isfile(fn_ref):
            return (False, 'MISSREF')

    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')

    len_diff = len(cons) - len(ref)
    if len_diff < -200:
        return (False, 'SHORT')
    if len_diff > 200:
        return (False, 'LONG')

    # TODO: an alignment-based check of the differences between reference and
    # consensus was sketched here but never completed.
    return (True, 'OK')
def write_consensus_final(seq_run, adaID, fragment, consensus):
    '''Write the final consensus (fragments are now called F5 instead of F5ai).

    The consensus string is saved as FASTA under the two-character fragment
    name; afterwards the sequences in the "reference all" file of the
    fragment are aligned with muscle and that file is overwritten with the
    alignment.
    '''
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']
    samplename = dataset['samples'][dataset['adapters'].index(adaID)]
    frag_out = fragment[:2]

    # Record name: <sample>_seqrun_<run>_adaID_<adaID>_<fragment>_consensus
    name_parts = [samplename, 'seqrun', seq_run, 'adaID', adaID, frag_out,
                  'consensus']
    name = '_'.join(name_parts)
    record = SeqRecord(Seq(consensus), id=name, name=name)
    out_fn = get_consensus_filename(data_folder, adaID, frag_out,
                                    trim_primers=True)
    SeqIO.write(record, out_fn, 'fasta')

    # Align all consensi via muscle and store
    ref_all_fn = get_reference_all_filename(data_folder, adaID, fragment)
    ali = align_muscle(*SeqIO.parse(ref_all_fn, 'fasta'))
    AlignIO.write(ali, ref_all_fn, 'fasta')
def make_index_and_hash(data_folder, adaID, fragment, VERBOSE=0, summary=True): '''Make index and hash files for consensus''' frag_gen = fragment[:2] # NOTE: we can use --overwrite here, because there is no concurrency (every # job has its own hash) # 1. Make genome index file sp.call([ stampy_bin, '--species="HIV fragment ' + frag_gen + '"', '--overwrite', '-G', get_index_file(data_folder, adaID, frag_gen, ext=False), get_consensus_filename(data_folder, adaID, frag_gen, trim_primers=True), ]) if VERBOSE: print 'Built index: ' + adaID + ' ' + frag_gen # 2. Build a hash file sp.call([ stampy_bin, '--overwrite', '-g', get_index_file(data_folder, adaID, frag_gen, ext=False), '-H', get_hash_file(data_folder, adaID, frag_gen, ext=False), ]) if VERBOSE: print 'Built hash: ' + adaID + ' ' + frag_gen if summary: with open(get_map_summary_filename(data_folder, adaID, frag_gen), 'a') as f: f.write('\n') f.write('Stampy index and hash written.') f.write('\n')
def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0,
                   rescue=False, minor_allele=False):
    '''Plot the coverage of the reads mapped onto one fragment.

    Allele counts are taken from the (unfiltered) mapped BAM file and passed
    to plot_coverage with a title listing run, adaID, fragment and maxreads.
    qual_min and reference are accepted but not used in this function.
    '''
    refseq = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                        'fasta')
    bam_fn = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                 rescue=rescue)

    # The insertions returned alongside the counts are not needed for the plot
    counts, _ = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn, len(refseq), maxreads=maxreads, VERBOSE=VERBOSE)

    # Plot results
    title_items = []
    for (key, value) in [('run', seq_run),
                         ('adaID', adaID),
                         ('fragment', fragment),
                         ('maxreads', maxreads)]:
        title_items.append(key + ' ' + str(value))
    plot_coverage(counts, suptitle=', '.join(title_items),
                  minor_allele=minor_allele)
if VERBOSE >= 2: print ali[:, :30] print ali[:, -30:] print 'Lenghts: ref', len(refseq), 'consensus', len( consensusseq) len_ali = ali.get_alignment_length() n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali)) print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format( 100.0 * n_diff / len_ali) + '%)' # Ungap consensus consensusseq = SeqRecord(ali[1].seq, id=name, name=name) if '-' in consensusseq: consensusseq.seq = consensusseq.seq.ungap('-') # Write output outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True) SeqIO.write(consensusseq, outfile, 'fasta') AlignIO.write( ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta') if store_allele_counts: allele_counts.dump( get_allele_counts_filename(data_folder, adaID, frag_out))
# If the script is called with no adaID, iterate over all if not adaIDs: adaIDs = MiSeq_runs[seq_run]['adapters'] if VERBOSE >= 3: print 'adaIDs', adaIDs # If the script is called with no fragment, iterate over all if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments # Iterate over samples and fragments for adaID in adaIDs: for fragment in fragments: consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment), 'fasta') cmat = np.array(consensus) counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment)) coverage = np.load(get_coverage_filename(data_folder, adaID, fragment)) nu = filter_nus(counts, coverage, VERBOSE=VERBOSE) cmat_af = alpha[nu.argmax(axis=0)] if len(cmat) != len(cmat_af): raise ValueError('The two consensi have a different length!') pos_diff = (cmat != cmat_af).nonzero()[0] # If they are the same, do nothing (we do not want useless backup files) if len(pos_diff) == 0:
if not adaIDs: adaIDs = load_adapter_table(data_folder)['ID'] if VERBOSE >= 3: print 'adaIDs', adaIDs # Select fragment and primers fragment = 'F3' # Look for the F3 rev primer (already reversed) primer_old = 'GATTGTGTGGCAAGTAGACAGG' primer_new = 'TATGGAAAACAGATGGCAGGTG' # Iterate over all requested samples for adaID in adaIDs: # Read reference (fragmented) reffilename = get_consensus_filename(data_folder, adaID, fragment) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) # read file bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam', filtered=True) if not os.path.isfile(bamfilename): convert_sam_to_bam(bamfilename) bamfile = pysam.Samfile(bamfilename, 'rb') # Get the coverage for reads which have long insert sizes
else: sample = load_sample_sequenced(samplename) for fragment in fragments: sample_seq = SampleSeq(sample.samples_seq.iloc[repn]) seq_run = sample_seq['seq run'] adaID = sample_seq['adapter'] dataset = sample_seq.sequencing_run data_folder = dataset.folder if VERBOSE: print 'Initial sample:', sample_seq.name, sample_seq['seq run'], print sample_seq.adapter cons_rec = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment), 'fasta') frag_spec = sample_seq.regions_complete[\ sample_seq.regions_generic.index(fragment)] # Complement PCR2 initial reference with tails from a later sample if int(sample_seq.PCR) == 2: (frag_spec, cons_rec) = complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=VERBOSE) conss = str(cons_rec.seq) output_filename = get_initial_reference_filename(pname, fragment) seq_in = SeqRecord(Seq(conss, unambiguous_dna),
adaIDs = load_adapter_table(data_folder)['ID'] if VERBOSE >= 3: print 'adaIDs', adaIDs # If the script is called with no fragment, iterate over all if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments # Iterate over all requested samples for adaID in adaIDs: for fragment in fragments: # Read reference reffilename = get_consensus_filename(data_folder, adaID, fragment) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) # Open BAM bamfilename = get_mapped_filename(data_folder, adaID, fragment, filtered=False) if not os.path.isfile(bamfilename): convert_sam_to_bam(bamfilename) with pysam.Samfile(bamfilename, 'rb') as bamfile: # Iterate through reads for i, read in enumerate(bamfile): # Limit to the first reads if i >= maxreads: break
if not adaIDs: adaIDs = MiSeq_runs[seq_run]['adapters'] if VERBOSE >= 3: print 'adaIDs', adaIDs # If the script is called with no fragment, iterate over all if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments # Iterate over samples and fragments for adaID in adaIDs: for fragment in fragments: consensus = SeqIO.read( get_consensus_filename(data_folder, adaID, fragment), 'fasta') cmat = np.array(consensus) counts = np.load( get_allele_counts_filename(data_folder, adaID, fragment)) coverage = np.load( get_coverage_filename(data_folder, adaID, fragment)) nu = filter_nus(counts, coverage, VERBOSE=VERBOSE) # Note: not-covered positions are filtered, but argmax cannot work # with masked arrays cmat_af = alpha[nu.argmax(axis=0)] if hasattr(nu, 'mask'): cmat_af[nu.mask.all(axis=0)] = 'N' # Check for consistency first
ali = align_muscle(refseq, consensusseq, sort=True) if ali[0][-1] == '-': start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-')) end_nongap = len(ali[0].seq.rstrip('-')) ali = ali[:, start_nongap: end_nongap] if VERBOSE >= 2: print ali[:, :30] print ali[:, -30:] print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq) len_ali = ali.get_alignment_length() n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali)) print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)' # Ungap consensus consensusseq = SeqRecord(ali[1].seq, id=name, name=name) if '-' in consensusseq: consensusseq.seq = consensusseq.seq.ungap('-') # Write output outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True) SeqIO.write(consensusseq, outfile, 'fasta') AlignIO.write(ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta') if store_allele_counts: allele_counts.dump(get_allele_counts_filename(data_folder, adaID, frag_out))
def get_coallele_counts(data_folder, adaID, fragment, VERBOSE=0):
    '''Count pairs of alleles observed together in the same read pair.

    For every read pair of the filtered BAM file, the positions and allele
    indices of all bases in match blocks are collected, and for each pair of
    alleles (ai1, ai2) the count at every combination of their positions is
    incremented.

    Parameters:
       data_folder, adaID, fragment: identify the consensus and the BAM file
       VERBOSE: verbosity level

    Returns:
       integer array of shape (len(read_pair_types), len(alpha), len(alpha),
       len(refseq), len(refseq)).

    Raises:
       ValueError: if a CIGAR block starts beyond the reference length or has
       a type other than 0, 1, 2.
    '''
    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Allele counts and inserts (TODO: compress this data?)
    # Note: the pair is of 2 types only, while the single reads usually are of 4
    counts = np.zeros((len(read_pair_types),
                       len(alpha), len(alpha),
                       len(refseq), len(refseq)), int)
    # Scratch buffers reused for every read pair; -1 marks unused slots.
    # NOTE(review): the fixed size 501 presumably bounds the bases collected
    # per pair -- confirm it cannot be exceeded.
    positions = np.zeros(501, int)
    ais = np.zeros_like(positions)
    # TODO: no inserts for now

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over read pairs
        for i, reads in enumerate(pair_generator(bamfile)):

            # Limit to some reads for testing
            # NOTE(review): maxreads is not a parameter of this function;
            # presumably a module-level name -- confirm.
            if i > maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i +1) % 10)):
                print (i+1)

            # Divide by read 1/2 and forward/reverse
            # (the boolean is_reverse of the first read is used as index)
            js = reads[0].is_reverse
            count = counts[js]

            # List of mutations
            positions[:] = -1
            ais[:] = -1
            imut = 0

            # Collect from the pair of reads
            for read in reads:

                # Sequence and position
                # Note: stampy takes the reverse complement already
                seq = read.seq
                pos = read.pos

                # Iterate over CIGARs
                len_cig = len(read.cigar)
                for ic, (block_type, block_len) in enumerate(read.cigar):

                    # Check for pos: it should never exceed the length of the fragment
                    if (block_type in [0, 1, 2]) and (pos > len(refseq)):
                        raise ValueError('Pos exceeded the length of the fragment')

                    # Inline block
                    if block_type == 0:
                        # Get the mutations and add them
                        # NOTE(review): this maps the whole remaining seq, not
                        # only seq[:block_len] -- confirm this is intended for
                        # reads with more than one CIGAR block.
                        indb = map(alphal.index, seq)
                        positions[imut: imut + len(indb)] = \
                            pos + np.arange(len(indb))
                        ais[imut: imut + len(indb)] = indb
                        imut += len(indb)

                        # Chop off this block
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                            pos += block_len

                    # Deletion
                    elif block_type == 2:
                        # Chop off pos, but not sequence
                        pos += block_len

                    # Insertion
                    # an insert @ pos 391 means that seq[:391] is BEFORE the insert,
                    # THEN the insert, FINALLY comes seq[391:]
                    elif block_type == 1:
                        # Chop off seq, but not pos
                        if ic != len_cig - 1:
                            seq = seq[block_len:]

                    # Other types of cigar?
                    else:
                        raise ValueError('CIGAR type '+str(block_type)+' not recognized')

            if VERBOSE >= 4:
                for pos, ai in izip(positions, ais):
                    if pos == -1:
                        break
                    print pos, ai

            # Put the mutations into the matrix
            for ai1 in xrange(len(alpha)):
                for ai2 in xrange(len(alpha)):
                    coun = count[ai1, ai2]
                    pos1 = positions[ais == ai1]
                    if ai1 == ai2:
                        pos2 = pos1
                    else:
                        pos2 = positions[ais == ai2]
                    # All combinations of the two position sets, turned into
                    # indices of the flattened matrix
                    coords = np.meshgrid(pos1, pos2)
                    ind = coords[0].ravel() * coun.shape[0] + coords[1].ravel()
                    # NOTE(review): this relies on coun.ravel() being a view
                    # of counts, and repeated indices are incremented only
                    # once by fancy-index += -- confirm both.
                    coun.ravel()[ind] += 1

    return counts
def filter_reads(data_folder, adaID, fragment, VERBOSE=0,
                 maxreads=-1, contaminants=None,
                 n_cycles=600,
                 max_mismatches=30,
                 susp_mismatches=20,
                 summary=True, plot=False):
    '''Filter the reads to good chunks.

    Read pairs of the unfiltered mapped BAM file are sorted into three
    output BAM files: good pairs, suspected contaminations and trash. A pair
    is trashed if it is unmapped, not properly paired, overhanging the
    fragment edge, has more than max_mismatches mismatches from the
    consensus, or has bad CIGARs; it is marked as suspicious if it has more
    than susp_mismatches mismatches and either no contaminants are given or
    check_suspect flags it.

    Parameters:
       data_folder, adaID: identify the sample
       fragment: fragment name; its first two characters select the files
       VERBOSE: verbosity level
       maxreads: stop after this many read pairs (-1 never matches, so all
                 pairs are processed)
       contaminants: forwarded to check_suspect if not None
       n_cycles: sets the size of the mismatch histograms (n_cycles + 1 bins)
       max_mismatches: threshold above which a pair is trashed
       susp_mismatches: threshold above which a pair is tested as suspicious
       summary: append the counts to the filter summary file
       plot: plot the distance histograms

    Returns:
       None. If neither the BAM nor the SAM input exists the function returns
       early without writing anything.

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: ' + adaID + ', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(
        data_folder, adaID, frag_gen)
    # The trash file sits next to the output file (its 4-character extension
    # is replaced)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            # Histograms of the number of mismatches per pair: overall, and
            # binned by position along the reference
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros(
                (len(ref) // binsize + 1, n_cycles + 1), int)

            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                # Index of the forward read: 1 if the first read is reverse
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair ' + str(irp) + ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                # NOTE(review): dc.sum() is used as an index into arrays of
                # size n_cycles + 1 -- confirm it cannot exceed n_cycles.
                histogram_distance_from_consensus[dc.sum()] += 1
                # Bin of the midpoint of the insert along the reference
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if (dc.sum() > max_mismatches):
                    if VERBOSE >= 2:
                        print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                                'Read pair '+read1.qname+': too many mismatches '+\
                                '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas superinfection
                # happens for all. At this stage, we can only give clues about
                # cross-contamination, the rest will be done in a script downstream
                # (here we could TAG suspicious reads for contamination)
                elif (dc.sum() > susp_mismatches):
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # NOTE(review): match_len_min and trim_bad_cigars are not
                # parameters of this function; presumably module-level names
                # -- confirm.
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mispapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(
            data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            # NOTE(review): irp is unbound if the input contains no read
            # pairs, and irp + 1 overcounts by one when the loop stopped at
            # maxreads -- confirm.
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)
        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1, contaminants=None, n_cycles=600, max_mismatches=30, susp_mismatches=20, summary=True, plot=False): '''Filter the reads to good chunks''' frag_gen = fragment[:2] reffilename = get_consensus_filename(data_folder, adaID, frag_gen) refseq = SeqIO.read(reffilename, 'fasta') ref = np.array(refseq) bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam', filtered=False) if not os.path.isfile(bamfilename): samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam', filtered=False) if os.path.isfile(samfilename): convert_sam_to_bam(bamfilename) else: if VERBOSE >= 1: print 'ERROR: '+adaID+', mapped file not found.' return outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam', filtered=True) suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen) trashfilename = outfilename[:-4]+'_trashed.bam' with pysam.Samfile(bamfilename, 'rb') as bamfile: with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\ pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\ pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile: # Iterate over all pairs n_good = 0 n_wrongname = 0 n_unmapped = 0 n_unpaired = 0 n_mutator = 0 n_suspect = 0 n_mismapped_edge = 0 n_badcigar = 0 histogram_distance_from_consensus = np.zeros(n_cycles + 1, int) binsize = 200 histogram_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int) for irp, reads in enumerate(pair_generator(bamfile)): # Limit to the first reads if irp == maxreads: break # Assign names (read1, read2) = reads i_fwd = reads[0].is_reverse # Check a few things to make sure we are looking at paired reads if read1.qname != read2.qname: n_wrongname += 1 raise ValueError('Read pair '+str(irp)+': reads have different names!') # Ignore unmapped reads if read1.is_unmapped or read2.is_unmapped: if VERBOSE >= 2: print 'Read pair '+read1.qname+': unmapped' 
n_unmapped += 1 map(trashfile.write, reads) continue # Ignore not properly paired reads (this includes mates sitting on # different fragments) if (not read1.is_proper_pair) or (not read2.is_proper_pair): if VERBOSE >= 2: print 'Read pair '+read1.qname+': not properly paired' n_unpaired += 1 map(trashfile.write, reads) continue # Mismappings are sometimes at fragment edges: # Check for overhangs beyond the edge skip = check_overhanging_reads(reads, len(ref)) if skip: n_mismapped_edge += 1 map(trashfile.write, reads) continue # Mismappings are often characterized by many mutations: # check the number of mismatches of the whole pair and skip reads with too many dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE) histogram_distance_from_consensus[dc.sum()] += 1 hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize histogram_dist_along[hbin, dc.sum()] += 1 if (dc.sum() > max_mismatches): if VERBOSE >= 2: print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\ 'Read pair '+read1.qname+': too many mismatches '+\ '('+str(dc[0])+' + '+str(dc[1])+')' n_mutator += 1 map(trashfile.write, reads) continue # Check for contamination from other PCR plates. Typically, # contamination happens for only one fragment, whereas superinfection # happens for all. 
At this stage, we can only give clues about # cross-contamination, the rest will be done in a script downstream # (here we could TAG suspicious reads for contamination) elif (dc.sum() > susp_mismatches): if contaminants is not None: skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE) else: skip = True if skip: n_suspect += 1 map(suspfile.write, reads) continue # Trim the bad CIGARs from the sides, if there are any good ones skip = trim_bad_cigar(reads, match_len_min=match_len_min, trim_left=trim_bad_cigars, trim_right=trim_bad_cigars) if skip: n_badcigar += 1 map(trashfile.write, reads) continue # TODO: we might want to incorporate some more stringent # criterion here, to avoid short reads, cross-overhang, etc. # Write the output n_good += 1 map(outfile.write, reads) if VERBOSE >= 1: print 'Read pairs: ' print 'Good:', n_good print 'Unmapped:', n_unmapped print 'Unpaired:', n_unpaired print 'Mispapped at edge:', n_mismapped_edge print 'Many-mutations:', n_mutator print 'Suspect contaminations:', n_suspect print 'Bad CIGARs:', n_badcigar if summary: summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment) with open(summary_filename, 'a') as f: f.write('Filter results: adaID '+adaID+fragment+'\n') f.write('Total:\t\t\t'+str(irp + 1)+'\n') f.write('Good:\t\t\t'+str(n_good)+'\n') f.write('Unmapped:\t\t'+str(n_unmapped)+'\n') f.write('Unpaired:\t\t'+str(n_unpaired)+'\n') f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n') f.write('Many-mutations:\t\t'+str(n_mutator)+'\n') f.write('Suspect contaminations:\t'+str(n_suspect)+'\n') f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n') if plot: plot_distance_histogram(data_folder, adaID, frag_gen, histogram_distance_from_consensus, savefig=True) plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen, len(ref), histogram_dist_along, binsize=binsize, savefig=True)