def check_division(data_folder, adaID, fragment, seq_run, qual_min=35, reference='HXB2', maxreads=-1, VERBOSE=0, minor_allele=False):
    '''Check division into fragments: coverage, etc.'''
    reference_fn = get_reference_premap_filename(data_folder, adaID, fragment)
    # FIXME: old nomenclature for F3a
    if (not os.path.isfile(reference_fn)) and (fragment[:2] == 'F3'):
        reference_fn = reference_fn.replace('F3a', 'F3')
    refseq = SeqIO.read(reference_fn, 'fasta')

    # Scan reads of the divided BAM file for this fragment
    bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    # FIXME: old nomenclature for F3a
    if (not os.path.isfile(bamfilename)) and (fragment[:2] == 'F3'):
        bamfilename = bamfilename.replace('F3a', 'F3')
    counts, inserts = get_allele_counts_insertions_from_file(bamfilename,
                                                             len(refseq),
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results, labelling the figure with run/adapter/fragment info
    title = ', '.join([label + ' ' + str(value)
                       for (label, value) in [('run', seq_run),
                                              ('adaID', adaID),
                                              ('fragment', fragment),
                                              ('maxreads', maxreads)]])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True): '''Make index and hash files for reference or consensus''' if VERBOSE: print 'Making index and hash files: adaID', adaID # 1. Make genome index file for reference if os.path.isfile(get_reference_premap_index_filename(data_folder, adaID, ext=True)): os.remove(get_reference_premap_index_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([stampy_bin, '--species="HIV"', '--overwrite', '-G', get_reference_premap_index_filename(data_folder, adaID, ext=False), get_reference_premap_filename(data_folder, adaID), ], stderr=sp.STDOUT) if VERBOSE: print 'Built index: '+adaID # 2. Build a hash file for reference if os.path.isfile(get_reference_premap_hash_filename(data_folder, adaID, ext=True)): os.remove(get_reference_premap_hash_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-H', get_reference_premap_hash_filename(data_folder, adaID, ext=False), ], stderr=sp.STDOUT) if VERBOSE: print 'Built hash: '+adaID if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\n') f.write('Stampy index and hash written.') f.write('\n')
def get_reference_filename(data_folder, adaID, fragment, n_iter, ext=True):
    '''Get the reference filename for the intermediate mappings'''
    if n_iter == 1:
        # First iteration maps against the premap reference
        filename = get_reference_premap_filename(data_folder, adaID, fragment)
        if ext:
            return filename
        return filename[:-6]  # strip the '.fasta' suffix

    # Later iterations map against the previous iteration's consensus
    basename = 'consensus_' + str(n_iter - 1) + '_' + fragment
    filename = data_folder + foldername_adapter(adaID) + 'map_iter/' + basename
    if ext:
        filename = filename + '.fasta'
    return filename
def get_reference_filename(data_folder, adaID, fragment, n_iter, ext=True):
    '''Get the reference filename for the intermediate mappings'''
    if n_iter == 1:
        # First iteration maps against the premap reference
        filename = get_reference_premap_filename(data_folder, adaID, fragment)
        if ext:
            return filename
        return filename[:-6]  # strip the '.fasta' suffix

    # Later iterations map against the previous iteration's consensus
    basename = 'consensus_' + str(n_iter - 1) + '_' + fragment
    filename = data_folder + foldername_adapter(adaID) + 'map_iter/' + basename
    if ext:
        filename = filename + '.fasta'
    return filename
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Produce a report on rough coverage on reference (ignore inserts)'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # One coverage counter per reference position
    coverage = np.zeros(len(refseq), int)

    # Parse the BAM file, accumulating coverage read by read
    n_unmapped = 0
    n_mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            if read.is_unmapped or (not read.is_proper_pair) \
               or (not len(read.cigar)):
                n_unmapped += 1
                continue

            # Walk the CIGAR: matches (0) and deletions (2) count as covered
            pos_ref = read.pos
            for (block_type, block_len) in read.cigar:
                if block_type in (0, 2):
                    coverage[pos_ref: pos_ref + block_len] += 1
                    pos_ref += block_len
            n_mapped += 1

    # Plot on a log scale (+1 avoids log(0)) and save the figure
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    # Append counts and figure path to the premap summary
    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: ' +
                    str(n_mapped)+' read pairs mapped, '+str(n_unmapped)+' unmapped\n')
            f.write('\nCoverage plotted: ' +
                    get_coverage_figure_filename(data_folder, adaID, 'premapped')+'\n')
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Produce a report on rough coverage on reference (ignore inserts)'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # One coverage counter per reference position
    coverage = np.zeros(len(refseq), int)

    # Parse the BAM file, accumulating coverage read by read
    n_unmapped = 0
    n_mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            if read.is_unmapped or (not read.is_proper_pair) \
               or (not len(read.cigar)):
                n_unmapped += 1
                continue

            # Walk the CIGAR: matches (0) and deletions (2) count as covered
            pos_ref = read.pos
            for (block_type, block_len) in read.cigar:
                if block_type in (0, 2):
                    coverage[pos_ref: pos_ref + block_len] += 1
                    pos_ref += block_len
            n_mapped += 1

    # Plot on a log scale (+1 avoids log(0)) and save the figure
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    # Append counts and figure path to the premap summary
    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: ' +
                    str(n_mapped)+' read pairs mapped, '+str(n_unmapped)+' unmapped\n')
            f.write('\nCoverage plotted: ' +
                    get_coverage_figure_filename(data_folder, adaID, 'premapped')+'\n')
def store_reference_fragmented(data_folder, adaID, refseq, fragment_trim_poss_dict):
    '''Store FASTA files for the reference in fragments'''
    for fragment, positions in fragment_trim_poss_dict.iteritems():
        # Mixed fragments carry dicts of positions: use the inner coordinates
        if not np.isscalar(positions[0]):
            positions = [positions[0]['inner'], positions[1]['inner']]
        start, end = positions[0], positions[1]

        # Slice the reference and tag id/name/description with the fragment
        seq_frag = refseq[start: end]
        seq_frag.id = seq_frag.id + '_' + fragment
        seq_frag.name = seq_frag.name + '_' + fragment
        seq_frag.description = seq_frag.description + ', fragment ' + fragment

        SeqIO.write(seq_frag,
                    get_reference_premap_filename(data_folder, adaID, fragment),
                    'fasta')
def score_consensus(sample, VERBOSE=0):
    '''Score a consensus based on completeness and quality.

    Parameters:
       sample: sequencing sample object; provides .sequencing_run.folder,
               .adapter, and .regions_complete
       VERBOSE (int): verbosity level

    Returns:
       (bool, str) pair: (True, '') when the fragment is not part of this
       sample, (True, 'OK') when the consensus looks fine, otherwise
       (False, reason) with reason one of 'MISS' (no consensus file),
       'MISSREF' (no premap reference), 'SHORT', 'LONG'.

    NOTE(review): `fragment` is read but never defined in this function;
    presumably it is a module-level global set by the calling script --
    confirm before calling this from elsewhere.
    '''
    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # Find the complete region spec (e.g. 'F3ao') matching this fragment
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        # Fragment not sequenced in this sample: nothing to score
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    frag_spec = frag_spec[0]
    fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    if not os.path.isfile(fn_ref):
        # Old nomenclature: F3a used to be called F3
        if frag_spec[:3] == 'F3a':
            frag_spec = frag_spec.replace('a', '')
            fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
            if not os.path.isfile(fn_ref):
                return (False, 'MISSREF')
        else:
            return (False, 'MISSREF')

    # Length check: consensus should be within 200 bp of the reference
    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')
    if len(cons) < len(ref) - 200:
        return (False, 'SHORT')
    elif len(cons) > len(ref) + 200:
        return (False, 'LONG')

    # TODO: alignment-based quality check was started but never finished
    #ali = align_global(str(ref.seq), str(cons.seq), band=200)
    #alim1 = np.fromstring(ali[1], 'S1')
    #alim2 = np.fromstring(ali[2], 'S1')
    #if (alim1 != alim2).sum() >

    return (True, 'OK')
def score_consensus(sample, VERBOSE=0):
    '''Score a consensus based on completeness and quality.

    Parameters:
       sample: sequencing sample object; provides .sequencing_run.folder,
               .adapter, and .regions_complete
       VERBOSE (int): verbosity level

    Returns:
       (bool, str) pair: (True, '') when the fragment is not part of this
       sample, (True, 'OK') when the consensus looks fine, otherwise
       (False, reason) with reason one of 'MISS' (no consensus file),
       'MISSREF' (no premap reference), 'SHORT', 'LONG'.

    NOTE(review): `fragment` is read but never defined in this function;
    presumably it is a module-level global set by the calling script --
    confirm before calling this from elsewhere.
    '''
    data_folder = sample.sequencing_run.folder
    adaID = sample.adapter

    # Find the complete region spec (e.g. 'F3ao') matching this fragment
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        # Fragment not sequenced in this sample: nothing to score
        return (True, '')

    fn = get_consensus_filename(data_folder, adaID, fragment)
    if not os.path.isfile(fn):
        return (False, 'MISS')

    frag_spec = frag_spec[0]
    fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
    if not os.path.isfile(fn_ref):
        # Old nomenclature: F3a used to be called F3
        if frag_spec[:3] == 'F3a':
            frag_spec = frag_spec.replace('a', '')
            fn_ref = get_reference_premap_filename(data_folder, adaID, frag_spec)
            if not os.path.isfile(fn_ref):
                return (False, 'MISSREF')
        else:
            return (False, 'MISSREF')

    # Length check: consensus should be within 200 bp of the reference
    ref = SeqIO.read(fn_ref, 'fasta')
    cons = SeqIO.read(fn, 'fasta')
    if len(cons) < len(ref) - 200:
        return (False, 'SHORT')
    elif len(cons) > len(ref) + 200:
        return (False, 'LONG')

    # TODO: alignment-based quality check was started but never finished
    #ali = align_global(str(ref.seq), str(cons.seq), band=200)
    #alim1 = np.fromstring(ali[1], 'S1')
    #alim2 = np.fromstring(ali[2], 'S1')
    #if (alim1 != alim2).sum() >

    return (True, 'OK')
def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True): '''Make index and hash files for reference or consensus''' if VERBOSE: print 'Making index and hash files: adaID', adaID # 1. Make genome index file for reference if os.path.isfile( get_reference_premap_index_filename(data_folder, adaID, ext=True)): os.remove( get_reference_premap_index_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([ stampy_bin, '--species="HIV"', '--overwrite', '-G', get_reference_premap_index_filename(data_folder, adaID, ext=False), get_reference_premap_filename(data_folder, adaID), ], stderr=sp.STDOUT) if VERBOSE: print 'Built index: ' + adaID # 2. Build a hash file for reference if os.path.isfile( get_reference_premap_hash_filename(data_folder, adaID, ext=True)): os.remove( get_reference_premap_hash_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([ stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-H', get_reference_premap_hash_filename(data_folder, adaID, ext=False), ], stderr=sp.STDOUT) if VERBOSE: print 'Built hash: ' + adaID if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\n') f.write('Stampy index and hash written.') f.write('\n')
def check_division(data_folder, adaID, fragment, seq_run, qual_min=35, reference='HXB2', maxreads=-1, VERBOSE=0, minor_allele=False):
    '''Check division into fragments: coverage, etc.'''
    reference_fn = get_reference_premap_filename(data_folder, adaID, fragment)
    # FIXME: old nomenclature for F3a
    if (not os.path.isfile(reference_fn)) and (fragment[:2] == 'F3'):
        reference_fn = reference_fn.replace('F3a', 'F3')
    refseq = SeqIO.read(reference_fn, 'fasta')

    # Scan reads of the divided BAM file for this fragment
    bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    # FIXME: old nomenclature for F3a
    if (not os.path.isfile(bamfilename)) and (fragment[:2] == 'F3'):
        bamfilename = bamfilename.replace('F3a', 'F3')
    counts, inserts = get_allele_counts_insertions_from_file(bamfilename,
                                                             len(refseq),
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results, labelling the figure with run/adapter/fragment info
    title = ', '.join([label + ' ' + str(value)
                       for (label, value) in [('run', seq_run),
                                              ('adaID', adaID),
                                              ('fragment', fragment),
                                              ('maxreads', maxreads)]])
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments, maxreads=-1, VERBOSE=0, minisize=100, include_tests=False, summary=True):
    '''Trim reads and divide them into fragments.

    Reads the premapped BAM, assigns each proper read pair to a PCR fragment,
    trims primers and low-quality tails, and writes the pairs into one BAM
    per fragment plus four special buckets (ambiguous, cross-fragment,
    unmapped, low-quality).

    NOTE(review): `n_cycles` is accepted but never used in this body -- confirm
    whether it can be dropped by callers or is needed again.
    '''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
                ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                        ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')

    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        # Inner-PCR fragments (suffix 'i') have a corresponding outer primer 'o'
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'], 'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'], 'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        try:
            # NOTE(review): if any of these opens raises, `file_handles` (and the
            # fo_* handles) may be unbound and the finally clause itself raises
            # NameError -- confirm whether that failure mode matters here.
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            # NOTE(review): if the BAM contains no read pairs, `irp` is never
            # bound and the report below raises NameError -- confirm inputs
            # are always non-empty.
            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break
                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                # is_reverse of reads[0] doubles as index of the fwd read (0/1)
                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, mini, or insert size mini, or
                # divergent read pair (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    # Mixed fragments store a dict; take the inner coordinates
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                # --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()

    if VERBOSE:
        # NOTE(review): this prints `irp` while the summary below writes
        # `irp + 1` -- the two totals differ by one unless the loop broke at
        # maxreads; confirm which is intended.
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
f.write('Call: python build_consensus.py'+\ ' --run '+seq_run+\ ' --adaIDs '+adaID+\ ' --fragments '+fragment+\ ' --block-length '+str(block_len_initial)+\ ' --reads-per-alignment '+str(n_reads_per_ali)+\ ' --verbose '+str(VERBOSE)) if store_allele_counts: f.write(' --allele-counts') f.write('\n') if VERBOSE: print seq_run, adaID, fragment if fragment == 'genomewide': refseq = SeqIO.read( get_reference_premap_filename(data_folder, adaID), 'fasta') bamfilename = get_premapped_filename(data_folder, adaID, type='bam') frag_out = fragment else: fn = get_reference_premap_filename(data_folder, adaID, fragment) bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam') #FIXME: old nomenclature for F3a is F3 if not os.path.isfile(fn) and fragment[:3] == 'F3a': fn = get_reference_premap_filename(data_folder, adaID,
def make_reference(data_folder, adaID, fragments, refname, VERBOSE=0, summary=True):
    '''Make reference sequence trimmed to the necessary parts.

    Loads the custom reference `refname`, finds the best match of the first
    fwd and last rev PCR primers, trims the reference to that window
    (primers included), and writes the result to the premap reference file.
    If `fragments` is None the whole reference is written untrimmed.

    NOTE(review): `fragments` is mutated in place (fragments[0] / fragments[-1]
    are overwritten for mixed fragments like F5a+b) -- the caller's list
    changes; confirm this side effect is relied upon.
    '''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq

    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            # Mixed first fragment: pick the sub-fragment whose fwd primer
            # starts earliest in HXB2 coordinates
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]
        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            # Mixed last fragment: pick the sub-fragment whose rev primer
            # ends latest in HXB2 coordinates
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]
        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        # Best-matching window position for the fwd primer (max identity over
        # all unambiguous expansions)
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        # Only search downstream of the fwd primer
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd), len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        # (ties resolved in favour of the rightmost occurrence)
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output
        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        # Record the 1-based trim coordinates in id/name/description
        seq_trim.id = '_'.join(
            [seq_trim.id, str(pr_fwd_pos + 1), str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name, str(pr_fwd_pos + 1), str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from', str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)), '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')
def check_premap(data_folder, adaID, fragments, seq_run, samplename, qual_min=30, match_len_min=10, maxreads=-1, VERBOSE=0, savefig=True, title=None):
    '''Check premap to reference: coverage, etc.

    Computes unfiltered allele counts from the premapped BAM and plots
    coverage, overlaying fragment boundaries when the fragment-positions
    file exists.

    Returns:
       (counts, inserts) from the allele-count routine, or (None, None)
       when the premapped BAM file is missing.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        # Description not in the expected trimmed-reference format: fall back
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    # Positions stored as a dict repr: pull the 'inner' values
                    start = int(
                        fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(
                        fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'
        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T
    else:
        frags_pos = None
    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(input_filename,
                                                                        len(refseq),
                                                                        qual_min=qual_min,
                                                                        match_len_min=match_len_min,
                                                                        maxreads=maxreads,
                                                                        VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        # NOTE(review): with the default maxreads=-1, min(maxreads, n_reads)
        # is -1 and the title shows 'reads -1/N' -- confirm intended.
        title=', '.join(['run '+seq_run+' '+adaID,
                         'sample '+samplename,
                         'reads '+str(min(maxreads, n_reads))+'/'+str(n_reads),
                        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder+foldername_adapter(adaID)+'figures/coverage_premapped_'+samplename+'.png')

    return (counts, inserts)
# Chunk of a build_consensus driver script: records the command line in a
# summary file, then resolves the reference and BAM for one fragment.
# NOTE(review): `sfn`, `seq_run`, `adaID`, `fragment`, `block_len_initial`,
# `n_reads_per_ali`, `store_allele_counts` and `data_folder` are presumably
# defined upstream (argument parsing / per-fragment loop) -- confirm.
with open(sfn, 'w') as f:
    f.write('Call: python build_consensus.py'+\
            ' --run '+seq_run+\
            ' --adaIDs '+adaID+\
            ' --fragments '+fragment+\
            ' --block-length '+str(block_len_initial)+\
            ' --reads-per-alignment '+str(n_reads_per_ali)+\
            ' --verbose '+str(VERBOSE))
    if store_allele_counts:
        f.write(' --allele-counts')
    f.write('\n')

if VERBOSE:
    print seq_run, adaID, fragment

if fragment == 'genomewide':
    # Genomewide consensus: use the whole premap reference and premapped BAM
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    frag_out = fragment
else:
    fn = get_reference_premap_filename(data_folder, adaID, fragment)
    bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    #FIXME: old nomenclature for F3a is F3
    if not os.path.isfile(fn) and fragment[:3] == 'F3a':
        fn = get_reference_premap_filename(data_folder, adaID,
                                           'F3'+fragment[-1])
    if not os.path.isfile(bamfilename) and fragment[:3] == 'F3a':
        bamfilename = get_divided_filename(data_folder, adaID,
                                           'F3'+fragment[-1], type='bam')
    refseq = SeqIO.read(fn, 'fasta')
    # Output label drops the subfragment letter (e.g. 'F3a' -> 'F3')
    frag_out = fragment[:2]
def make_reference(data_folder, adaID, fragments, refname, VERBOSE=0, summary=True):
    '''Make reference sequence trimmed to the necessary parts.

    Loads the custom reference `refname`, finds the best match of the first
    fwd and last rev PCR primers, trims the reference to that window
    (primers included), and writes the result to the premap reference file.
    If `fragments` is None the whole reference is written untrimmed.

    NOTE(review): `fragments` is mutated in place (fragments[0] / fragments[-1]
    are overwritten for mixed fragments like F5a+b) -- the caller's list
    changes; confirm this side effect is relied upon.
    '''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq

    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        if '+' in fragments[0]:
            # Mixed first fragment: pick the sub-fragment whose fwd primer
            # starts earliest in HXB2 coordinates
            fragment_subs = [
                fragments[0][:2] + fsub + fragments[0][-1]
                for fsub in fragments[0][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][0][0] for fsub in fragment_subs
            ]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]
        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            # Mixed last fragment: pick the sub-fragment whose rev primer
            # ends latest in HXB2 coordinates
            fragment_subs = [
                fragments[-1][:2] + fsub + fragments[-1][-1]
                for fsub in fragments[-1][2:-1].split('+')
            ]
            fr_pos_subs = [
                primers_coordinates_HXB2[fsub][1][1] for fsub in fragment_subs
            ]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]
        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        # Best-matching window position for the fwd primer (max identity over
        # all unambiguous expansions)
        n_matches_fwd = [
            (smat[i:i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
            for i in xrange(len(seq) - len(pr_fwd))
        ]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        # Only search downstream of the fwd primer
        n_matches_rev = [
            (smat[i:i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
            for i in xrange(pr_fwd_pos + len(pr_fwd), len(seq) - len(pr_rev))
        ]
        # Here you come from the right, i.e. look in the 3' LTR first
        # (ties resolved in favour of the rightmost occurrence)
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(
            n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output
        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]
        # Record the 1-based trim coordinates in id/name/description
        seq_trim.id = '_'.join(
            [seq_trim.id, str(pr_fwd_pos + 1), str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([
            seq_trim.name, str(pr_fwd_pos + 1), str(pr_rev_pos + len(pr_rev))
        ])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from', str(pr_fwd_pos + 1), 'to',
            str(pr_rev_pos + len(pr_rev)), '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')
def check_premap(data_folder, adaID, fragments, seq_run, samplename, qual_min=30, match_len_min=10, maxreads=-1, VERBOSE=0, savefig=True, title=None):
    '''Check premap to reference: coverage, etc.

    Computes unfiltered allele counts from the premapped BAM and plots
    coverage, overlaying fragment boundaries when the fragment-positions
    file exists.

    Returns:
       (counts, inserts) from the allele-count routine, or (None, None)
       when the premapped BAM file is missing.
    '''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        # Description not in the expected trimmed-reference format: fall back
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    # Positions stored as a dict repr: pull the 'inner' values
                    start = int(
                        fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(
                        fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'
        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T
    else:
        frags_pos = None
    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename,
        len(refseq),
        qual_min=qual_min,
        match_len_min=match_len_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        # NOTE(review): with the default maxreads=-1, min(maxreads, n_reads)
        # is -1 and the title shows 'reads -1/N' -- confirm intended.
        title = ', '.join([
            'run ' + seq_run + ' ' + adaID,
            'sample ' + samplename,
            'reads ' + str(min(maxreads, n_reads)) + '/' + str(n_reads),
        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder + foldername_adapter(adaID) +
                    'figures/coverage_premapped_' + samplename + '.png')

    return (counts, inserts)