def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True):
    '''Make index and hash files for reference or consensus'''
    if VERBOSE:
        print 'Making index and hash files: adaID', adaID

    # 1. Make genome index file for reference
    if os.path.isfile(get_reference_premap_index_filename(data_folder, adaID, ext=True)):
        os.remove(get_reference_premap_index_filename(data_folder, adaID, ext=True))
    stdout = sp.check_output([stampy_bin,
                              '--species="HIV"',
                              '--overwrite',
                              '-G', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                              get_reference_premap_filename(data_folder, adaID),
                              ],
                             stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built index: ' + adaID

    # 2. Build a hash file for reference
    if os.path.isfile(get_reference_premap_hash_filename(data_folder, adaID, ext=True)):
        os.remove(get_reference_premap_hash_filename(data_folder, adaID, ext=True))
    stdout = sp.check_output([stampy_bin,
                              '--overwrite',
                              '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                              '-H', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                              ],
                             stderr=sp.STDOUT)
    if VERBOSE:
        print 'Built hash: ' + adaID

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')
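# Usage sketch for the indexing step above; the folder and adapter ID below
# are hypothetical. Stampy's -G writes a <prefix>.stidx genome index and
# -H writes a <prefix>.sthash hash file next to the reference; premap_stampy
# later points -g/-h at the same extension-less prefix:
#
#     make_index_and_hash('/data/run42/', 'TS2', VERBOSE=1)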
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Produce a report on rough coverage on reference (ignore inserts)'''
    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')

    # Prepare data structures
    coverage = np.zeros(len(refseq), int)

    # Parse the BAM file
    unmapped = 0
    mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            # Count as unmapped also reads without a proper pair or without
            # a CIGAR, since they do not contribute to coverage
            if read.is_unmapped or (not read.is_proper_pair) or (not len(read.cigar)):
                unmapped += 1
                continue

            # Proceed along CIGARs
            ref_pos = read.pos
            for (bt, bl) in read.cigar:
                if bt not in (0, 2):
                    continue
                # Treat deletions as 'covered'
                coverage[ref_pos: ref_pos + bl] += 1
                ref_pos += bl
            mapped += 1

    # Save results
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: ' +
                    str(mapped) + ' reads mapped, ' + str(unmapped) + ' unmapped\n')
            f.write('\nCoverage plotted: ' +
                    get_coverage_figure_filename(data_folder, adaID, 'premapped') + '\n')
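# The CIGAR walk above only advances along the reference for matches (code 0)
# and deletions (code 2); insertions (code 1) and clips consume no reference
# bases and are skipped. A minimal, self-contained sketch of the same
# accumulation logic on hand-made (hypothetical) data:
def _example_coverage_from_cigar():
    '''Sketch of the CIGAR walk in report_coverage, on toy data'''
    cov = np.zeros(20, int)
    ref_pos = 2
    cigar = [(0, 5), (1, 2), (2, 3), (0, 4)]  # 5M 2I 3D 4M
    for (bt, bl) in cigar:
        if bt not in (0, 2):
            continue  # insertions do not consume reference bases
        cov[ref_pos: ref_pos + bl] += 1
        ref_pos += bl
    # cov is now 1 over reference positions 2..13 (5 M + 3 D + 4 M = 12 bases)
    return cov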
def report_insert_size(data_folder, adaID, seq_run, VERBOSE=0, summary=True):
    '''Produce figures of the insert size distribution'''
    from hivwholeseq.sequencing.check_insert_distribution import \
        get_insert_size_distribution, plot_cumulative_histogram, plot_histogram

    bins = np.linspace(0, 1000, 100)
    isz, h = get_insert_size_distribution(data_folder, adaID, 'premapped',
                                          bins=bins, maxreads=10000,
                                          VERBOSE=VERBOSE)

    plot_cumulative_histogram(data_folder, adaID, 'premapped', isz,
                              savefig=True,
                              title='run ' + str(seq_run) + ', adaID ' + str(adaID) + ', premap',
                              lw=2, c='b')
    plot_histogram(data_folder, adaID, 'premapped', h,
                   savefig=True,
                   title='run ' + str(seq_run) + ', adaID ' + str(adaID) + ', premap',
                   lw=2, color='b')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nInsert size distribution plotted:\n')
            f.write(get_insert_size_distribution_cumulative_filename(
                data_folder, adaID, 'premapped') + '\n')
            f.write(get_insert_size_distribution_filename(
                data_folder, adaID, 'premapped') + '\n')
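# The measurement itself lives in check_insert_distribution (imported above).
# The underlying idea, sketched here for clarity and NOT the imported
# implementation: for a properly paired read, the template length (isize)
# of the leftmost mate is the insert size.
def _example_insert_sizes(bamfilename, maxreads=10000):
    '''Sketch of collecting insert sizes from a premapped BAM (toy version)'''
    insert_sizes = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for i, read in enumerate(bamfile):
            if i >= maxreads:
                break
            # Keep only the leftmost mate of proper pairs (positive isize)
            if read.is_proper_pair and read.isize > 0:
                insert_sizes.append(read.isize)
    return np.array(insert_sizes)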
def make_reference(data_folder, adaID, fragments, refname, VERBOSE=0, summary=True):
    '''Make reference sequence trimmed to the necessary parts'''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq

    else:
        # Look for the first fwd and the last rev primers to trim the reference.
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2

        if '+' in fragments[0]:
            fragment_subs = [fragments[0][:2] + fsub + fragments[0][-1]
                             for fsub in fragments[0][2:-1].split('+')]
            fr_pos_subs = [primers_coordinates_HXB2[fsub][0][0]
                           for fsub in fragment_subs]
            fragments[0] = fragment_subs[np.argmin(fr_pos_subs)]
        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragment_subs = [fragments[-1][:2] + fsub + fragments[-1][-1]
                             for fsub in fragments[-1][2:-1].split('+')]
            fr_pos_subs = [primers_coordinates_HXB2[fsub][1][1]
                           for fsub in fragment_subs]
            fragments[-1] = fragment_subs[np.argmax(fr_pos_subs)]
        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
        pr_fwd_mat = np.array(map(list, eas(pr_fwd)), 'S1')
        n_matches_fwd = [(smat[i: i + len(pr_fwd)] == pr_fwd_mat).sum(axis=1).max()
                         for i in xrange(len(seq) - len(pr_fwd))]
        pr_fwd_pos = np.argmax(n_matches_fwd)

        pr_rev_mat = np.array(map(list, eas(pr_rev)), 'S1')
        n_matches_rev = [(smat[i: i + len(pr_rev)] == pr_rev_mat).sum(axis=1).max()
                         for i in xrange(pr_fwd_pos + len(pr_fwd), len(seq) - len(pr_rev))]
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len(pr_rev) - 1 - np.argmax(n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print output
        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos: pr_rev_pos + len(pr_rev)]
        seq_trim.id = '_'.join([seq_trim.id,
                                str(pr_fwd_pos + 1),
                                str(pr_rev_pos + len(pr_rev))])
        seq_trim.name = '_'.join([seq_trim.name,
                                  str(pr_fwd_pos + 1),
                                  str(pr_rev_pos + len(pr_rev))])
        seq_trim.description = ' '.join([seq_trim.description,
                                         'from', str(pr_fwd_pos + 1),
                                         'to', str(pr_rev_pos + len(pr_rev)),
                                         '(indices from 1, extremes included)'])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')
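# The primer placement above scores every candidate position by the best
# per-variant number of matching bases after expanding ambiguous nucleotides,
# then takes the argmax. A toy sketch of that scoring (the primer 'GAY'
# expands to GAC/GAT; the sequence and primer here are hypothetical):
def _example_primer_match():
    '''Sketch of the fuzzy primer matching used in make_reference'''
    seq = np.array(list('TTGACTT'), 'S1')
    pr_variants = np.array(map(list, ['GAC', 'GAT']), 'S1')  # expansion of 'GAY'
    n_matches = [(seq[i: i + 3] == pr_variants).sum(axis=1).max()
                 for i in xrange(len(seq) - 3)]
    return np.argmax(n_matches)  # 2, i.e. best match 'GAC' at position 2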
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True,
                  maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' + input_filenames[0])

    # Parallelize if requested
    if threads == 1:
        call_list = [stampy_bin,
                     '--overwrite',
                     '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                     '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                     '-o', get_premapped_filename(data_folder, adaID, type='sam'),
                     '--insertsize=450',
                     '--insertsd=100',
                     '--substitutionrate=' + str(subsrate),
                     '--gapopen=' + str(gapopen),
                     '--gapextend=' + str(gapextend)]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: ' +
                        get_premapped_filename(data_folder, adaID, type='bam') + '\n')

    else:
        # Multithreading works as follows: call qsub + stampy, monitor the job
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [get_premapped_filename(data_folder, adaID,
                                                    type='bam', part=(j + 1))
                             for j in xrange(threads)]

        # Bookkeeping for the submitted jobs
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map calls
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', adaID + ' p' + str(j + 1),
                         '-l', 'h_rt=' + cluster_time[threads >= 30],
                         '-l', 'h_vmem=' + vmem,
                         stampy_bin,
                         '--overwrite',
                         '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                         '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                         '-o', get_premapped_filename(data_folder, adaID, type='sam', part=(j + 1)),
                         '--processpart=' + str(j + 1) + '/' + str(threads),
                         '--insertsize=450',
                         '--insertsd=100',
                         '--substitutionrate=' + str(subsrate),
                         '--gapopen=' + str(gapopen),
                         '--gapextend=' + str(gapextend),
                         '-M'] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID ' + \
                              adaID + ', part ' + str(j + 1) + ' of ' + str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder, adaID, type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder, adaID, type='bam',
                                                        unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder, adaID, type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
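# The name-sort above matters because downstream code consumes reads two at a
# time: after sorting by read name, the two mates of a pair are adjacent in
# the BAM. A sketch of that consumption pattern (the real pair_generator
# helper lives elsewhere in the package; this is only the idea):
def _example_pair_generator(bamfile):
    '''Sketch of pairwise iteration over a name-sorted BAM'''
    it = iter(bamfile)
    for read1 in it:
        read2 = next(it)  # mate is adjacent thanks to the name sort
        yield (read1, read2)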
                      VERBOSE=VERBOSE, threads=threads,
                      reference=refname, summary=summary,
                      trimmed=use_trimmed,
                      subsrate=subsrate,
                      gapopen=gapopen,
                      gapextend=gapextend,
                      maxreads=maxreads)
            continue

        make_output_folders(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'w') as f:
                outstr = ('Call: python premap_to_reference.py'
                          ' --run ' + seq_run +
                          ' --adaIDs ' + adaID +
                          ' --threads ' + str(threads) +
                          ' --reference ' + refname +
                          ' --subsrate ' + str(subsrate) +
                          ' --gapopen ' + str(gapopen) +
                          ' --gapextend ' + str(gapextend) +
                          ' --verbose ' + str(VERBOSE))
                if maxreads != -1:
                    outstr = outstr + ' --maxreads ' + str(maxreads)
                if use_trimmed:
                    outstr = outstr + ' --trimmed'
                outstr = outstr + '\n'
                f.write(outstr)
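# The summary line above records the equivalent command-line call. A typical
# invocation (run name, adapter ID, and reference name are hypothetical)
# would look like:
#
#     python premap_to_reference.py --run Tue28 --adaIDs TS2 \
#         --threads 4 --reference HXB2 --subsrate 0.05 \
#         --gapopen 40 --gapextend 3 --verbose 1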