def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''

    if maxreads <= 0:
        maxreads = int(1e6)

    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          type='bam', filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over read pairs (insert size is a property of the pair)
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print i + 1

            # If unmapped or not properly paired, discard
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue

            # Store insert size (the fwd read carries the positive isize);
            # index by n_written, not i, because discarded pairs leave gaps
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
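# A minimal usage sketch for the function above: the folder and adapter ID
# below are hypothetical placeholders, not paths from this repository.
# np.histogram returns a (frequencies, bin_edges) tuple, hence the unpacking.
def example_plot_insert_size_distribution(data_folder='data/', adaID='TS2'):
    '''Sketch: plot the insert size histogram of premapped reads'''
    import matplotlib.pyplot as plt

    isz, (freqs, edges) = get_insert_size_distribution(data_folder, adaID,
                                                       'premapped',
                                                       bins=np.arange(0, 800, 20),
                                                       maxreads=10000)
    # freqs is a density because density=True by default; widths from edges
    plt.bar(edges[:-1], freqs, width=np.diff(edges), align='edge')
    plt.xlabel('Insert size [bp]')
    plt.ylabel('Density')
    plt.title('Insert sizes, adaID ' + adaID)
    plt.show()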
def make_output_folders(data_folder, adaID, VERBOSE=0, summary=True):
    '''Make output folders'''
    from hivwholeseq.utils.generic import mkdirs

    outfiles = [get_premapped_filename(data_folder, adaID)]
    if summary:
        outfiles.append(get_coverage_figure_filename(data_folder, adaID,
                                                     'premapped'))

    for outfile in outfiles:
        dirname = os.path.dirname(outfile)
        mkdirs(dirname)
        if VERBOSE:
            print 'Folder created:', dirname
def report_coverage(data_folder, adaID, VERBOSE=0, summary=True):
    '''Produce a report on rough coverage on reference (ignore inserts)'''
    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')

    # Prepare data structures
    coverage = np.zeros(len(refseq), int)

    # Parse the BAM file
    unmapped = 0
    mapped = 0
    bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for read in bamfile:
            if read.is_unmapped or (not read.is_proper_pair) or \
               (not len(read.cigar)):
                unmapped += 1
                continue

            # Proceed along CIGARs: only matches (0) and deletions (2)
            # consume reference; treat deletions as 'covered'
            ref_pos = read.pos
            for (bt, bl) in read.cigar:
                if bt not in (0, 2):
                    continue
                coverage[ref_pos: ref_pos + bl] += 1
                ref_pos += bl
            mapped += 1

    # Plot and save results
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(13, 6))
    ax.plot(np.arange(len(refseq)), coverage + 1, lw=2, c='b')
    ax.set_xlabel('Position')
    ax.set_ylabel('Coverage')
    ax.set_yscale('log')
    ax.set_title('adaID ' + adaID + ', premapped', fontsize=18)
    ax.set_xlim(-20, len(refseq) + 20)
    plt.tight_layout()

    from hivwholeseq.utils.generic import mkdirs
    from hivwholeseq.sequencing.filenames import get_figure_folder
    mkdirs(get_figure_folder(data_folder, adaID))
    plt.savefig(get_coverage_figure_filename(data_folder, adaID, 'premapped'))
    plt.close(fig)

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\nPremapping results: '+\
                    str(mapped)+' reads mapped, '+str(unmapped)+' unmapped\n')
            f.write('\nCoverage plotted: '+\
                    get_coverage_figure_filename(data_folder, adaID, 'premapped')+'\n')
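# The CIGAR walk above in isolation, as a sketch: BAM op codes 0 (match) and
# 2 (deletion) consume reference positions, while insertions (1) and soft
# clips (4) do not, so ref_pos is not advanced for them. The toy CIGAR and
# positions below are hypothetical.
def example_coverage_from_cigar(cigar=((4, 5), (0, 90), (1, 2), (0, 8)),
                                pos=100, ref_len=300):
    '''Sketch: accumulate coverage from a single read's CIGAR'''
    coverage = np.zeros(ref_len, int)
    ref_pos = pos
    for (bt, bl) in cigar:
        if bt not in (0, 2):
            continue
        coverage[ref_pos: ref_pos + bl] += 1
        ref_pos += bl
    return coverage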
def remove_premapped_tempfiles(data_folder, adaID, VERBOSE=0):
    '''Remove the part files of multi-threaded premapping'''
    from hivwholeseq.sequencing.filenames import get_premapped_filename

    dirname = os.path.dirname(get_premapped_filename(data_folder, adaID,
                                                     type='bam', part=1)) + '/'
    fns = glob.glob(dirname + 'premapped_*part*') + \
          glob.glob(dirname + 'premapped_*unsorted*')
    fns.append(dirname + 'premapped.sam')
    for fn in fns:
        os.remove(fn)
        if VERBOSE >= 3:
            print 'File removed:', fn
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True,
                  maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' + input_filenames[0])

    # Parallelize if requested
    if threads == 1:
        call_list = [stampy_bin,
                     '--overwrite',
                     '-g', get_reference_premap_index_filename(data_folder, adaID,
                                                               ext=False),
                     '-h', get_reference_premap_hash_filename(data_folder, adaID,
                                                              ext=False),
                     '-o', get_premapped_filename(data_folder, adaID, type='sam'),
                     '--insertsize=450',
                     '--insertsd=100',
                     '--substitutionrate=' + str(subsrate),
                     '--gapopen=' + str(gapopen),
                     '--gapextend=' + str(gapextend)]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:
        # Multithreading works as follows: call qsub + stampy, monitor the job
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [get_premapped_filename(data_folder, adaID,
                                                    type='bam', part=(j + 1))
                             for j in xrange(threads)]

        # Submit map jobs
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', adaID + ' p' + str(j + 1),
                         '-l', 'h_rt=' + cluster_time[threads >= 30],
                         '-l', 'h_vmem=' + vmem,
                         stampy_bin,
                         '--overwrite',
                         '-g', get_reference_premap_index_filename(data_folder,
                                                                   adaID, ext=False),
                         '-h', get_reference_premap_hash_filename(data_folder,
                                                                  adaID, ext=False),
                         '-o', get_premapped_filename(data_folder, adaID,
                                                      type='sam', part=(j + 1)),
                         '--processpart=' + str(j + 1) + '/' + str(threads),
                         '--insertsize=450',
                         '--insertsd=100',
                         '--substitutionrate=' + str(subsrate),
                         '--gapopen=' + str(gapopen),
                         '--gapextend=' + str(gapextend),
                         '-M'] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                              adaID+', part '+str(j+1)+' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder, adaID, type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to allow for pair_generator downstream)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder, adaID,
                                                        type='bam', unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without going BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder, adaID,
                                                 type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

        if VERBOSE >= 1:
            print 'Remove temporary files: adaID ' + adaID
        remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Temp premapping files removed.\n')
                f.write('\n')
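# A sketch of the premapping call order implied by the functions above
# (single-thread path). The run folder and adapter ID are hypothetical
# placeholders for whatever the sequencing run actually uses.
def example_premap_pipeline(data_folder='data/run42/', adaID='TS2'):
    '''Sketch: create folders, premap with stampy, then report coverage'''
    make_output_folders(data_folder, adaID, VERBOSE=1)
    premap_stampy(data_folder, adaID, VERBOSE=1, threads=1, maxreads=100000)
    report_coverage(data_folder, adaID, VERBOSE=1)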
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0,
                          minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
              ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, with and without primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                        ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')

    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs pose no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'],
                                                          'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'],
                                                          'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments,
                                             type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 10000):
                        print irp + 1

                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, too short, with too small an insert
                # size, or a divergent read pair (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads,
                                     primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragment is possible (e.g. one read crosses a
                # fragment boundary, or the reads map to different fragments),
                # dump the pair into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5,
                # F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #    --------------->
                #        <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()

    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp + 1
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
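# A usage sketch for trim_and_divide_reads. The folder, adapter ID, cycle
# count and fragment list below are hypothetical placeholders; the trailing
# 'i' in the fragment names marks inner primers, as assumed by the
# outer-primer detection above.
def example_divide(data_folder='data/run42/', adaID='TS2'):
    '''Sketch: trim primers and split premapped reads by fragment'''
    fragments = ['F1i', 'F2i', 'F3ai', 'F4i', 'F5bi', 'F6i']
    trim_and_divide_reads(data_folder, adaID, n_cycles=500,
                          fragments=fragments,
                          maxreads=-1, VERBOSE=1, minisize=100)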
            f.write('Call: python build_consensus.py'+\
                    ' --run '+seq_run+\
                    ' --adaIDs '+adaID+\
                    ' --fragments '+fragment+\
                    ' --block-length '+str(block_len_initial)+\
                    ' --reads-per-alignment '+str(n_reads_per_ali)+\
                    ' --verbose '+str(VERBOSE))
            if store_allele_counts:
                f.write(' --allele-counts')
            f.write('\n')

    if VERBOSE:
        print seq_run, adaID, fragment

    if fragment == 'genomewide':
        refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                            'fasta')
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
        frag_out = fragment
    else:
        fn = get_reference_premap_filename(data_folder, adaID, fragment)
        bamfilename = get_divided_filename(data_folder, adaID, fragment,
                                           type='bam')

        # FIXME: old nomenclature for F3a is F3
        if not os.path.isfile(fn) and fragment[:3] == 'F3a':
            fn = get_reference_premap_filename(data_folder, adaID,
                                               'F3'+fragment[-1])
        if not os.path.isfile(bamfilename) and fragment[:3] == 'F3a':
            bamfilename = get_divided_filename(data_folder, adaID,
                                               'F3'+fragment[-1], type='bam')

        refseq = SeqIO.read(fn, 'fasta')
        frag_out = fragment[:2]

    consensus = build_consensus(bamfilename, len(refseq), VERBOSE=VERBOSE,
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10,
                 maxreads=-1, VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  # header
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)

        # NOTE: in a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T
    else:
        frags_pos = None
    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename,
        len(refseq),
        qual_min=qual_min,
        match_len_min=match_len_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results (maxreads <= 0 means all reads were used)
    if title is None:
        n_reads_used = n_reads if maxreads <= 0 else min(maxreads, n_reads)
        title = ', '.join(['run '+seq_run+' '+adaID,
                           'sample '+samplename,
                           'reads '+str(n_reads_used)+'/'+str(n_reads),
                          ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder+foldername_adapter(adaID)+
                    'figures/coverage_premapped_'+samplename+'.png')

    return (counts, inserts)
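# A usage sketch for check_premap. The folder, run, sample and fragment names
# are hypothetical placeholders, and summing the first two axes of counts to
# get per-position coverage is an assumption about the layout returned by
# get_allele_counts_insertions_from_file_unfiltered.
def example_check_premap(data_folder='data/run42/', adaID='TS2'):
    '''Sketch: plot premapped coverage and print a rough average'''
    counts, inserts = check_premap(data_folder, adaID,
                                   fragments=['F1', 'F2', 'F3', 'F4', 'F5', 'F6'],
                                   seq_run='run42', samplename='sample1',
                                   maxreads=10000, VERBOSE=1)
    if counts is not None:
        print 'Mean coverage:', counts.sum(axis=0).sum(axis=0).mean()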