def split_reads(data_folder, adaID, fragment, chunk_size=10000, maxreads=-1, VERBOSE=0): '''Split reads into chunks for mapping''' input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam') with pysam.Samfile(input_filename, 'rb') as bamfile: if VERBOSE: if maxreads == -1: n_reads = get_number_reads_open(bamfile) // 2 else: n_reads = maxreads print 'Expected number of chunks:', 1 + (n_reads // chunk_size) chunk_number = 0 chunkfile = None for irp, read_pair in enumerate(pair_generator(bamfile)): if irp == maxreads: break if VERBOSE >= 2: if not ((irp + 1) % 10000): print irp + 1 if not (irp % chunk_size): if chunkfile is not None: chunkfile.close() chunk_number += 1 chunk_filename = get_divided_filename(data_folder, adaID, fragment, type='bam', chunk=chunk_number) chunkfile = pysam.Samfile(chunk_filename, 'wb', template=bamfile) if VERBOSE >= 2: print 'Chunk n', chunk_number, 'started' chunkfile.write(read_pair[0]) chunkfile.write(read_pair[1]) if chunkfile is not None: chunkfile.close() if VERBOSE: print 'Chunking finished'
def check_division(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0, minor_allele=False):
    '''Check division into fragments: coverage, etc.'''
    ref_fn = get_reference_premap_filename(data_folder, adaID, fragment)
    # FIXME: old nomenclature for F3a
    if (not os.path.isfile(ref_fn)) and (fragment[:2] == 'F3'):
        ref_fn = ref_fn.replace('F3a', 'F3')
    refseq = SeqIO.read(ref_fn, 'fasta')

    # Scan reads
    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    # FIXME: old nomenclature for F3a
    if (not os.path.isfile(input_filename)) and (fragment[:2] == 'F3'):
        input_filename = input_filename.replace('F3a', 'F3')
    counts, inserts = get_allele_counts_insertions_from_file(input_filename,
                                                             len(refseq),
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results: build a 'key value' comma-separated title
    labels = [['run', seq_run],
              ['adaID', adaID],
              ['fragment', fragment],
              ['maxreads', maxreads]]
    title = ', '.join(' '.join([key, str(value)]) for key, value in labels)
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def discard_nondivided_samples(self):
    '''Discard samples that have no divided reads (e.g. SA, random hexamers)'''
    import os
    from hivwholeseq.sequencing.filenames import get_divided_filename

    # Build a boolean mask: True where the divided BAM of the sample's
    # first complete region exists on disk
    keep = []
    for sample in self.itersamples():
        frag = sample.regions_complete[0]
        keep.append(os.path.isfile(
            get_divided_filename(self.folder, sample.adapter, frag)))

    # Boolean indexing keeps only the divided samples
    self.samples = self.samples.loc[keep]
def discard_nondivided_samples(self):
    '''Discard samples that have no divided reads (e.g. SA, random hexamers)'''
    import os
    from hivwholeseq.sequencing.filenames import get_divided_filename

    # Boolean mask over self.samples, one entry per sample in iteration order
    ind = []
    for sample in self.itersamples():
        # Presence of the divided BAM for the first complete region is used
        # as the proxy for "this sample has divided reads"
        frag = sample.regions_complete[0]
        div = os.path.isfile(
            get_divided_filename(self.folder, sample.adapter, frag))
        ind.append(div)
    # Keep only samples whose divided file exists (pandas boolean .loc)
    self.samples = self.samples.loc[ind]
def split_reads(data_folder, adaID, fragment, chunk_size=10000, maxreads=-1, VERBOSE=0):
    '''Split reads into chunks for mapping

    Reads the divided BAM for (adaID, fragment) and writes consecutive
    chunk BAM files of chunk_size read pairs each (1-based chunk numbers).
    maxreads == -1 means "all read pairs"; VERBOSE controls progress output.
    '''
    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        if VERBOSE:
            # Reads come in pairs, hence the division by two
            if maxreads == -1:
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads
            print 'Expected number of chunks:', 1 + (n_reads // chunk_size)

        chunk_number = 0
        chunkfile = None
        for irp, read_pair in enumerate(pair_generator(bamfile)):
            # Stop early when a read-pair cap was requested
            if irp == maxreads:
                break

            # Progress indicator every 10000 pairs
            if VERBOSE >= 2:
                if not ((irp+1) % 10000):
                    print irp+1

            # Open a new chunk file every chunk_size pairs,
            # closing the previous one first
            if not (irp % chunk_size):
                if chunkfile is not None:
                    chunkfile.close()
                chunk_number += 1
                chunk_filename = get_divided_filename(data_folder, adaID, fragment,
                                                      type='bam', chunk=chunk_number)
                chunkfile = pysam.Samfile(chunk_filename, 'wb', template=bamfile)
                if VERBOSE >= 2:
                    print 'Chunk n', chunk_number, 'started'

            # Both mates of the pair go into the current chunk
            chunkfile.write(read_pair[0])
            chunkfile.write(read_pair[1])

        # Close the last (possibly partial) chunk
        if chunkfile is not None:
            chunkfile.close()

    if VERBOSE:
        print 'Chunking finished'
def get_input_filename(data_folder, adaID, frag_spec, type='bam', only_chunk=None, filtered=True):
    '''Get filename of input for mapping to initial reference

    Parameters:
       data_folder: root folder of the sequencing data
       adaID: adapter ID of the sample
       frag_spec: fragment specification (e.g. 'F3a'); only the first two
          characters (the generic fragment name) are used when filtered=True
       type: file type to request from the filename helpers (default 'bam').
          NOTE: this parameter shadows the builtin `type`; kept for backward
          compatibility. BUG FIX: it used to be ignored (both branches
          hard-coded 'bam'); it is now passed through, with the same default.
       only_chunk: chunk number for the divided (unfiltered) file, or None
       filtered: if True, use reads filtered after mapping to the
          auto-consensus; otherwise use the divided reads

    Returns: the filename (string) of the input reads for mapping.
    '''
    # We should take reads filtered after mapping to the auto-consensus
    if filtered:
        from hivwholeseq.sequencing.filenames import get_mapped_filename
        frag_gen = frag_spec[:2]
        fn = get_mapped_filename(data_folder, adaID, frag_gen,
                                 type=type, filtered=True)
    else:
        from hivwholeseq.sequencing.filenames import get_divided_filename
        fn = get_divided_filename(data_folder, adaID, frag_spec,
                                  type=type, chunk=only_chunk)
    return fn
def check_division(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0, minor_allele=False):
    '''Check division into fragments: coverage, etc.

    Loads the premap reference and the divided BAM for (adaID, fragment),
    computes allele counts/insertions, and plots the coverage.
    NOTE(review): qual_min and reference are currently unused in this body;
    presumably kept for interface compatibility with sibling functions.
    '''
    ref_fn = get_reference_premap_filename(data_folder, adaID, fragment)
    # FIXME: old nomenclature for F3a
    if not os.path.isfile(ref_fn):
        if fragment[:2] == 'F3':
            ref_fn = ref_fn.replace('F3a', 'F3')
    refseq = SeqIO.read(ref_fn, 'fasta')

    # Scan reads
    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
    # FIXME: old nomenclature for F3a
    if not os.path.isfile(input_filename):
        if fragment[:2] == 'F3':
            input_filename = input_filename.replace('F3a', 'F3')
    counts, inserts = get_allele_counts_insertions_from_file(input_filename,
                                                             len(refseq),
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results: title is a comma-separated list of 'key value' pairs
    title = ', '.join(
        map(lambda x: ' '.join([x[0], str(x[1])]), [
            ['run', seq_run],
            ['adaID', adaID],
            ['fragment', fragment],
            ['maxreads', maxreads],
        ]))
    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
f.write('\n') if VERBOSE: print seq_run, adaID, fragment if fragment == 'genomewide': refseq = SeqIO.read( get_reference_premap_filename(data_folder, adaID), 'fasta') bamfilename = get_premapped_filename(data_folder, adaID, type='bam') frag_out = fragment else: fn = get_reference_premap_filename(data_folder, adaID, fragment) bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam') #FIXME: old nomenclature for F3a is F3 if not os.path.isfile(fn) and fragment[:3] == 'F3a': fn = get_reference_premap_filename(data_folder, adaID, 'F3' + fragment[-1]) if not os.path.isfile(bamfilename) and fragment[:3] == 'F3a': bamfilename = get_divided_filename(data_folder, adaID, 'F3' + fragment[-1], type='bam') refseq = SeqIO.read(fn, 'fasta') frag_out = fragment[:2]
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy

    Maps the divided reads of (adaID, fragment) against the fragment
    consensus with stampy, either in-process (threads == 1) or by
    submitting SGE cluster jobs via qsub (threads > 1), then merges,
    sorts by read name, and reheaders the resulting BAM.
    dry=True builds the command line(s) without executing the mapping.
    '''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID,
                                                    frag_gen, rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    # NOTE(review): subsrate is a module-level global; it is only reassigned
    # in the rescue branch, otherwise the module default is used -- confirm
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    # Select the input BAM: divided reads, or the 'unmapped' pool for rescues
    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existance of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        # NOTE(review): samplename is not defined in this function; this line
        # raises NameError instead of ValueError unless a module-level global
        # named samplename exists -- verify
        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Build the stampy command line
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # intead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        # Stringify everything (gap penalties are ints); Python 2 map -> list
        call_list = map(str, call_list)
        if VERBOSE >=2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            # Convert the SAM output to BAM (helper derives the SAM name)
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
            return

    else:
        # Submit one qsub job per thread; each job maps a 1/threads slice
        # of the input via stampy's --processpart
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):
            # Get output filename for this part
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1), rescue=rescue)
            # Build the qsub + stampy command line
            call_list = ['qsub','-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]

            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                # qsub prints e.g. 'Your job <ID> ...'; the ID is the 3rd token
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output: poll qstat until every submitted job has left the queue
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1), rescue=rescue)
                             for j in xrange(threads)]
        time_wait = 10 # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
            if len(qstat_output) < 3:
                # Fewer than 3 lines means no job rows (2-line header): all done
                jobs_done[:] = True
                break
            else:
                # Keep only the job-ID column of the job rows
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10 # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Job finished: convert its SAM part to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                              unsorted=True, rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam', unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE, rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy

    Runs stampy on the divided reads of (adaID, fragment): in-process when
    threads == 1, otherwise as SGE jobs submitted with qsub, followed by
    BAM concatenation, name-sorting, and reheadering.
    dry=True only builds the command line(s) and returns.
    '''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID,
                                                    frag_gen, rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    # NOTE(review): subsrate is a module-level global, only reassigned for
    # rescues; the module default is used otherwise -- confirm
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: ' + adaID + ' ' + frag_gen

    # Select the input BAM: divided reads, or the 'unmapped' pool for rescues
    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existance of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        # NOTE(review): samplename is not defined in this function; unless a
        # module-level global exists this line raises NameError -- verify
        raise ValueError(samplename + ', fragment ' + fragment +
                         ': input file not found.')

    # parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Build the stampy command line
        call_list = [
            stampy_bin, '-g',
            get_index_file(data_folder, adaID, frag_gen, ext=False), '-h',
            get_hash_file(data_folder, adaID, frag_gen, ext=False), '-o',
            output_filename, '--overwrite', '--substitutionrate=' + subsrate,
            '--gapopen', stampy_gapopen, '--gapextend', stampy_gapextend
        ]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # intead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        # Stringify everything (gap penalties are ints); Python 2 map -> list
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            # Convert the SAM output to BAM (helper derives the SAM name)
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
            return

    else:
        # Submit one qsub job per thread; each maps a 1/threads slice of the
        # input via stampy's --processpart
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):
            # Get output filename for this part
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j + 1),
                                                  rescue=rescue)
            # Build the qsub + stampy command line
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N',
                'm' + adaID.replace('-', '') + frag_gen + str(j + 1), '-l',
                'h_rt=' + cluster_time, '-l', 'h_vmem=' + vmem, stampy_bin,
                '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                '-o', output_filename, '--overwrite',
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--substitutionrate=' + subsrate, '--gapopen', stampy_gapopen,
                '--gapextend', stampy_gapextend
            ]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]

            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                # qsub prints e.g. 'Your job <ID> ...'; the ID is the 3rd token
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output: poll qstat until every submitted job has left the queue
        output_file_parts = [
            get_mapped_filename(data_folder, adaID, frag_gen,
                                type='bam', part=(j + 1), rescue=rescue)
            for j in xrange(threads)
        ]
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                # Fewer than 3 lines means no job rows (2-line header): all done
                jobs_done[:] = True
                break
            else:
                # Keep only the job-ID column of the job rows
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Job finished: convert its SAM part to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                            adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                            str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='bam', unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID,
                                                     frag_gen, type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID + ', fragment ' + frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE,
                            rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
' --block-length '+str(block_len_initial)+\ ' --reads-per-alignment '+str(n_reads_per_ali)+\ ' --verbose '+str(VERBOSE)) if store_allele_counts: f.write(' --allele-counts') f.write('\n') if VERBOSE: print seq_run, adaID, fragment if fragment == 'genomewide': refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta') bamfilename = get_premapped_filename(data_folder, adaID, type='bam') frag_out = fragment else: fn = get_reference_premap_filename(data_folder, adaID, fragment) bamfilename = get_divided_filename(data_folder, adaID, fragment, type='bam') #FIXME: old nomenclature for F3a is F3 if not os.path.isfile(fn) and fragment[:3] == 'F3a': fn = get_reference_premap_filename(data_folder, adaID, 'F3'+fragment[-1]) if not os.path.isfile(bamfilename) and fragment[:3] == 'F3a': bamfilename = get_divided_filename(data_folder, adaID, 'F3'+fragment[-1], type='bam') refseq = SeqIO.read(fn, 'fasta') frag_out = fragment[:2] consensus = build_consensus(bamfilename, len(refseq), VERBOSE=VERBOSE, block_len_initial=block_len_initial, reads_per_alignment=n_reads_per_ali, accept_holes=(fragment == 'genomewide'), store_allele_counts=store_allele_counts)