def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True): '''Make index and hash files for reference or consensus''' if VERBOSE: print 'Making index and hash files: adaID', adaID # 1. Make genome index file for reference if os.path.isfile(get_reference_premap_index_filename(data_folder, adaID, ext=True)): os.remove(get_reference_premap_index_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([stampy_bin, '--species="HIV"', '--overwrite', '-G', get_reference_premap_index_filename(data_folder, adaID, ext=False), get_reference_premap_filename(data_folder, adaID), ], stderr=sp.STDOUT) if VERBOSE: print 'Built index: '+adaID # 2. Build a hash file for reference if os.path.isfile(get_reference_premap_hash_filename(data_folder, adaID, ext=True)): os.remove(get_reference_premap_hash_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-H', get_reference_premap_hash_filename(data_folder, adaID, ext=False), ], stderr=sp.STDOUT) if VERBOSE: print 'Built hash: '+adaID if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\n') f.write('Stampy index and hash written.') f.write('\n')
def make_index_and_hash(data_folder, adaID, VERBOSE=0, summary=True): '''Make index and hash files for reference or consensus''' if VERBOSE: print 'Making index and hash files: adaID', adaID # 1. Make genome index file for reference if os.path.isfile( get_reference_premap_index_filename(data_folder, adaID, ext=True)): os.remove( get_reference_premap_index_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([ stampy_bin, '--species="HIV"', '--overwrite', '-G', get_reference_premap_index_filename(data_folder, adaID, ext=False), get_reference_premap_filename(data_folder, adaID), ], stderr=sp.STDOUT) if VERBOSE: print 'Built index: ' + adaID # 2. Build a hash file for reference if os.path.isfile( get_reference_premap_hash_filename(data_folder, adaID, ext=True)): os.remove( get_reference_premap_hash_filename(data_folder, adaID, ext=True)) stdout = sp.check_output([ stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-H', get_reference_premap_hash_filename(data_folder, adaID, ext=False), ], stderr=sp.STDOUT) if VERBOSE: print 'Built hash: ' + adaID if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\n') f.write('Stampy index and hash written.') f.write('\n')
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True, maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3): '''Call stampy for actual mapping''' if VERBOSE: print 'Premapping: adaID ', adaID if summary: summary_filename = get_premap_summary_filename(data_folder, adaID) # Stampy can handle both gzipped and uncompressed fastq inputs input_filenames = get_read_filenames(data_folder, adaID, gzip=True) if not os.path.isfile(input_filenames[0]): input_filenames = get_read_filenames(data_folder, adaID, gzip=False) if not all(map(os.path.isfile, input_filenames)): raise OSError('Input files for mapping not found: ' + input_filenames[0]) # parallelize if requested if threads == 1: call_list = [ stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False), '-o', get_premapped_filename(data_folder, adaID, type='sam'), '--insertsize=450', '--insertsd=100', '--substitutionrate=' + str(subsrate), '--gapopen=' + str(gapopen), '--gapextend=' + str(gapextend), ] if maxreads > 0: call_list.append('--numrecords=' + str(maxreads)) call_list.extend(['-M'] + input_filenames) call_list = map(str, call_list) if VERBOSE >= 2: print ' '.join(call_list) sp.call(call_list) if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\nStampy premapped (single thread).\n') # Convert to compressed BAM convert_sam_to_bam( get_premapped_filename(data_folder, adaID, type='bam')) if summary: with open(summary_filename, 'a') as f: f.write('\nSAM file converted to compressed BAM: '+\ get_premapped_filename(data_folder, adaID, type='bam')+'\n') else: # Multithreading works as follows: call qsub + stampy, monitor the process # IDs with qstat at regular intervals, and finally merge results with pysam output_file_parts = [ get_premapped_filename(data_folder, adaID, type='bam', part=(j + 1)) for j in xrange(threads) ] # Submit map script jobs_done = np.zeros(threads, bool) job_IDs = np.zeros(threads, 'S30') # Submit map call import hivwholeseq JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/' JOBLOGOUT = JOBDIR + 'logout' JOBLOGERR = JOBDIR + 'logerr' cluster_time = ['23:59:59', '1:59:59'] vmem = '8G' for j in xrange(threads): call_list = [ 'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT, '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l', 'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem, stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename( data_folder, adaID, ext=False), '-h', get_reference_premap_hash_filename( data_folder, adaID, ext=False), '-o', get_premapped_filename( data_folder, adaID, type='sam', part=(j + 1)), '--processpart=' + str(j + 1) + '/' + str(threads), '--insertsize=450', '--insertsd=100', '--substitutionrate=' + str(subsrate), '--gapopen=' + str(gapopen), '--gapextend=' + str(gapextend), '-M' ] + input_filenames call_list = map(str, call_list) if VERBOSE >= 2: print ' '.join(call_list) job_ID = sp.check_output(call_list) job_ID = job_ID.split()[2] job_IDs[j] = job_ID # Monitor output time_wait = 10 # secs while not jobs_done.all(): # Sleep some time time.sleep(time_wait) # Get the output of qstat to check the status of jobs qstat_output = sp.check_output(['qstat']) qstat_output = qstat_output.split( '\n')[:-1] # The last is an empty line if VERBOSE >= 3: print qstat_output if len(qstat_output) < 3: jobs_done[:] = True break else: qstat_output = [line.split()[0] for line in qstat_output[2:]] time_wait = 10 # secs for j in xrange(threads): if jobs_done[j]: continue if job_IDs[j] not in qstat_output: # Convert to BAM for merging if VERBOSE >= 1: print 'Convert premapped reads to BAM for merging: adaID '+\ adaID+', part '+str(j+1)+ ' of '+ \ str(threads) convert_sam_to_bam(output_file_parts[j]) # We do not need to wait if we did the conversion (it takes # longer than some secs) time_wait = 0 jobs_done[j] = True if summary: with open(summary_filename, 'a') as f: f.write('Stampy premapped (' + str(threads) + ' threads).\n') # Concatenate output files if VERBOSE >= 1: print 'Concatenate premapped reads: adaID ' + adaID + '...', output_filename = get_premapped_filename(data_folder, adaID, type='bam', unsorted=True) pysam.cat('-o', output_filename, *output_file_parts) if VERBOSE >= 1: print 'done.' if summary: with open(summary_filename, 'a') as f: f.write('BAM files concatenated (unsorted).\n') # Sort the file by read names (to ensure the pair_generator) # NOTE: we exclude the extension and the option -f because of a bug in samtools if VERBOSE >= 1: print 'Sort premapped reads: adaID ' + adaID output_filename_sorted = get_premapped_filename(data_folder, adaID, type='bam', unsorted=False) pysam.sort('-n', output_filename, output_filename_sorted[:-4]) if summary: with open(summary_filename, 'a') as f: f.write('Joint BAM file sorted.\n') # Reheader the file without BAM -> SAM -> BAM if VERBOSE >= 1: print 'Reheader premapped reads: adaID ' + adaID header_filename = get_premapped_filename(data_folder, adaID, type='sam', part=1) pysam.reheader(header_filename, output_filename_sorted) if summary: with open(summary_filename, 'a') as f: f.write('Joint BAM file reheaded.\n') if VERBOSE >= 1: print 'Remove temporary files: adaID ' + adaID remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE) if summary: with open(summary_filename, 'a') as f: f.write('Temp premapping files removed.\n') f.write('\n')
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True, maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3): '''Call stampy for actual mapping''' if VERBOSE: print 'Premapping: adaID ', adaID if summary: summary_filename = get_premap_summary_filename(data_folder, adaID) # Stampy can handle both gzipped and uncompressed fastq inputs input_filenames = get_read_filenames(data_folder, adaID, gzip=True) if not os.path.isfile(input_filenames[0]): input_filenames = get_read_filenames(data_folder, adaID, gzip=False) if not all(map(os.path.isfile, input_filenames)): raise OSError('Input files for mapping not found: ' + input_filenames[0]) # parallelize if requested if threads == 1: call_list = [ stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False), '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False), '-o', get_premapped_filename(data_folder, adaID, type='sam'), '--insertsize=450', '--insertsd=100', '--substitutionrate=' + str(subsrate), '--gapopen=' + str(gapopen), '--gapextend=' + str(gapextend), ] if maxreads > 0: call_list.append('--numrecords=' + str(maxreads)) call_list.extend(['-M'] + input_filenames) call_list = map(str, call_list) if VERBOSE >= 2: print ' '.join(call_list) sp.call(call_list) if summary: with open(get_premap_summary_filename(data_folder, adaID), 'a') as f: f.write('\nStampy premapped (single thread).\n') # Convert to compressed BAM convert_sam_to_bam( get_premapped_filename(data_folder, adaID, type='bam')) if summary: with open(summary_filename, 'a') as f: f.write('\nSAM file converted to compressed BAM: '+\ get_premapped_filename(data_folder, adaID, type='bam')+'\n') else: # Multithreading works as follows: call qsub + stampy, monitor the process # IDs with qstat at regular intervals, and finally merge results with pysam output_file_parts = [ get_premapped_filename( data_folder, adaID, type='bam', part=(j + 1)) for j in xrange(threads) ] # Submit map script jobs_done = np.zeros(threads, bool) job_IDs = np.zeros(threads, 'S30') # Submit map call import hivwholeseq JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/' JOBLOGOUT = JOBDIR + 'logout' JOBLOGERR = JOBDIR + 'logerr' cluster_time = ['23:59:59', '1:59:59'] vmem = '8G' for j in xrange(threads): call_list = [ 'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT, '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l', 'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem, stampy_bin, '--overwrite', '-g', get_reference_premap_index_filename( data_folder, adaID, ext=False), '-h', get_reference_premap_hash_filename( data_folder, adaID, ext=False), '-o', get_premapped_filename( data_folder, adaID, type='sam', part=(j + 1)), '--processpart=' + str(j + 1) + '/' + str(threads), '--insertsize=450', '--insertsd=100', '--substitutionrate=' + str(subsrate), '--gapopen=' + str(gapopen), '--gapextend=' + str(gapextend), '-M' ] + input_filenames call_list = map(str, call_list) if VERBOSE >= 2: print ' '.join(call_list) job_ID = sp.check_output(call_list) job_ID = job_ID.split()[2] job_IDs[j] = job_ID # Monitor output time_wait = 10 # secs while not jobs_done.all(): # Sleep some time time.sleep(time_wait) # Get the output of qstat to check the status of jobs qstat_output = sp.check_output(['qstat']) qstat_output = qstat_output.split( '\n')[:-1] # The last is an empty line if VERBOSE >= 3: print qstat_output if len(qstat_output) < 3: jobs_done[:] = True break else: qstat_output = [line.split()[0] for line in qstat_output[2:]] time_wait = 10 # secs for j in xrange(threads): if jobs_done[j]: continue if job_IDs[j] not in qstat_output: # Convert to BAM for merging if VERBOSE >= 1: print 'Convert premapped reads to BAM for merging: adaID '+\ adaID+', part '+str(j+1)+ ' of '+ \ str(threads) convert_sam_to_bam(output_file_parts[j]) # We do not need to wait if we did the conversion (it takes # longer than some secs) time_wait = 0 jobs_done[j] = True if summary: with open(summary_filename, 'a') as f: f.write('Stampy premapped (' + str(threads) + ' threads).\n') # Concatenate output files if VERBOSE >= 1: print 'Concatenate premapped reads: adaID ' + adaID + '...', output_filename = get_premapped_filename( data_folder, adaID, type='bam', unsorted=True) pysam.cat('-o', output_filename, *output_file_parts) if VERBOSE >= 1: print 'done.' if summary: with open(summary_filename, 'a') as f: f.write('BAM files concatenated (unsorted).\n') # Sort the file by read names (to ensure the pair_generator) # NOTE: we exclude the extension and the option -f because of a bug in samtools if VERBOSE >= 1: print 'Sort premapped reads: adaID ' + adaID output_filename_sorted = get_premapped_filename( data_folder, adaID, type='bam', unsorted=False) pysam.sort('-n', output_filename, output_filename_sorted[:-4]) if summary: with open(summary_filename, 'a') as f: f.write('Joint BAM file sorted.\n') # Reheader the file without BAM -> SAM -> BAM if VERBOSE >= 1: print 'Reheader premapped reads: adaID ' + adaID header_filename = get_premapped_filename( data_folder, adaID, type='sam', part=1) pysam.reheader(header_filename, output_filename_sorted) if summary: with open(summary_filename, 'a') as f: f.write('Joint BAM file reheaded.\n') if VERBOSE >= 1: print 'Remove temporary files: adaID ' + adaID remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE) if summary: with open(summary_filename, 'a') as f: f.write('Temp premapping files removed.\n') f.write('\n')