def make_output_folders(pname, samplename, PCR=1, VERBOSE=0):
    '''Make the output folders if necessary for hash and map

    Parameters:
       pname: patient name, passed on to the filename helpers
       samplename: sample name, used to locate the folder of mapped reads
       PCR: forwarded as keyword to get_mapped_to_initial_foldername
       VERBOSE: if true, report the folders on stdout
    '''
    created_msg = 'Folder created:'

    # Resolve both paths before touching the file system
    hash_file = get_initial_hash_filename(pname, 'F0')
    hash_folder = os.path.dirname(hash_file)
    map_folder = get_mapped_to_initial_foldername(pname, samplename, PCR=PCR)

    # The folder of the hash file is only made when it is not there yet
    hash_folder_exists = os.path.isdir(hash_folder)
    if not hash_folder_exists:
        mkdirs(hash_folder)
        if VERBOSE:
            print('%s %s' % (created_msg, hash_folder))

    # The map folder is handed to mkdirs unconditionally
    mkdirs(map_folder)
    if VERBOSE:
        print('%s %s' % (created_msg, map_folder))
def make_index_and_hash(pname, fragment, VERBOSE=0):
    '''Make index and hash files for reference

    Runs stampy twice via sp.check_output (stderr merged into stdout):
    first with -G on the initial reference of the fragment, then with
    -g/-H to build the hash from that index.

    Parameters:
       pname: patient name, passed on to the filename helpers
       fragment: fragment name, e.g. 'F1'
       VERBOSE: if true, report each completed step on stdout
    '''
    # 1. Make genome index file
    # NOTE(review): the double quotes around the species name are part of
    # the argument itself (no shell is involved) -- confirm this is intended.
    index_command = [
        stampy_bin,
        '--overwrite',
        '--species="HIV fragment '+fragment+'"',
        '-G', get_initial_index_filename(pname, fragment, ext=False),
        get_initial_reference_filename(pname, fragment),
    ]
    sp.check_output(index_command, stderr=sp.STDOUT)
    if VERBOSE:
        print('Built index: '+pname+' '+fragment)

    # 2. Build a hash file
    hash_command = [
        stampy_bin,
        '--overwrite',
        '-g', get_initial_index_filename(pname, fragment, ext=False),
        '-H', get_initial_hash_filename(pname, fragment, ext=False),
    ]
    sp.check_output(hash_command, stderr=sp.STDOUT)
    if VERBOSE:
        print('Built hash: '+pname+' '+fragment)
def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)

    One qsub job is submitted per part (stampy --processpart=j/threads). The
    function then polls qstat, converts each part from SAM to BAM as soon as
    its job has left the queue, concatenates the parts, sorts the result by
    read name, reheaders it and removes the temporary files.

    Parameters:
       sample: see the review note in the body, this argument is currently
               overwritten before use
       fragment: fragment name, e.g. 'F1'
       VERBOSE: verbosity level (>= 1 progress messages, >= 2 also the qsub calls)
       threads: number of parts/cluster jobs
       summary: if true, append progress lines to the mapping summary file
       filtered: accepted but not used by this function

    Raises:
       ValueError: if no entry of sample['fragments'] contains the fragment name
    '''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    # Indexed by the boolean (threads >= 10): the second, shorter time limit
    # is requested when 10 or more parts are used
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    # NOTE(review): `patient` and `samplename` are not parameters of this
    # function; they are resolved from an enclosing/module scope and the
    # `sample` argument is shadowed right below. The helper calls also differ
    # from the ones in map_stampy_singlethread (no patient sample name, no
    # PCR). This looks like a leftover of an older signature -- confirm
    # against the callers and helpers before relying on this function.
    pname = patient.id
    sample = patient.sample_table.loc[samplename]
    seq_run = sample['run']
    data_folder = MiSeq_runs[seq_run]['folder']
    adaID = sample['adaID']

    if VERBOSE:
        print('Map via stampy: '+pname+' '+samplename+' '+fragment)

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename, fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = [x for x in sample['fragments'] if fragment in x]
    if not frag_spec:
        raise ValueError(str(patient)+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in range(threads):

        output_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                         type='sam', part=(j+1))
        # Map
        call_list = ['qsub', '-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]
        call_list = [str(arg) for arg in call_list]
        if VERBOSE >= 2:
            print(' '.join(call_list))

        # The third whitespace-separated token of the qsub output is stored
        # as the job ID and later looked up in the qstat listing
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename, fragment,
                                                        type='bam', part=(j+1))
                         for j in range(threads)]
    time_wait = 10 # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line

        # The first two lines are skipped as header; the first column of the
        # remaining lines is taken as the ID of a job still in the queue.
        # A listing without job lines yields an empty list, so that every
        # pending part is still converted to BAM below: leaving the loop
        # early here would skip the conversion and the BAM parts to be
        # concatenated would never be created.
        qstat_output = [line.split()[0] for line in qstat_output[2:]
                        if line.strip()]

        time_wait = 10 # secs
        for j in range(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print('Convert mapped reads to BAM for merging: sample '+
                          samplename+', part '+str(j+1)+' of '+
                          str(threads))
                convert_sam_to_bam(output_file_parts[j])
                # We do not need to wait if we did the conversion (it takes
                # longer than some secs)
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print('Concatenate premapped reads: sample '+samplename)
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read names (to ensure the pair_generator)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                            type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print('Sort mapped reads: sample '+samplename)
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print('Reheader mapped reads: sample '+samplename)
    header_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print('Remove temporary files: sample '+samplename)
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
def map_stampy_singlethread(sample, fragment, VERBOSE=0, n_pairs=-1,
                            summary=True, only_chunk=None, filtered=True):
    '''Map using stampy, single thread (no cluster queueing race conditions)

    Parameters:
       sample: sequenced sample; the code reads sample.patient,
               sample['patient sample'], sample.sequencing_run['folder'],
               sample['adapter'], sample.PCR and sample.regions_complete
       fragment: fragment name, e.g. 'F1'
       VERBOSE: verbosity level (>= 2 also prints the stampy call)
       n_pairs: if positive, map only a subsample extracted with
                extract_mapped_reads_subsample into a temporary BAM file
       summary: if true, append progress lines to the mapping summary file
       only_chunk: forwarded to the filename helpers; temporary files are
                   removed only when it is None
       filtered: forwarded to get_input_filename

    Raises:
       ValueError: if no entry of sample.regions_complete contains the
                   fragment name, or if the input file does not exist
    '''
    # NOTE(review): `samplename` is not defined in this function and is
    # resolved from an enclosing/module scope -- confirm against the caller.
    pname = sample.patient
    samplename_pat = sample['patient sample']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']
    PCR = int(sample.PCR)

    if VERBOSE:
        print('Map via stampy (single thread): '+samplename+' '+fragment)

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = [x for x in sample.regions_complete if fragment in x]
    if not frag_spec:
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (specific fragment for '+fragment+' not found).\n')
        raise ValueError(samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam',
                                        only_chunk=only_chunk, filtered=filtered)

    # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
    if fragment == 'F3' and not os.path.isfile(input_filename):
        input_filename = input_filename.replace('F3a', 'F3')

    # Check existance of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # Extract subsample of reads if requested
    if n_pairs > 0:
        from hivwholeseq.utils.mapping import extract_mapped_reads_subsample
        input_filename_sub = get_mapped_to_initial_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR,
                                                            type='bam')[:-4]+\
                '_unmapped.bam'
        extract_mapped_reads_subsample(input_filename, input_filename_sub,
                                       n_pairs, VERBOSE=VERBOSE)

    # Get output filename
    output_filename = get_mapped_to_initial_filename(pname, samplename_pat,
                                                     samplename, fragment,
                                                     PCR=PCR,
                                                     type='sam',
                                                     only_chunk=only_chunk)

    # Map
    call_list = [stampy_bin,
                 '-g', get_initial_index_filename(pname, fragment, ext=False),
                 '-h', get_initial_hash_filename(pname, fragment, ext=False),
                 '-o', output_filename,
                 '--overwrite',
                 '--substitutionrate='+subsrate,
                 '--gapopen', stampy_gapopen,
                 '--gapextend', stampy_gapextend]
    if stampy_sensitive:
        call_list.append('--sensitive')

    # The reads to map are either the subsample or the full input file
    if n_pairs > 0:
        call_list = call_list + ['-M', input_filename_sub]
    else:
        call_list = call_list + ['-M', input_filename]
    call_list = [str(arg) for arg in call_list]
    if VERBOSE >= 2:
        print(' '.join(call_list))
    # NOTE(review): the exit status of stampy is not checked here -- confirm
    # whether a failed run should abort before the conversion below.
    sp.call(call_list)

    output_filename_bam = get_mapped_to_initial_filename(pname, samplename_pat,
                                                         samplename, fragment,
                                                         type='bam',
                                                         PCR=PCR,
                                                         only_chunk=only_chunk)
    convert_sam_to_bam(output_filename_bam)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped (single thread).\n')

    if only_chunk is None:
        if VERBOSE >= 1:
            print('Remove temporary files: sample '+samplename)
        remove_mapped_init_tempfiles(pname, samplename_pat,
                                     samplename, fragment,
                                     PCR=PCR,
                                     VERBOSE=VERBOSE, only_chunk=only_chunk)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')

    # The subsample is a temporary input, delete it once mapping is done
    if n_pairs > 0:
        os.remove(input_filename_sub)