from argparse import ArgumentParser
from collections import namedtuple
from pathlib import Path

import pysam


def main():
    parser = ArgumentParser(description='Add readgroups based on Illumina IDs '
                                        'to given BAM')
    parser.add_argument('-c', '--cn', required=True, help='Sequencing Center Name')
    parser.add_argument('-l', '--lb', required=True, help='Library')
    parser.add_argument('-s', '--sm', required=True, help='Sample')
    parser.add_argument('bam', help='Input BAM')
    args = parser.parse_args()

    ReadGroup = namedtuple('ReadGroup', 'ID CN LB PL SM',
                           defaults=[args.cn, args.lb, 'ILLUMINA', args.sm])

    in_bam = args.bam
    in_bam_stem = Path(in_bam).stem
    # Intermediate BAM with readgroup tagged reads, but no readgroups in header
    temp_bam = in_bam_stem + '.temp.bam'
    # Header file with readgroups
    header_sam = in_bam_stem + '.header.sam'
    # Final BAM
    final_bam = in_bam_stem + '.readgroupsadded.bam'

    readgroups = dict()
    header_dict = dict()

    # Add readgroup tag to each read and create readgroups for header
    with pysam.AlignmentFile(in_bam, 'rb') as in_bam_fh, \
            pysam.AlignmentFile(temp_bam, 'wb', template=in_bam_fh) as temp_bam_fh:
        header_dict = in_bam_fh.header.to_dict()
        for read in in_bam_fh.fetch(until_eof=True):
            readgroup_id = '.'.join(read.query_name.split(':')[2:4])
            read.set_tag('RG', readgroup_id, 'Z')
            temp_bam_fh.write(read)
            if readgroup_id not in readgroups:
                readgroups[readgroup_id] = ReadGroup(readgroup_id)

    # Add readgroups to header
    header_dict['RG'] = [readgroup._asdict() for readgroup in readgroups.values()]

    # Write updated header to file
    header_sam_fh = pysam.AlignmentFile(header_sam, 'wh', header=header_dict)
    header_sam_fh.close()

    # Create new BAM with readgroups in header and read tags
    Path(final_bam).touch()
    pysam.reheader(header_sam, temp_bam, save_stdout=final_bam)

    # Delete intermediate BAM and header file
    Path(temp_bam).unlink()
    Path(header_sam).unlink()
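A quick illustration of the read-group ID derivation used above; the read name is a made-up Illumina-style example, not taken from real data:

# Illumina read names are colon-separated: instrument:run:flowcell:lane:tile:x:y,
# so fields 2 and 3 (flowcell ID and lane) identify the read group.
qname = 'M00123:55:000000000-A1B2C:1:1101:15589:1327'   # hypothetical read name
readgroup_id = '.'.join(qname.split(':')[2:4])
print(readgroup_id)  # -> '000000000-A1B2C.1'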
import tempfile

import pysam


def reheader_and_rename_chromosomes(in_bam_file, out_bam_file, replacements):
    with pysam.Samfile(in_bam_file) as f:
        h = str(f.header)

    org_header = h
    for a, b in replacements.items():
        h = h.replace(f"SN:{a}", f"SN:{b}")
    if h == org_header:
        raise ValueError("No replacement happened")

    tf = tempfile.NamedTemporaryFile()
    tf.write(h.encode("utf-8"))
    tf.flush()

    out_bam_file.write_text("")  # must be there for save_stdout to work..
    pysam.reheader(
        tf.name,
        str(in_bam_file.absolute()),
        save_stdout=str(out_bam_file.absolute()).encode("utf-8"),
    )
    pysam.index(str(out_bam_file))
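A minimal usage sketch for the helper above; the file names and the replacement mapping are hypothetical. Note that the function expects pathlib.Path arguments, since it calls .absolute() and .write_text():

from pathlib import Path

# hypothetical mapping from Ensembl-style to UCSC-style chromosome names
replacements = {"1": "chr1", "2": "chr2", "MT": "chrM"}
reheader_and_rename_chromosomes(Path("sample.bam"),
                                Path("sample.renamed.bam"),
                                replacements)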
import sys
import tempfile

import pysam


def main(args):
    # Detect the input format.
    try:
        with open(args.bam, 'r') as f:
            for line in f:
                is_sam = True
                break
    except UnicodeDecodeError:
        is_sam = False

    # Rename the sample(s).
    old_lines = pysam.view('-H', args.bam, '--no-PG').strip().split('\n')
    new_lines = []
    for old_line in old_lines:
        old_fields = old_line.split('\t')
        if old_fields[0] != '@RG':
            new_lines.append(old_line)
            continue
        new_fields = []
        for old_field in old_fields:
            if 'SM:' not in old_field:
                new_fields.append(old_field)
                continue
            new_fields.append(f'SM:{args.name}')
        new_lines.append('\t'.join(new_fields))

    # Write the output file.
    if is_sam:
        for new_line in new_lines:
            sys.stdout.write(new_line + '\n')
        alignments = pysam.view(args.bam, '--no-PG')
        sys.stdout.write(alignments)
    else:
        with tempfile.TemporaryDirectory() as t:
            with open(f'{t}/header.sam', 'w') as f:
                f.write('\n'.join(new_lines))
            alignments = pysam.reheader(f'{t}/header.sam', args.bam, '--no-PG')
            sys.stdout.buffer.write(alignments)
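A possible argument parser for the script above, shown only as a sketch: the original script's CLI is not included here, so the option names are assumptions based on the args.bam and args.name attributes used in main():

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Rename the sample (SM) in all @RG lines of a SAM/BAM file '
                    'and write the result to stdout')
    parser.add_argument('bam', help='Input SAM or BAM file')
    parser.add_argument('name', help='New sample name')
    main(parser.parse_args())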
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True,
                  maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' + input_filenames[0])

    # parallelize if requested
    if threads == 1:
        call_list = [stampy_bin,
                     '--overwrite',
                     '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                     '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                     '-o', get_premapped_filename(data_folder, adaID, type='sam'),
                     '--insertsize=450',
                     '--insertsd=100',
                     '--substitutionrate=' + str(subsrate),
                     '--gapopen=' + str(gapopen),
                     '--gapextend=' + str(gapextend),
                    ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: ' +
                        get_premapped_filename(data_folder, adaID, type='bam') + '\n')

    else:
        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [get_premapped_filename(data_folder, adaID,
                                                    type='bam', part=(j + 1))
                             for j in xrange(threads)]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', adaID + ' p' + str(j + 1),
                         '-l', 'h_rt=' + cluster_time[threads >= 30],
                         '-l', 'h_vmem=' + vmem,
                         stampy_bin,
                         '--overwrite',
                         '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                         '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                         '-o', get_premapped_filename(data_folder, adaID, type='sam', part=(j + 1)),
                         '--processpart=' + str(j + 1) + '/' + str(threads),
                         '--insertsize=450',
                         '--insertsd=100',
                         '--substitutionrate=' + str(subsrate),
                         '--gapopen=' + str(gapopen),
                         '--gapextend=' + str(gapextend),
                         '-M'] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                              adaID+', part '+str(j+1)+' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder, adaID, type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder, adaID, type='bam',
                                                        unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder, adaID, type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
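The pysam.sort call above uses the positional samtools-0.1.x syntax (input first, then an output prefix without the .bam extension). With a current pysam/samtools, the same concatenate-sort-reheader step would look roughly like the sketch below; the file names are placeholders, and save_stdout is used as in the first two snippets of this section:

import pysam

parts = ['premapped.part1.bam', 'premapped.part2.bam']   # placeholder part files
pysam.cat('-o', 'premapped.unsorted.bam', *parts)        # concatenate the parts
pysam.sort('-n', '-o', 'premapped.bam',                  # sort by read name (samtools >= 1.x)
           'premapped.unsorted.bam')
# take the header from one part and write the reheadered BAM to a new file
pysam.reheader('header.sam', 'premapped.bam', save_stdout='premapped.reheaded.bam')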
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID, frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: ' + adaID + ' ' + frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existence of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(adaID + ', fragment ' + fragment + ': input file not found.')

    # parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate=' + subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
            return

    else:
        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):

            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j + 1),
                                                  rescue=rescue)
            # Map
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm' + adaID.replace('-', '') + frag_gen + str(j + 1),
                         '-l', 'h_rt=' + cluster_time,
                         '-l', 'h_vmem=' + vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart=' + str(j + 1) + '/' + str(threads),
                         '--substitutionrate=' + subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j + 1),
                                                 rescue=rescue)
                             for j in xrange(threads)]
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                              adaID+', fragment '+frag_gen+', part '+str(j+1)+' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                              unsorted=True, rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam', unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                              part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID + ', fragment ' + frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE, rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
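The multithreaded branch above shells out to SGE's qstat to watch the submitted jobs. Stripped of the mapping-specific bookkeeping, the polling pattern reduces to roughly the helper below; the job IDs and the 10-second interval are illustrative, and this is a sketch rather than the project's own code:

import subprocess as sp
import time

def wait_for_sge_jobs(job_ids, poll_interval=10):
    """Block until none of the given SGE job IDs appear in `qstat` output anymore."""
    pending = set(job_ids)
    while pending:
        time.sleep(poll_interval)
        lines = sp.check_output(['qstat']).decode().splitlines()
        # the first two lines of qstat output are headers; column 0 is the job ID
        running = {line.split()[0] for line in lines[2:] if line.strip()}
        pending &= running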
# (tail of the script: inside the loop over reads from input_bam, after the
#  read's CB tag has been looked up as `cell_barcode`)
        if cell_barcode in filtered_barcodes:
            read_groups.add(cell_barcode)
            # replace RG tag with the cell barcode
            read.set_tag('RG', cell_barcode, 'Z', True)
            nkeep += 1
            output_bam.write(read)
        else:
            nfiltered += 1
    else:
        nmiss += 1

input_bam.close()
output_bam.close()

print('\n')
print('No. of alignments missing CB tag: ' + str(nmiss))
print('No. of alignments with filtered-out CB tag: ' + str(nfiltered))
print('No. of reads with passing-filter CB tag: ' + str(nkeep))
print('No. of read groups: ' + str(len(read_groups)))
print('\n')

print("Writing header.sam with updated RG tags.")

# write out RG tags to header
rg_header = [{'ID': barcode} for barcode in read_groups]
header['RG'] = rg_header

with pysam.AlignmentFile(args.header_file, "wh", header=header) as outf:
    a = pysam.AlignedSegment()
    outf.write(a)

# re-header the output_bam file in place
pysam.reheader("-P", "-i", args.header_file, args.output_bam)
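A small check one might run after the in-place reheader above: every retained cell barcode should now appear as an @RG entry in the output BAM ('output.bam' is a placeholder for args.output_bam):

with pysam.AlignmentFile('output.bam', 'rb') as bam:
    rg_ids = [rg['ID'] for rg in bam.header.to_dict().get('RG', [])]
print(str(len(rg_ids)) + ' read groups in the reheadered BAM')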
def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)'''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    pname = patient.id
    sample = patient.sample_table.loc[samplename]
    seq_run = sample['run']
    data_folder = MiSeq_runs[seq_run]['folder']
    adaID = sample['adaID']

    if VERBOSE:
        print 'Map via stampy: '+pname+' '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename, fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample['fragments'])
    if not len(frag_spec):
        raise ValueError(str(patient)+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in xrange(threads):
        output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                         fragment,
                                                         type='sam', part=(j+1))
        # Map
        call_list = ['qsub', '-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename,
                                                        fragment,
                                                        type='bam', part=(j+1))
                         for j in xrange(threads)]
    time_wait = 10  # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
        if len(qstat_output) < 3:
            jobs_done[:] = True
            break
        else:
            qstat_output = [line.split()[0] for line in qstat_output[2:]]

        time_wait = 10  # secs
        for j in xrange(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print 'Convert mapped reads to BAM for merging: sample '+\
                          samplename+', part '+str(j+1)+' of '+str(threads)
                convert_sam_to_bam(output_file_parts[j])
                # We do not need to wait if we did the conversion (it takes
                # longer than some secs)
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print 'Concatenate premapped reads: sample '+samplename
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read names (to ensure the pair_generator)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                            type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print 'Sort mapped reads: sample '+samplename
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print 'Reheader mapped reads: sample '+samplename
    header_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: sample '+samplename
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
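The name sort in these functions exists so that mates of a pair end up on adjacent lines; the pair_generator mentioned in the comments is presumably something along the lines of the sketch below (an assumption for illustration, not the project's actual implementation):

def pair_generator(bamfile):
    """Yield (read1, read2) tuples from a name-sorted BAM opened with pysam."""
    reads = iter(bamfile)
    for read1 in reads:
        read2 = next(reads, None)
        if read2 is None or read1.query_name != read2.query_name:
            raise ValueError('reads not paired/name-sorted at ' + read1.query_name)
        yield (read1, read2)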