def map_and_parse_sam(ref_index, query_fa, tags, qry_or_ref, ops, get_unique=True):
    '''Map query_fa against ref_index with bowtie2 and record the hits in tags.

    Arguments:
        ref_index:  bowtie2 index passed to the aligner's -x option.
        query_fa:   FASTA file of sequences to map (passed to -U).
        tags:       mapping of sequence id -> object that has set-like
                    qry_hits and ref_hits attributes; updated in place.
                    Every id found in the SAM output must already be a key.
        qry_or_ref: 'qry' to add hits to qry_hits, 'ref' to add to ref_hits.
        ops:        options object; only ops.outprefix is read, to name the
                    temporary SAM file.
        get_unique: if True, run bowtie2 in its default mode and keep a hit
                    only when the record has no XS tag or XS is negative.
                    If False, run with '-a --score-min L,0,0' and keep every
                    mapped record whose AS tag is zero.

    Any other value of qry_or_ref prints an error to stderr and exits with
    status 1 as soon as the first hit that would be stored is reached.
    The temporary SAM file is removed even when parsing fails.
    '''
    samfile = ops.outprefix + '.maptags.sam'

    # The two modes differ only in the leading aligner options.
    if get_unique:
        aligner_opts = ' -f -x '
    else:
        aligner_opts = ' -a --score-min L,0,0 -f -x '
    utils.syscall(external_progs.bowtie2_align + aligner_opts + ref_index
                  + ' -U ' + query_fa + ' -S ' + samfile)

    # The aligner has finished at this point, so the SAM file exists; make
    # sure it is deleted even if the assert or sys.exit below is triggered.
    try:
        for sam_record in sam.file_reader(samfile):
            assert sam_record.id in tags

            # Only records that are mapped with an alignment score of zero
            # are of interest (presumably zero means a penalty-free match;
            # see the bowtie2 manual for the AS:i field).
            if not (sam_record.is_mapped() and sam_record.tags['AS'][1] == 0):
                continue

            # In unique mode, drop the record when the XS tag is present and
            # non-negative (presumably an equally good second alignment).
            if get_unique and 'XS' in sam_record.tags \
                    and sam_record.tags['XS'][1] >= 0:
                continue

            if qry_or_ref == 'qry':
                hits = tags[sam_record.id].qry_hits
            elif qry_or_ref == 'ref':
                hits = tags[sam_record.id].ref_hits
            else:
                print('Error parsing SAM', file=sys.stderr)
                sys.exit(1)

            hits.add(Hit(sam_record.rname, sam_record.pos,
                         sam_record.query_strand()))
    finally:
        os.unlink(samfile)
def test_file_reader_sam(self):
    '''file_reader should iterate through a BAM file correctly'''
    expected_sam = 'sam_unittest.sam'
    tmp_sam_out = 'tmp.sam'

    # Dump every record read from the BAM file as text, one per line.
    fout = utils.open_file_write(tmp_sam_out)
    for record in sam.file_reader('sam_unittest.bam'):
        print(record, file=fout)
    utils.close(fout)

    # The dump must match the reference SAM file.
    files_match = filecmp.cmp(expected_sam, tmp_sam_out)
    self.assertTrue(files_match)
    os.unlink(tmp_sam_out)
def map_and_parse_sam(ref_index, tags_fasta, tag_counts, log_fh):
    '''Map tags_fasta against ref_index with bowtie2 and fill in tag_counts.

    Arguments:
        ref_index:  bowtie2 index passed to the aligner's -x option.
        tags_fasta: FASTA file of tags (passed to -U). Each sequence id must
                    contain a ':'; the part before the last ':' is used as
                    the contig name.
        tag_counts: dict updated in place, contig name -> 1 or 2.
                    1 means the record is mapped, its AS tag is zero and it
                    either has no XS tag or a negative one; 2 means anything
                    else. A contig name must not be seen twice.
        log_fh:     not used by this function; kept for the callers.

    The temporary SAM file is named from the module-level options.outprefix
    and is removed even when parsing fails.
    '''
    samfile = options.outprefix + '.maptags.sam'
    utils.syscall(external_progs.bowtie2_align + ' -f -x ' + ref_index
                  + ' -U ' + tags_fasta + ' -S ' + samfile)

    # The aligner has finished at this point, so the SAM file exists; make
    # sure it is deleted even if an id is malformed or the assert fires.
    try:
        for sam_record in sam.file_reader(samfile):
            # Unpacking (rather than indexing) keeps the ValueError for ids
            # without a ':'; the suffix itself is not needed.
            contig_name, _suffix = sam_record.id.rsplit(':', 1)
            assert contig_name not in tag_counts

            unique_perfect_hit = (
                sam_record.is_mapped()
                and sam_record.tags['AS'][1] == 0
                and ('XS' not in sam_record.tags
                     or sam_record.tags['XS'][1] < 0)
            )
            tag_counts[contig_name] = 1 if unique_perfect_hit else 2
    finally:
        os.unlink(samfile)
description = 'Given a fasta/q file of reads, and a second fasta of vector sequences, trims the vectors off the reads. Made specifically for assembled capillary read pairs - uses BWA for mapping. Untested on short reads or unassembled read pairs', usage = '%(prog)s [options] <reads fasta/q> <vectors fasta> <outprefix>') parser.add_argument('--join_distance', type=int, help='Join hits at most this many bases apart [%(default)s]', metavar='INT', default=100) parser.add_argument('reads_in', help='Name of input fasta/q file of reads', metavar='reads fasta/q') parser.add_argument('vectors_in', help='Name of input fasta file of vectors', metavar='vectors fasta') parser.add_argument('outprefix', help='Prefix of names of ouput files') options = parser.parse_args() bwa_index = options.outprefix + '.bwa_index' bwa_sam = options.outprefix + '.map_reads.sam' utils.syscall(' '.join([external_progs.bwa, 'index -p', bwa_index, options.vectors_in])) utils.syscall(' '.join([external_progs.bwa, 'bwasw -f', bwa_sam, bwa_index, options.reads_in])) read_hit_coords = {} # id -> [(start, end), (start, end), ...] sam_reader = sam.file_reader(bwa_sam) for sam_record in sam_reader: if not sam_record.is_mapped(): continue if not sam_record.is_forward_strand(): sam_record.cigar.reverse() hit_start = 1 hit_end = len(sam_record.seq) if sam_record.cigar.operations[0].operator == 'S': hit_start = sam_record.cigar.operations[0].number if sam_record.cigar.operations[-1].operator == 'S':
import argparse
import fastn
import sam
import utils

# Command line: three positional arguments, in this order.
parser = argparse.ArgumentParser(
    usage='%(prog)s [options] <in.bam> <reference.fasta> <outfile>',
    description='Report positions in the reference where any read had an error (i.e. difference between read and reference)')
for arg_name, arg_help in (
        ('bam_in', 'Name of input bam file'),
        ('fasta_in', 'Name of reference fasta file'),
        ('outfile', 'Name of output file'),
):
    parser.add_argument(arg_name, help=arg_help)
options = parser.parse_args()

sam_reader = sam.file_reader(options.bam_in)

# errors: reference name -> {difference reported by the record -> count}
errors = {}

# ref_seqs is filled in by fastn.file_to_dict and looked up by reference name.
ref_seqs = {}
fastn.file_to_dict(options.fasta_in, ref_seqs)

for sam_record in sam_reader:
    # Unmapped records have nothing to compare against.
    if not sam_record.is_mapped():
        continue

    new_errors = sam_record.get_differences_from_ref(ref_seqs[sam_record.rname])

    # A reference gets an entry as soon as one mapped read is seen for it,
    # even when that read reports no differences.
    errors_for_ref = errors.setdefault(sam_record.rname, {})
    for e in new_errors:
        errors_for_ref[e] = errors_for_ref.get(e, 0) + 1
# Map the tags to the scaffolds and turn the result into a sorted BAM file.
# samfile and tags_fa_file are defined earlier in the script (not shown here).
bamfile = options.outprefix + '.map_tags.bam'
sorted_bamfile = options.outprefix + '.map_tags.sorted.bam'
external_progs.index_with_bowtie2(options.scaffolds_fa)
#utils.syscall(external_progs.bowtie2_align + ' -f -a --score-min L,0,0 -x ' + options.scaffolds_fa + ' -U ' + tags_fa_file + ' -S ' + samfile)
utils.syscall(external_progs.bowtie2_align + ' -f -x ' + options.scaffolds_fa + ' -U ' + tags_fa_file + ' -S ' + samfile)
# Convert SAM -> BAM through a shell redirect, then drop the SAM file.
utils.syscall('samtools view -T ' + options.scaffolds_fa + ' -bS ' + samfile + ' > ' + bamfile)
os.unlink(samfile)
# The second argument is sorted_bamfile without its '.bam' suffix.
# NOTE(review): presumably the legacy 'samtools sort <in.bam> <out.prefix>'
# form -- confirm against the installed samtools version.
utils.syscall('samtools sort ' + bamfile + ' ' + sorted_bamfile[0:-4])
#os.unlink(bamfile)

# Load the hits into memory
# Several of the variables below are only initialised in this part of the
# script; they are presumably used by code further down.
previous_sam = None
previous_tag = None
sam_reader = sam.file_reader(sorted_bamfile)
flag_counts = {k: 0 for k in [0, 1, 2, 4, 5, 8, 12, 16]}
tags_from_bam = set()
tag_distances = []
f_log = utils.open_file_write(options.outprefix + '.log')
f_tags_and_sam = utils.open_file_write(options.outprefix + '.tags_and_sam.gz')
skipped_tags = 0

for current_sam in sam_reader:
    if current_sam.is_mapped():
        tags_from_bam.add(current_sam.id)

        # Log (but do not skip) records whose alignment score is not zero.
        if current_sam.tags['AS'][1] != 0:
            print('Nonzero alignemnt score', current_sam, file=f_log)

        # Log records whose XS value is at least as high as their AS value.
        if 'XS' in current_sam.tags and current_sam.tags['XS'][1] >= current_sam.tags['AS'][1]:
            print('Non-unique best hit', current_sam, file=f_log)
# second_coords: tag id -> [reference name, sam_record.pos + 1] for mapped tags
# tag_counts:    tag id -> number of mapped SAM records, or -1 if unmapped
second_coords = {}
tag_counts = {}

if options.second_fasta:
    # Write the unique tags to a temporary FASTA file, one record per tag:
    # t[0] is used as the sequence name and t[3] as the sequence.
    tags_tmp_fa = options.outprefix + '.tags.tmp.fa'
    f = utils.open_file_write(tags_tmp_fa)
    for t in unique_tags:
        print('>' + t[0] + '\n' + t[3], file=f)
    utils.close(f)

    # Map the tags to the second set of sequences with bowtie2.
    samfile = options.outprefix + '.maptags.sam'
    utils.syscall(external_progs.bowtie2_align + ' -f -x ' + second_seqs_index + ' -U ' + tags_tmp_fa + ' -S ' + samfile)
    os.unlink(tags_tmp_fa)

    sam_reader = sam.file_reader(samfile)
    for sam_record in sam_reader:
        if sam_record.is_mapped():
            tag_counts[sam_record.id] = tag_counts.get(sam_record.id, 0) + 1
            # NOTE(review): the + 1 presumably converts a 0-based position
            # to 1-based -- confirm against sam.file_reader.
            second_coords[sam_record.id] = [
                sam_record.rname, sam_record.pos + 1
            ]
        else:
            # Key by this record's own id, like the mapped branch. The old
            # code used contig_name, which is not set anywhere in this loop.
            tag_counts[sam_record.id] = -1

    # Remove the SAM file and the bowtie2 index files.
    os.unlink(samfile)
    for ext in ['1.bt2', '2.bt2', '3.bt2', '4.bt2', 'rev.1.bt2', 'rev.2.bt2']:
        os.unlink(second_seqs_index + '.' + ext)