def get_sorted_aligned_reads(args, header, sequence):
    if args.reference_hash and os.path.exists(args.reference_hash):
        print("Loading index...")
        ref_index = load_hash(args.reference_hash)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
        save_hash(*ref_index, file=args.reference_hash)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw2_rg')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        outfile = Samfile(args.out_bam, 'wb', header=SAM_HEADER(header, sequence))
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
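# SAM_HEADER is referenced above (and in main below) but not defined in this
# excerpt. A minimal sketch of what it might look like, assuming it mirrors
# the inline header dict used by the hw1 variant of this function further
# down; SAMPLE_NAME is taken from the surrounding module and the read-group
# id 'hw2_rg' matches the one used above:
def SAM_HEADER(header, sequence):
    return {
        'HD': {'VN': '1.0'},
        'SQ': [{'SN': header[1:].strip(), 'LN': len(sequence)}],
        'RG': [{'ID': 'hw2_rg', 'SM': SAMPLE_NAME, 'PU': 'Unknown',
                'PL': 'Unknown', 'LB': 'Unknown'}],
    }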
def main(args):
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    if args.input_bam:
        reads = Samfile(args.input_bam)
        if args.start and args.stop:
            reads = reads.fetch(ref_header[1:].strip(), args.start, args.stop)
    else:
        reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)
    #vcf_stream = VCFWriter(open(args.out_vcf, 'wb'), make_vcf_header(args)) if args.out_vcf else None
    chr = ref_header[1:].strip()
    fail_reasons = Counter()
    haplo_out = None
    if args.haplotype_out:
        haplo_out = Samfile(args.haplotype_out, 'wb', header=SAM_HEADER(ref_header, ref_sequence))
    vcf_stream = VCFWriter(open(args.out_vcf, 'wb'), make_vcf_header(args)) if args.out_vcf else None
    for region, reads in active_regions(reads, ref_sequence, chr, start_offset=0, flank=30, dfrac=1.0):
        #print('Calling region {}-{}'.format(region.start, region.stop))
        haplotype = build_haplotype(region.reference, reads, k=11, min_kmer_count=2)
        if haplotype.fail_reason:
            print('Failure {} at window\n{}'.format(haplotype.fail_reason, region))
            continue
        # align the haplotype to the reference sequence
        offset, cigar, score, mismatch = banded_sw(region.reference, haplotype.seq)
        haplotype_start = region.start + offset
        _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
        haplo_seq = SeqRecord(Seq(haplotype.seq, DNA), id='Haplotype{}'.format(region.start))
        dict.__setitem__(haplo_seq._per_letter_annotations, 'phred_quality', [40] * len(haplotype.seq))
        haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None, 'hw2_rg', False)
        if haplo_out:
            haplo_out.write(haplo_read)
        #print(haplotype)
        for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chr):
            if vcf_stream:
                vcf_stream.write_record(variant)
            print(vcf2m260(variant))
    if vcf_stream:
        vcf_stream.flush()
        vcf_stream.close()
def split_samfile(sam_file, splits, prefix='', path=''):
    """Take a sam file and split it splits number of times.

    :path: Where to put the split files.
    :prefix: A prefix for the outfile names.
    :returns: A tuple of job files.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(int(num_lines) / splits) + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt = 0
    currjob = 1
    suffix = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    # Open BAM input in binary mode; the extension is the last dot-separated
    # token of the file name (the original checked the first token).
    rmode = 'rb' if sam_name.split('.')[-1] == 'bam' else 'r'
    wmode = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if next line is mate-pair. If so, don't split.
                line2 = next(in_sam)
                currjob += 1
                suffix = '.split_sam_' + str(currjob).zfill(4)
                run_file = os.path.join(path, prefix + sam_name + suffix)
                new_sam = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)
                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 0
                sam_split = new_sam
    sam_split.close()
    return tuple(outfiles)
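# Example use of split_samfile (hypothetical file names; assumes the
# count_reads helper used above is importable and that the output directory
# already exists):
if __name__ == '__main__':
    chunks = split_samfile('sample.bam', splits=4, prefix='job_', path='splits/')
    print(chunks)  # tuple of the split file paths, one per job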
def parse_barcode(bamfile):
    """Parses a sorted and indexed BAM file, removes all cases where the RNA
    hits more than one spot in the genome and writes them to a file, and
    creates files for mutant and wildtype reads based on barcodes."""
    samfile = Samfile(bamfile, "rb")
    multi_hit_file = Samfile("MultiHit.bam", "wb", template=samfile)
    mutant_one = Samfile("MutantOne.bam", "wb", template=samfile)
    wildtype_one = Samfile("WildtypeOne.bam", "wb", template=samfile)
    mutant_two = Samfile("MutantTwo.bam", "wb", template=samfile)
    wildtype_two = Samfile("WildtypeTwo.bam", "wb", template=samfile)
    for line in samfile.fetch():
        #if line.is_secondary:  ## does this hit to more than one spot in genome
        #    multi_hit_file.write(line)
        if "#GAGT" in line.qname:  ## write to mutant file
            mutant_one.write(line)
        elif "#TTAG" in line.qname:
            mutant_two.write(line)
        elif "#ACCC" in line.qname:  ### write to wildtype file
            wildtype_one.write(line)
        elif "#CGTA" in line.qname:  ### write to wildtype file
            wildtype_two.write(line)
    multi_hit_file.close()
    mutant_one.close()
    wildtype_one.close()
    mutant_two.close()
    wildtype_two.close()
    samfile.close()
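# parse_barcode relies on samfile.fetch(), which needs a BAM index. A small
# hypothetical driver (the file name is an assumption):
import pysam
pysam.index("sorted_sample.bam")    # create the .bai if it is missing
parse_barcode("sorted_sample.bam")  # writes MutantOne/Two, WildtypeOne/Two, MultiHit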
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)
    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random()**i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))
    count += i
    for n in ns:
        if n == min(ns):
            outdir = outdir_base + '_min'
        else:
            outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, '->', outdir
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'), mode='wb', template=sf)
        sample.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
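# Note on the sampling above: the heap keeps the reads with the largest
# random()**i_weight keys (heappushpop evicts the current minimum), a
# weighted reservoir-sampling style selection; i_weight is the ratio of
# total mapped reads to the largest requested subset size.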
def get_sorted_aligned_reads(args, header, sequence):
    if args.ref_idx and os.path.exists(args.ref_idx):
        print("Loading index...")
        ref_index = load_hash(args.ref_idx)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw1_rg')
    print('Sorting SAMRecords in memory...')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        header = {
            'HD': {'VN': '1.0'},
            'SQ': [{'SN': header[1:], 'LN': len(sequence)}],
            'RG': [{'ID': 'hw1_rg', 'SM': SAMPLE_NAME, 'PU': 'Unknown',
                    'PL': 'Unknown', 'LB': 'Unknown'}],
        }
        outfile = Samfile(args.out_bam, 'wb', header=header)
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
def split_reads(reads, ref_reads, alt_reads):
    read_results = Counter()
    ref_bam = Samfile(ref_reads, 'wb', template=reads)
    alt_bam = Samfile(alt_reads, 'wb', template=reads)
    prev_phase = None
    prev_read = None
    prev_qname = None
    test = 0
    for read in reads.fetch(until_eof=True):
        test += 1
        chrom = read.reference_name
        snps_on_chrom = snp_dict[chrom]
        phase = get_phase(read, snps_on_chrom)
        read_qname = read.qname
        if read_qname == prev_qname:
            read_results["tot_read"] += 1
            phase_set = set([phase, prev_phase])
            phase_set.discard(None)
            if len(phase_set) == 1:
                read_phase = phase_set.pop()
                if read_phase == -1:
                    ref_bam.write(read)
                    ref_bam.write(prev_read)
                    read_results["ref_read"] += 1
                elif read_phase == 1:
                    alt_bam.write(read)
                    alt_bam.write(prev_read)
                    read_results["alt_read"] += 1
                elif read_phase == 0:
                    read_results['misphased_read'] += 1
            elif len(phase_set) == 0:
                read_results["no_snps_read"] += 1
            else:
                read_results['misphased_read'] += 1
        prev_read = read
        prev_phase = phase
        prev_qname = read_qname
    return read_results
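# split_reads assumes a module-level snp_dict and a get_phase(read, snps)
# helper that, judging from the branches above, returns -1 (reference
# allele), 1 (alternate allele), 0 (conflicting) or None (no SNP covered).
# A hypothetical invocation (file names are assumptions):
reads = Samfile("phased_input.bam", "rb")
counts = split_reads(reads, "ref_allele.bam", "alt_allele.bam")
print(counts)  # Counter with tot_read / ref_read / alt_read / no_snps_read / misphased_read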
""" ret = [] for i in re.findall("\d+|\^?[ATCGN]+", md): if i.startswith('^'): ret.extend(list(i[1:])) elif i[0] in ["A", "T", "C", "G", "N"]: ret.extend(list(i)) else: ret.extend(['-'] * int(i)) return ret if __name__ == '__main__': f = Samfile(sys.argv[1]) out = Samfile(sys.argv[1][:-4] + "_realign.bam", 'wb', template=f) count = 0.0 n = 0.05 for read in f: q, t = expandAlign(read) query, target = realign(read) replace(read, query, target) out.write(read) count += 1 if (count / f.mapped) > n: n += 0.05 print "[%s] -- parsed %d of %d reads (%.2f)" % ( time.asctime(), int(count), f.mapped, count / f.mapped) out.close()
def _sort_regional_reads(regional_reads, regions, output_dir, basename, header, log_output):
    """Sorts reads in dict struct by coordinate and writes to BAM files

    Uses counting sort with the intuition that most positions in a range
    will map to a non-empty bucket, where a bucket holds reads mapped to a
    certain position.

    Args:
        regional_reads: Complicated dict struct ex.
            regional_reads[region][chromosome][position] returns list of
            reads at this position in this chromosome in this region
        regions: Regions dict used to sort reads during alignment.
            Produced by _load_regions()
        output_dir: self-explanatory
        basename: Prefix for output BAM files
        header: Dict representation of SAM header (described in pysam docs)
        log_output: File object for log.

    Returns:
        output_paths: List of output file paths
    """
    output_paths = []
    region_intervals = {}
    sequence_order = [chrom["SN"] for chrom in header["SQ"]]
    region_names = set(["chri"])

    # Get region intervals
    for seq_id in sequence_order:
        for interval in regions[seq_id]:
            region_names.add(interval.name)
            if interval.name in region_intervals:
                region_intervals[interval.name].append(
                    [seq_id, interval.lower_bound, interval.upper_bound])
            else:
                region_intervals[interval.name] = [[
                    seq_id, interval.lower_bound, interval.upper_bound]]

    #for region in regional_reads.keys():
    for region in region_names:
        output_path = "{}/{}/{}.bam".format(output_dir, region, basename)
        output_file = Samfile(output_path, 'wb', header=header)
        if region == "chri":
            for chromosome in sequence_order:
                if chromosome in regional_reads[region]:
                    for position in sorted(regional_reads[region][chromosome].keys()):
                        for read in regional_reads[region][chromosome].pop(position):
                            output_file.write(read)
        elif region in regional_reads:
            for entries in region_intervals[region]:
                chromosome = entries[0]
                low, high = entries[1], entries[2]
                if chromosome in regional_reads[region]:
                    for position in range(low, high + 1):
                        if position in regional_reads[region][chromosome]:
                            for read in regional_reads[region][chromosome].pop(position):
                                output_file.write(read)
                else:
                    log_output.write("No reads mapping to "
                                     "{} {}\n".format(region, chromosome))
                    log_output.flush()
                    fsync(log_output.fileno())
        output_paths.append(output_path)
        output_file.close()
    return output_paths
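# _load_regions() is not shown in this excerpt; _sort_regional_reads only
# needs the regions dict to be keyed by sequence name, with each interval
# exposing name, lower_bound and upper_bound. A minimal stand-in satisfying
# that contract (an assumption, not the real loader):
from collections import namedtuple

Interval = namedtuple("Interval", ["name", "lower_bound", "upper_bound"])
regions = {
    "chr1": [Interval("regionA", 0, 999), Interval("regionB", 1000, 1999)],
    "chr2": [Interval("regionA", 0, 499)],
}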
def main():
    parser = OptionParser(usage=usage)
    #parser.add_option("-s", action="store_true", dest="sam_input", default=False,
    #                  help="Input is in SAM format instead of BAM format")
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename = args[0]
    ref_filename = args[1]
    contigs_filename = args[2]
    bam_filename = args[3]
    liftover_dir = args[1]
    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = dict([(name, i) for i, name in enumerate(ref_chromosomes)])
    print('Read', len(ref_chromosomes), 'reference chromosomes:', ','.join(ref_chromosomes), file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {
        'HD': {'VN': '1.0'},
        'SQ': [dict([('LN', len(references[chromosome])), ('SN', chromosome)])
               for chromosome in ref_chromosomes]
    }
    outfile = Samfile(bam_filename, 'wb', header=bam_header)
    line_nr = 0
    header_read = False
    for line in (s.strip() for s in open(psl_filename)):
        line_nr += 1
        if line.startswith('------'):
            header_read = True
            continue
        if not header_read:
            continue
        fields = line.split()
        assert len(fields) == 21, 'Error reading PSL file, offending line: %d' % line_nr
        sizes = [int(x) for x in fields[18].strip(',').split(',')]
        contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
        ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
        assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
        strand = fields[8]
        contig_name = fields[9]
        ref_name = fields[13]
        assert strand in ['-', '+']
        assert contig_name in contigs
        assert ref_name in references
        a = AlignedRead()
        a.qname = contig_name
        if strand == '+':
            a.seq = str(contigs[contig_name])
        else:
            a.seq = str(contigs[contig_name].reverse_complement())
        a.flag = (16 if strand == '+' else 0)
        a.rname = refname_to_id[ref_name]
        a.pos = ref_starts[0]
        a.mapq = 255
        qpos = contig_starts[0]
        refpos = ref_starts[0]
        cigar = []
        # soft-clipping at the start?
        if contig_starts[0] > 0:
            cigar.append((4, contig_starts[0]))
        longest_insertion = 0
        longest_deletion = 0
        total_matches = 0
        total_insertion = 0
        total_deletion = 0
        for length, contig_start, ref_start in zip(sizes, contig_starts, ref_starts):
            assert contig_start >= qpos
            assert ref_start >= refpos
            # insertion?
            if contig_start > qpos:
                insertion_length = contig_start - qpos
                longest_insertion = max(longest_insertion, insertion_length)
                total_insertion += insertion_length
                append_to_cigar(cigar, 1, insertion_length)
                qpos = contig_start
            # deletion?
            if ref_start > refpos:
                deletion_length = ref_start - refpos
                longest_deletion = max(longest_deletion, deletion_length)
                total_deletion += deletion_length
                append_to_cigar(cigar, 2, deletion_length)
                refpos = ref_start
            # stretch of matches/mismatches
            append_to_cigar(cigar, 0, length)
            refpos += length
            qpos += length
            total_matches += length
        # soft-clipping at the end?
        if len(a.seq) > qpos:
            cigar.append((4, len(a.seq) - qpos))
        a.cigar = tuple(cigar)
        # only use contigs where longest deletion is <= 10000 bp
        if longest_deletion > 10000:
            continue
        # require at least 200 matching positions
        if total_matches < 200:
            continue
        # require the matching positions to make up at least 75 percent of the contig
        # (without counting parts of the contig that are insertions).
        if total_matches / (len(a.seq) - total_insertion) < 0.75:
            continue
        outfile.write(a)
    outfile.close()
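# append_to_cigar is referenced above but not defined in this excerpt. A
# minimal sketch, assuming its job is to merge consecutive operations of the
# same type so the CIGAR stays compact:
def append_to_cigar(cigar, op, length):
    if cigar and cigar[-1][0] == op:
        cigar[-1] = (op, cigar[-1][1] + length)
    else:
        cigar.append((op, length))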
def subsample(fn, ns=None, paired=False):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print("Read out ", i_weight)
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print("Counted ", i_weight)
        i_weight /= float(max(ns))
        sf = Samfile(fn)
    if paired:
        read_2s = {}
    print(fn, count, i_weight)
    for i, read in enumerate(sf):
        key = random()**i_weight
        if not paired or read.is_read1:
            if len(sample) < max(ns):
                heappush(sample, (key, i + count, read))
            else:
                dropped = heappushpop(sample, (key, i + count, read))
                if paired:
                    read_2s.pop(dropped[-1].qname, None)
        elif paired:
            read_2s[read.qname] = read
        else:
            # Unreachable by construction; raise instead of the original
            # no-op "assert ValueError(...)".
            raise ValueError("I don't know how we got here")
    count += i
    for n in ns:
        outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
        print(fn, '->', outdir)
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'), mode='wb', template=sf)
        sample.sort(key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos))
        missing_mates = 0
        for key, pos, read in sampN:
            of.write(read)
            if paired and read.is_proper_pair:
                if read.qname not in read_2s:
                    missing_mates += 1
                    continue
                of.write(read_2s[read.qname])
        of.close()
    sf.close()
    print(missing_mates)
    # Heap items here are (key, position, read); return the stored positions.
    return [pos for key, pos, read in sample]
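# The "if ns is None: fn, ns = fn" idiom above lets subsample be driven by
# multiprocessing.Pool.map with (filename, depths) tuples. A hypothetical
# driver (paths and depths are assumptions):
from multiprocessing import Pool

jobs = [("sample1/accepted_hits.bam", [5e6, 10e6]),
        ("sample2/accepted_hits.bam", [5e6, 10e6])]
with Pool(2) as pool:
    pool.map(subsample, jobs)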
def print_reads(reads_to_print, ref_name, header):
    output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
    output_samfile = Samfile(output_name, "wb", header=header)
    for aln in reads_to_print:
        output_samfile.write(aln)
    output_samfile.close()
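# print_reads takes args.output_base from the enclosing module scope. A
# hypothetical driver (the Namespace stub and file names are assumptions,
# and the input BAM is assumed to be indexed so fetch() works):
from argparse import Namespace
args = Namespace(output_base="by_reference")

with Samfile("input.bam", "rb") as in_bam:
    for ref_name in in_bam.references:
        alns = list(in_bam.fetch(ref_name))
        if alns:
            print_reads(alns, ref_name, in_bam.header)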
""" Turns abbreviated MD into a full array """ ret = [] for i in re.findall("\d+|\^?[ATCGN]+", md): if i.startswith('^'): ret.extend(list(i[1:])) elif i[0] in ["A","T","C","G","N"]: ret.extend(list(i)) else: ret.extend(['-']*int(i)) return ret if __name__ == '__main__': f = Samfile(sys.argv[1]) out = Samfile(sys.argv[1][:-4]+"_realign.bam",'wb', template=f) count = 0.0 n = 0.05 for read in f: q,t = expandAlign(read) query, target = realign(read) replace(read, query, target) out.write(read) count += 1 if (count / f.mapped) > n: n += 0.05 print "[%s] -- parsed %d of %d reads (%.2f)" % (time.asctime(), int(count), f.mapped, count/f.mapped ) out.close()