/
splitSpeciesReads.py
133 lines (110 loc) · 4.83 KB
/
splitSpeciesReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
splitSpeciesReads.py
Designed to separate reads from the two species in hybrid data.
Adapted from GetGeneASEbyReads.py by Peter Combs.
NOTE: Input BAM files MUST be sorted by read name (e.g. `samtools sort -n`) so that the two mates of each pair are adjacent.
Last edit: 07.27.2016
"""
from __future__ import print_function
from pysam import Samfile
from argparse import ArgumentParser, FileType
from collections import defaultdict, Counter
from os import path
import time
def get_phase(read, snps, min_base_quality=30):
    """Classify one read as reference, alternate, or inconsistent.

    Walks the aligned (read position, reference position) pairs of ``read``
    and, at every known SNP position covered by a sufficiently confident
    base call, checks which allele the read carries.

    Args:
        read: Aligned read exposing ``get_aligned_pairs(matches_only=True)``,
            ``query_qualities`` and ``seq`` (presumably a pysam
            AlignedSegment -- confirm against the caller).
        snps: Mapping of reference position -> list of alleles ordered
            ``[ref_allele, alt_allele]``. Keys are looked up as
            ``ref_pos + 1``, i.e. one past the aligned reference index.
        min_base_quality: Minimum base quality for a SNP position to be
            considered. Defaults to 30, the previously hard-coded cutoff.

    Returns:
        None if the read covers no SNP with sufficient base quality;
        -1 if every informative SNP shows the reference allele;
        1 if every informative SNP shows the alternate allele;
        0 if a base matches no listed allele or the SNPs disagree.
    """
    phase = None
    for read_pos, ref_pos in read.get_aligned_pairs(matches_only=True):
        snp_pos = ref_pos + 1
        if snp_pos not in snps or read.query_qualities[read_pos] < min_base_quality:
            continue
        try:
            # Allele index 0 (reference) -> -1, index 1 (alternate) -> +1.
            observed = 2 * snps[snp_pos].index(read.seq[read_pos]) - 1
        except ValueError:
            return 0  # base is not one of the alleles listed for this SNP
        if phase is None:
            phase = observed
        elif observed != phase:
            return 0  # SNPs within the read disagree: read seems misphased
    return phase
def get_snps(snpfile):
    """Load known SNPs from a BED file into a per-chromosome lookup.

    Each non-blank line of the BED file must have four whitespace-separated
    columns: chrom, an ignored column, position, and ``ref|alt`` alleles.
    The third column is used as the SNP position key.

    If a file named ``true_hets.tsv`` exists in the same directory as
    ``snpfile``, only SNPs whose ``(chrom, position)`` pair appears in it are
    kept. Each line of that file is split on whitespace and compared as a
    whole, so it is assumed to hold exactly two columns (chrom, position) --
    TODO confirm against the files actually in use. The previous
    implementation announced "using true hets" but never filtered anything.

    Args:
        snpfile: Path to the SNP file. Only paths ending in ``.bed`` are
            parsed; any other path yields an empty result.

    Returns:
        ``defaultdict(dict)`` mapping chrom -> {position (int): [ref, alt]}.
        Unknown chromosomes yield an empty dict.

    Raises:
        ValueError: If a non-blank BED line does not have exactly four
            columns or its position is not an integer.
    """
    snps = defaultdict(dict)

    hets_path = path.join(path.dirname(snpfile), 'true_hets.tsv')
    true_hets = None  # None means "no whitelist: accept every SNP"
    if path.exists(hets_path):
        print("using true hets")
        with open(hets_path) as hets_file:
            true_hets = {tuple(line.split()) for line in hets_file}

    if snpfile.endswith('.bed'):
        with open(snpfile) as bed_file:
            for line in bed_file:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines instead of crashing
                chrom, _, start, refalt = fields
                # Positions are compared as strings, exactly as read.
                if true_hets is None or (chrom, start) in true_hets:
                    snps[chrom][int(start)] = refalt.split('|')
    return snps
def split_reads(reads, ref_reads, alt_reads, snps=None):
    """Split paired reads into reference-only and alternate-only BAM files.

    Iterates over every record of ``reads`` and compares each record's
    ``qname`` with that of the record immediately before it; two consecutive
    records sharing a name are treated as a pair. The pair is classified by
    combining the phases reported by ``get_phase`` for both records, and
    written to the matching output file when both agree.

    NOTE(review): because only adjacent records are compared, three or more
    consecutive records with the same name form overlapping pairs -- confirm
    the input never contains such runs.

    Args:
        reads: Open input alignment file; must provide
            ``fetch(until_eof=True)`` and serve as the header template for
            the outputs. It is not closed here.
        ref_reads: Path of the output BAM receiving reference pairs.
        alt_reads: Path of the output BAM receiving alternate pairs.
        snps: Optional mapping chrom -> {position: [ref, alt]}. When omitted,
            the module-level ``snp_dict`` is used, as before.

    Returns:
        Counter with the keys ``tot_read`` (pairs seen), ``ref_read``,
        ``alt_read``, ``misphased_read`` and ``no_snps_read``.
    """
    snps_by_chrom = snp_dict if snps is None else snps
    no_snps = {}  # shared fallback for chromosomes without known SNPs
    read_results = Counter()

    ref_bam = Samfile(ref_reads, 'wb', template=reads)
    alt_bam = None
    try:
        alt_bam = Samfile(alt_reads, 'wb', template=reads)
        prev_read = None
        prev_phase = None
        prev_qname = None
        for read in reads.fetch(until_eof=True):
            phase = get_phase(
                read, snps_by_chrom.get(read.reference_name, no_snps))
            qname = read.qname
            if qname == prev_qname:
                read_results["tot_read"] += 1
                # Phases seen across both mates; None means "no SNP covered".
                phases = {phase, prev_phase}
                phases.discard(None)
                if not phases:
                    read_results["no_snps_read"] += 1
                elif len(phases) > 1:
                    read_results['misphased_read'] += 1  # mates disagree
                else:
                    pair_phase = phases.pop()
                    if pair_phase == -1:
                        ref_bam.write(read)
                        ref_bam.write(prev_read)
                        read_results["ref_read"] += 1
                    elif pair_phase == 1:
                        alt_bam.write(read)
                        alt_bam.write(prev_read)
                        read_results["alt_read"] += 1
                    elif pair_phase == 0:
                        read_results['misphased_read'] += 1
            prev_read, prev_phase, prev_qname = read, phase, qname
    finally:
        # Always close the outputs so buffered data is flushed to disk.
        ref_bam.close()
        if alt_bam is not None:
            alt_bam.close()
    return read_results
def parse_args():
    """Parse the command line and return the resulting namespace.

    Four positional arguments are expected, in order: ``snp_file``,
    ``reads`` (converted by ``Samfile``, so the value is an opened
    alignment file rather than a path), ``ref_reads`` and ``alt_reads``.
    """
    cli = ArgumentParser()
    cli.add_argument(
        'snp_file',
        help="Tab delimited text file of known SNPs with 4 columns: chrom, 0-position, 1-position, ref_allele|alt_allele",
    )
    cli.add_argument(
        'reads',
        type=Samfile,
        help="BAM file that needs splitting",
    )
    cli.add_argument(
        'ref_reads',
        help="Output BAM file of reference reads.",
    )
    cli.add_argument(
        'alt_reads',
        help="Output BAM file of alternate reads.",
    )
    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: load the SNP table, split the input BAM into
    # reference/alternate outputs, then print summary statistics.
    # Arguments are parsed first so a bad command line fails before any
    # progress message is printed.
    args = parse_args()
    print(time.strftime("%b %d %I:%M:%S"), "Loading SNPs...")
    # Kept at module level on purpose: split_reads() reads ``snp_dict`` as a
    # global.
    snp_dict = get_snps(args.snp_file)
    print(time.strftime("%b %d %I:%M:%S"), "Done with SNPs. Splitting reads...")
    try:
        read_results = split_reads(args.reads, args.ref_reads, args.alt_reads)
    finally:
        args.reads.close()  # opened by argparse via type=Samfile
    print(time.strftime("%b %d %I:%M:%S"), "Finished!")
    print()
    print("RUN STATISTICS:")
    total = read_results['tot_read']
    print(" Total input reads:", str(total))
    for label, key in (
            (" Reads with no SNPs:", 'no_snps_read'),
            (" Reads with misphased SNPs:", 'misphased_read'),
            (" Reference reads:", 'ref_read'),
            (" Alternate reads:", 'alt_read'),
    ):
        count = read_results[key]
        # Guard against an input with no adjacent same-name records, which
        # would otherwise raise ZeroDivisionError after all the work is done.
        percent = (count / float(total)) * 100 if total else 0.0
        print(label, str(count), "(" + "%.2f" % percent + "%)")