-
Notifications
You must be signed in to change notification settings - Fork 1
/
woodfox.py
executable file
·309 lines (261 loc) · 14.2 KB
/
woodfox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#!/usr/bin/env python2.7
## If the ASCII art isn't readable below, you're not viewing this file in the intended way;
## you MUST use a monospaced font
# ........... .. . . . ........ . ... ...........................................
# ..........................................?MMMM$................................
# ............. . ................MMMMMMMMMMMMMM............................
# ..................... .............,MMMMMMMMMMMMMMMMMM..........................
# .................................NMMMMMMMMMMMMMMMMMMMMMM........................
# ...............................MMMMMMMMMMMMMMMMMMMMMMMMMMN... .. .. ... . ... ..
# .........M...................:MMMMMMMMMMMMMMMMMMMMMMMMMMMMN ....................
# ........ MM.................MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMI....................
# ........~MM$..............:MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM......... ..........
# ........MMMM............ MMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMD...................
# ........M.=M............MMMMMMMMMM. OMMMMM +MMMMMMMMMMMMMMMM...................
# .......M ..MM ...... MMMMMMMMM.............. OMMMMMMMMMMMMMM..................
# .......,... MMMMMMM8...MMMMMMMM ................ MMMMMMMMMMMMM .................
# ......M, MMMMMMMMMMMMMMMMMMMMM.......... N.........NMMMMMMMMMMMM,...............
# .....MMNMMMMMMMMMMMMMMMMMMMMM......... MMMM,........MMMMMMMMMMDOMMMMMNN.. ......
# ....MMMMMMMMMMMMMMMMMMMMMMMM8........ZMM .?M.........MMMMMMMMMM... ZMMMMI.......
# ....MMMMMMMMMMMMMMMMMMMMMMMM........MMO....NN........$M MMMMMMM........OMN .....
# ....MMMMMMMMMMMMM MMMMMMMMMM...... MM........NM.......D.8MMMMMM..........MD ....
# ....MMMMI..MMMMM.?MMMMMMMMMM.... ~M7..........~MM.... ...MMMMMM....M. ...MM ....
# ....MMMM.........MMMMMMMMMMM$...MM..............MM... ..,MMMMMM....IM ....MM....
# ...NMMMM..........MMMMMMMMMMMM MM ...............MM.....MMMMMMM.....M..... M....
# ...MMMM ..........MMMMMMMMMM? MM................. MM....MMMMMM:.....MM.....M~...
# ..MM...:MMMMMMD. MMMMMMMMMN MZ...................MM....MMMMMM .....MM.....N....
# ..MM...~M$....$M..MMMMMMMMM N=....................MM... MMMMM$,.....MMMM........
# ..M... MM.........:MMMMMMMM8M.....................MM...MMMMMMOM+....DMMM$...M...
# ...DNMM=...........MMMMMMMMM......................MM...MMMMMMMMMM...MMMMZ...M ..
# . MMMMM............ MMMMMMMM......................MM...MMMMM+,MM....MMMM ...M8..
# DMMM...............M=MMMMMMM......................MM ..MMMMM $ ...IMMMMMO.:M...
# .MM?...............M MMMMMMM ......................MM, .MMMMM~.... MMMMMM..MM...
# .. .. . . .DM8? ...MM.MMMMMMZ........................MM .MMMMM... MMMMMMMOMM$...
# ............MMMMMN.MMD MMMMMM......................NM:.MMMMMMM:.=MMMMMMMMMMM....
# .............MMMMMMMMM.MMMMMM,.MM.................MMMMM MMMMMMMMMMMMMMMMMMM.....
# ............. MMMMMMMMM MMMMMMMMM ..............MMMMMMMMMMMMMMMMMMMMMMMMMM:.....
# ..... . .......MMMMMMMM.MMMMMMMMMO...........M?. $MMMMMMMMMMMMMMMMMMMMMMM ......
# ................MMMMMMMM MMMMMMMMM.............8MMMMNMMMMMMMMMMMMMMMMMM$........
# ..... . ..... MM. . ..MMMMMMMMM .......M MMMMMMMMMMMMMMMMMMMMMMMMMM .........
# .................MM8......MMMMMMMMM..... MMMMMMMMMMMMMMMMMMMMMMMMMMZ ...........
# ....... . ....... MM......=MMM.MMMM.....MMMMM.:MMM...$MMMMMMMMMM$ ..............
# ..................MM.......MM..MMMM.....MM ..,MMM....MMMMM......................
# ....... . ... . ..MM ..........MMMM....$... MMM....MMMMM.......................
# ...................MM ..........MMMM........MMM....MMMMM:MI.....................
# .......... . . . ?M,... .......MMM .... .MMM... MMMMMMMD......................
# ............ . ...ZZ...........MMMM.....NM.....:MMMMMMM.......................
# ......... ... .. .... ............MMM.....M .....MM MMMM........................
# ..................................MMM~.... ........MMMM.........................
# ....... ... .. .... . ..MMM... .........MMZ..........................
# ..................... .............,MM........... MM ...........................
# ....... . ........ .... MM ..........$M ............................
# ........... .. .... . .. .M .. ...... N..............................
# ........... .. . ........ . .....M.........................................
# ........... .. ... . .. . .....................................
# ..... . .. . . . ... . ...................................
## Purpose: woodFox is our indel filtering tool.
## Accepts either normal/tumour exome directories as input, or full paths to the relevant files
## It calculates the following metrics with these limits:
## Mapping quality; mean >= 50 : calculated using Pysam - COMPLETE
## Base quality; > 26 : calculated using Pysam - COMPLETE
## Not somatic (i.e. in normal); no more than 2 indel containing reads allowed : calculated with Pysam - COMPLETE
## Alignability at site must be 1 - looked up with Pysam (Tabix-indexed bedGraph) - COMPLETE
## Number of SNVs/indels in the normal sample; should be <0.1 events per read - COMPLETE
import sys # so we can exit the program
import os # to check if directories exist
import argparse # command line args
import csv # line-by-line operations
import pysam # for native bam operations
import re # for regular expressions
import pybedtools # for intersectBed alignability
import socket # Required to find hostname
import logging # For debugging purposes
## Mean function so we don't have to import numpy
def mean(numbers):
    """Return the arithmetic mean of *numbers*, or 0 when it is empty.

    numbers: a sized collection of numeric values (it must support len()
    and sum()).

    Returns a float for non-empty input. For empty input the integer 0 is
    returned instead of raising ZeroDivisionError.
    """
    if len(numbers) == 0:
        return 0
    ## float() forces true division under Python 2, where int/int truncates
    return float(sum(numbers)) / len(numbers)
## Command line interface
## Tumour input: supply EITHER -t (a directory) OR the explicit files -t1 and -t2
## Normal input: supply EITHER -n (a directory) OR the explicit file -n1
## The explicit file paths are filled in further down when a directory is given
parser = argparse.ArgumentParser()
for _flag, _helptext in (
    ("-t", 'full path to tumour exome directory; e.g. "/home/chris_w/project_bile_duct_cancer/analysed_data/HK02_c"'),
    ("-t1", 'full path to tumour bam; e.g. "/home/chris_w/project_bile_duct_cancer/analysed_data/HK02_c/dedup/dedup.bam"'),
    ("-t2", 'full path to parsed Strelka VEP file; e.g."/home/chris_w/project_bile_duct_cancer/analysed_data/HK02_c/strelka/passed.somatic.indels.vep.parsed.txt"'),
    ("-n", 'full path to normal exome directory; e.g. "/home/chris_w/project_bile_duct_cancer/analysed_data/HK02_L"'),
    ("-n1", 'full path to normal bam; e.g. "/home/chris_w/project_bile_duct_cancer/analysed_data/HK02_L/dedup/dedup.bam"'),
):
    ## Every path option is an optional string
    parser.add_argument(_flag, type=str, help=_helptext, required=False)
parser.add_argument("--debug", action="store_true", help="Turn debugging mode on")
args = parser.parse_args()

## Switch on DEBUG-level logging only when --debug was given
if args.debug:
    logging.basicConfig(level=logging.DEBUG)
    ## logging.debug() calls like this one are how the script reports progress;
    ## basicConfig() without a stream argument sends them to stderr
    logging.debug("Debugging mode enabled")
## EXAMPLE INDELS TO FILTER
## /mnt/HPC_haem/cwardell/analysis/exomes/My9-3_My9-CD138+-1/indels/indels.filtered.vcf
#args.t = "/home/chris_w/project_bile_duct_cancer/analysed_data/HK66_c"
#args.n = "/home/chris_w/project_bile_duct_cancer/analysed_data/HK66_L"
## Resolve the three input files (tumour bam, parsed Strelka VEP file, normal bam)
## If args.t is supplied, infer args.t1,args.t2 from the exome directory layout
if args.t is not None:
    ## If the input argument doesn't end in a forward slash, add one
    ## (endswith() also copes with an empty string, unlike indexing with [-1])
    if not args.t.endswith("/"):
        args.t = args.t + "/"
    args.t1 = args.t + "dedup/dedup.bam"
    args.t2 = args.t + "strelka/passed.somatic.indels.vep.parsed.txt"

## If args.n is supplied, infer args.n1
if args.n is not None:
    ## If the input argument doesn't end in a forward slash, add one
    if not args.n.endswith("/"):
        args.n = args.n + "/"
    args.n1 = args.n + "dedup/dedup.bam"

## If any path is still missing by this point it was neither explicitly supplied
## nor inferred, so warn the user and exit.
## sys.exit(message) writes the message to stderr and exits with status 1, which
## keeps error text out of the tab-delimited output written to stdout
if args.t1 is None or args.t2 is None:
    sys.exit("You must supply EITHER the -t OR -t1,-t2 arguments")
if args.n1 is None:
    sys.exit("You must supply EITHER the -n OR -n1 argument")

## Check that every input FILE exists before any work is done. If one doesn't, exit
if not os.path.isfile(args.t1):
    sys.exit(args.t1+" does not exist; are you certain that this is a tumour sample and that Strelka has been run?")
if not os.path.isfile(args.t2):
    sys.exit(args.t2+" does not exist; are you certain that this is a tumour sample and that Strelka has been run?")
if not os.path.isfile(args.n1):
    sys.exit(args.n1+" does not exist; are you certain that this is a normal sample?")
## Echo arguments for debugging purposes
#logging.debug("args.n: "+args.n)
#logging.debug("args.t: "+args.t)
## Read every indel from the parsed Strelka VEP file (args.t2)
## indels  : one entry per data row; each entry is the row's list of column strings
## inOrDel : parallel list holding "insertion" or "deletion" for the same index
indels = []
inOrDel = []
## The with-block guarantees the file handle is closed once reading is done
with open(args.t2) as vepfile:
    for row in csv.reader(vepfile, delimiter="\t"):
        ## Skip blank lines (csv yields an empty list for them) and header
        ## lines, i.e. those whose first column starts with "#"
        if not row or row[0].startswith("#"):
            continue
        ## A "-" in the third column marks a deletion; anything else is
        ## treated as an insertion
        indeltype = "deletion" if row[2] == "-" else "insertion"
        ## Append data to relevant object
        inOrDel.append(indeltype)
        indels.append(row)
## New test; define a window around the event and count the total number
## of SNVs and the depth of the window in the NORMAL sample
## This implies the reliability of the region in non-tumour samples
## High scorers should be false positives
## sitedepths  : mean depth over the window, one entry per indel
## totalevents : number of non-reference pileup characters in the window, one entry per indel
sitedepths = []
totalevents = []
# Set the window size and reference genome; both are identical for every indel,
# so they are defined once rather than on every iteration
window = 10
genome = "/home/chris_w/resources/b37/human_g1k_v37.fasta"
## Iterate through every indel
for indel in indels:
    ## Set properties of indel; the second column looks like "CHROM:START" or "CHROM:START-END"
    CHROM = indel[1].split(":")[0]
    POS = int(indel[1].split(":")[1].split("-")[0]) - 1  # Pysam coordinates are ZERO-based, so we MUST subtract 1
    logging.debug(str(CHROM)+":"+str(POS))
    # Produce a pileup of bam over the specified window
    # NOTE(review): POS is zero-based here but samtools region strings are
    # one-based, so the window sits one base to the left of the event -
    # presumably acceptable for a 21-base window; confirm
    region = CHROM + ":" + str(POS - window) + "-" + str(POS + window)
    # NOTE(review): the loop below assumes pysam.mpileup returns a sequence of
    # pileup lines (one per site); confirm against the installed pysam version
    pile = pysam.mpileup("-f", genome, "-r", region, args.n1)
    # Count how many SNVs and indels are present
    # Indel count is unreliable, e.g. a 3bp insertion could be "+3GAC" which is 4 characters
    # However, this inaccuracy is acceptable as indels are arguably worse than SNVs when considering
    # the reliability of a region
    # We can ignore "^" and "$" characters, as they denote the start/end of reads
    # Note that "^" is ALWAYS followed by another character, so we remove that, too
    siteevents = 0
    sitedepth = 0
    sites = 0
    for site in pile:
        bases = site.split("\t")
        # Column 4 is the depth at the site, column 5 the read-base string
        readbases = bases[4]
        events = (len(readbases) - readbases.count(".") - readbases.count(",")
                  - 2 * readbases.count("^") - readbases.count("$"))
        siteevents += events
        sitedepth += int(bases[3])
        sites += 1
    # Mean depth over the sites that were reported. When the normal sample has
    # no coverage in the window the pileup is empty; the depth then stays 0
    # instead of raising ZeroDivisionError. Floor division keeps the integer
    # result that "/" gives for ints under Python 2
    if sites > 0:
        sitedepth = sitedepth // sites
    sitedepths.append(sitedepth)
    totalevents.append(siteevents)
## Look for reads at the same position in the normal sample; how many indel containing reads are there?
logging.debug("Checking normal sample for indels")
## somaticKeepers : indices of indels with fewer than 3 indel-containing normal reads
## somaticindels  : count of indel-containing normal reads, one entry per indel
somaticKeepers = []
somaticindels = []
## Open the normal bam file ONCE for all indels; reopening it for every indel
## without closing it leaks one file handle per indel
samfile = pysam.Samfile(args.n1, "rb")  # rb = "read bam"
try:
    ## Iterate through every indel - this loop ONLY considers reads with indels in them
    ## We use enumerate() to create a nice index for us to use
    for idx, indel in enumerate(indels):
        ## Set properties of indel
        CHROM = indel[1].split(":")[0]
        POS = int(indel[1].split(":")[1].split("-")[0]) - 1  # Pysam coordinates are ZERO-based, so we MUST subtract 1
        ## Count of indel-containing reads in normal
        indelReads = 0
        for alignedread in samfile.fetch(CHROM, POS, POS + 1):
            ## Note that "is_proper_pair" excludes reads that map to different chromosomes (i.e. involved in interchromosomal translocations)
            if not alignedread.is_proper_pair:
                continue
            ## aligned_pairs is a list of (read position, reference position) tuples.
            ## The WHOLE read is scanned, so an indel anywhere in a read that
            ## overlaps the site is counted, not only one exactly at POS
            pairs = alignedread.aligned_pairs
            ## Only detects deletions: a reference position with no read position
            if inOrDel[idx] == "deletion" and any(pair[0] is None for pair in pairs):
                indelReads += 1
            ## Only detects insertions: a read position with no reference position
            ## NOTE(review): soft-clipped bases may also have no reference
            ## position and would then be counted here - confirm
            if inOrDel[idx] == "insertion" and any(pair[1] is None for pair in pairs):
                indelReads += 1
        ## No more than 2 indel containing reads allowed
        if indelReads < 3:
            somaticKeepers.append(idx)
        somaticindels.append(indelReads)
finally:
    ## Always release the bam file handle, even if a fetch fails
    samfile.close()
## Mean mapping quality and mean base quality at each indel site in the TUMOUR bam
logging.debug("Calcuating mapping and base quality scores")
## mapbaseKeepers : indices of indels with mean mapq >= 50 and mean baseq > 26
## mapqualities   : mean mapping quality, one entry per indel
## basequalities  : mean base quality, one entry per indel
mapbaseKeepers = []
mapqualities = []
basequalities = []
## Open the bam file ONCE for all indels; reopening it for every indel without
## closing it leaks one file handle per indel
samfile = pysam.Samfile(args.t1, "rb")  # rb = "read bam"
try:
    ## Loop through the list of indels again, this time considering base and mapping quality scores
    ## We use enumerate() to create a nice index for us to use
    for idx, indel in enumerate(indels):
        ## Set properties of indel
        CHROM = indel[1].split(":")[0]
        POS = int(indel[1].split(":")[1].split("-")[0]) - 1  # Pysam coordinates are ZERO-based, so we MUST subtract 1
        ## Lists in which to store mapping and base quality data
        mapq = []
        baseq = []
        ## Get values at the indel location
        for alignedread in samfile.fetch(CHROM, POS, POS + 1):
            if not alignedread.is_proper_pair:
                continue
            ## Which base in the read is at the position we want? Use the "aligned_pairs" list of tuples to determine this.
            ## next() with a default yields None instead of raising IndexError
            ## when no aligned pair refers to POS at all
            offset = next((pair[0] for pair in alignedread.aligned_pairs if pair[1] == POS), None)
            ## offset is None when the read has no base at the site (e.g. a deletion there)
            if offset is not None:
                mapq.append(alignedread.mapq)
                baseq.append(ord(alignedread.qual[offset]) - 33)  ## Subtract 33 because SAM specification tells us to
        ## Compute each mean once; mean() returns 0 for a site with no usable reads
        meanmapq = mean(mapq)
        meanbaseq = mean(baseq)
        if meanmapq >= 50 and meanbaseq > 26:
            mapbaseKeepers.append(idx)
        mapqualities.append(meanmapq)
        basequalities.append(meanbaseq)
finally:
    ## Always release the bam file handle, even if a fetch fails
    samfile.close()
## Look up the alignability score at each indel site in a Tabix-indexed bedGraph (via Pysam)
logging.debug("Calculating alignability")
alignKeepers = []
## alignabilities : exactly one entry per indel, so it stays index-aligned with indels
alignabilities = []
alignfile = pysam.Tabixfile("/home/chris_w/resources/bedfiles/genomic_features/alignability/wgEncodeCrgMapabilityAlign100mer.bedGraph.gz")
try:
    for idx, indel in enumerate(indels):
        ## Set properties of indel
        CHROM = indel[1].split(":")[0]
        POS = int(indel[1].split(":")[1].split("-")[0]) - 1  # Pysam coordinates are ZERO-based, so we MUST subtract 1
        ## Default for sites with no record in the file; reset for EVERY indel so
        ## a value from the previous indel can never be reused
        alignability = 0
        ## We put this in a try clause, as some chromosome names (MT, GL, etc) are
        ## not in the alignability file and generate errors
        try:
            for record in alignfile.fetch(CHROM, POS, POS + 1):
                ## The fourth column of the bedGraph record is the score
                alignability = float(record.split("\t")[3])
        except Exception as error:
            ## Best effort: any lookup failure scores 0. Unlike a bare except,
            ## this does not swallow KeyboardInterrupt or SystemExit
            logging.debug("No alignability for %s:%s (%s)", CHROM, POS, error)
            alignability = 0
        alignabilities.append(alignability)
finally:
    ## Always release the Tabix file handle
    alignfile.close()
# Now we can write the output: the input file is echoed to stdout with six
# metric columns appended to the column header and to every indel row
logging.debug("Writing output")
## Index into the per-indel metric lists; advanced once per data row
indelindex = 0
## The with-block guarantees the file handle is closed when writing is done.
## Single-argument print(...) behaves identically under the Python 2 print statement
with open(args.t2) as vepfile:
    for row in csv.reader(vepfile, delimiter="\t"):
        ## Blank lines have no first column to inspect; skip them
        if not row:
            continue
        tabrow = "\t".join(row)
        if row[0].startswith("#Uploaded"):
            ## Column header line: append the names of the new columns
            print("\t".join([tabrow, "somaticindels", "mean_mapq", "mean_baseq", "alignability", "normaldepth", "normalevents"]))
        elif row[0].startswith("##"):
            ## Other "##" header lines are passed through unchanged
            print(tabrow)
        elif not row[0].startswith("#"):
            ## Data row: append this indel's metrics
            print("\t".join([
                tabrow,
                str(somaticindels[indelindex]),
                str(mapqualities[indelindex]),
                str(basequalities[indelindex]),
                str(alignabilities[indelindex]),
                str(sitedepths[indelindex]),
                str(totalevents[indelindex]),
            ]))
            indelindex += 1