-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_needle.py
executable file
·370 lines (262 loc) · 10.5 KB
/
run_needle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#!/usr/local/bin/python
"""Utilities for running and parsing output from EMBOSS needle."""
import sys, os, re, logging, time, copy, random, tempfile, string, commands, logging, pprint, glob
import sequtil
from sequtil import wrap, cast, decode_aln, pid_m1, pid_m2, pid_m3
import io_fasta
# Module-level logging handle. `log` is the logging module itself, so calls
# such as log.debug() are emitted through the root logger.
log = logging
# Directory in which temporary fasta inputs and needle output files are created.
TEMPDIR = tempfile.gettempdir()
# Suffix appended to the randomly named needle output files.
ALIGN_SUFFIX = '.needle'
# Path to the EDNAFULL data file in the 'data' directory beside this module.
EDNAFULL = os.path.join(os.path.abspath(os.path.split(__file__)[0]),'data','EDNAFULL')
# Alternative location under the interpreter's site-packages (disabled):
#EDNAFULL = os.path.join(sys.prefix, 'lib', 'python'+sys.version[:3],'site-packages','Seq','data','EDNAFULL')
# Default directory for installed executables.
# NOTE(review): not read anywhere in the visible code - confirm it is still needed.
EXEC_PATH = '/usr/local/bin'
class FormatError(Exception):
    """Exception type for malformed input.

    NOTE(review): nothing in the visible part of this module raises it.
    """
def randomname(length=12):
    """Return a random alphanumeric name whose first character is a letter.

    length - total number of characters in the name.  Values below 1 still
             produce a single leading letter.
    """
    # string.ascii_letters is locale-independent; string.letters could pick up
    # non-ASCII characters under some locales and no longer exists on Python 3.
    letters = string.ascii_letters
    chars = letters + string.digits
    tail = [random.choice(chars) for _ in range(length - 1)]
    return random.choice(letters) + ''.join(tail)
def make_temp_fasta(seq_or_list, tempdir):
    """Write a sequence object or list of objects to a randomly named
    fasta file and return the path to that file.

    seq_or_list - object(s) accepted by io_fasta.write()
    tempdir     - directory in which the file is created; the returned
                  path is absolute only if tempdir is
    """
    fname = os.path.join(tempdir, randomname(12) + '.fasta')
    # Serialise first so that a formatting error does not leave an empty
    # file behind.
    contents = io_fasta.write(seq_or_list)
    f = open(fname, 'w')
    try:
        f.write(contents)
    finally:
        # always release the handle, even if the write fails
        f.close()
    return fname
def get_path_or_write_file(seq, tempdir=TEMPDIR):
    """Return the path of a fasta file representing seq.

    seq     - either the path of an existing file, or a Seq instance or
              list of Seq instances
    tempdir - directory used when a temporary file has to be written

    If seq is a path to an existing file its absolute path is returned.
    If seq is not path-like (os.access raises TypeError) it is written to
    a temporary fasta file whose path is returned.

    Raises IOError if seq is a path that does not exist.
    """
    # An explicit test instead of an assert: asserts vanish under
    # "python -O", which would disable both the missing-file check and the
    # TypeError-based dispatch to make_temp_fasta.
    try:
        exists = os.access(seq, os.F_OK)
    except TypeError:
        return make_temp_fasta(seq, tempdir)
    if not exists:
        raise IOError('The file %s cannot be read' % seq)
    return os.path.abspath(seq)
def needle(query, target, outputfile=None, exec_path=None, submat=None, cleanup=True):
    """Run EMBOSS needle on query against target and return the parsed
    alignment data (the dict built by parseNeedle).

    query, target - either a list of Seq objects or the path to a fasta file
    outputfile - optional filename for output of needle, resolved against
                 the current directory; by default a randomly named file
                 in TEMPDIR
    exec_path - optional directory containing needle program; by default
                'needle' is looked up on the PATH
    submat - optional path to a substitution matrix
    cleanup - if True, delete the query, target and output files that live
              in TEMPDIR (names ending in .fasta or ALIGN_SUFFIX) once the
              output is parsed; if False, record their paths in each
              alignment dict under file_q, file_t and file_out

    Raises IOError if needle cannot be started or produces no output file.
    """
    # see http://bioweb.pasteur.fr/docs/EMBOSS/needle.html

    # Imported here so this function does not depend on an edit to the
    # module's import line.
    import subprocess

    query_file = get_path_or_write_file(query)
    target_file = get_path_or_write_file(target)

    if exec_path:
        needle_prog = os.path.join(exec_path, 'needle')
    else:
        needle_prog = 'needle'

    if outputfile:
        outputfile = os.path.join(os.path.abspath(os.getcwd()), outputfile)
    else:
        outputfile = os.path.join(TEMPDIR, randomname(12) + ALIGN_SUFFIX)

    # Build an argument list rather than a shell string so that paths
    # containing spaces or shell metacharacters are passed through intact.
    args = [needle_prog]
    if submat:
        args += ['-datafile', os.path.abspath(submat)]
    args += [
        '-asequence', query_file,
        '-bsequence', target_file,
        '-outfile', outputfile,
        '-aglobal3',
        '-gapopen', '10',
        '-gapextend', '0.5',
        '-nobrief',
        '-aformat3', 'markx10',
    ]
    log.debug(' '.join(args))

    try:
        proc = subprocess.Popen(args,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
    except OSError:
        # keep the exception type callers already expect from this function
        raise IOError('could not run %s: %s' % (needle_prog, sys.exc_info()[1]))
    cmd_output = proc.communicate()[0]
    log.debug(cmd_output)

    if not os.access(outputfile, os.F_OK):
        raise IOError(cmd_output)

    # parse the data
    outfile = open(outputfile)
    try:
        data = parseNeedle(outfile.read())
    finally:
        outfile.close()

    if cleanup:
        # Only files found in TEMPDIR with the suffixes used for temporary
        # files are removed.
        tempfiles = set(glob.glob(os.path.join(TEMPDIR, '*.fasta')) +
                        glob.glob(os.path.join(TEMPDIR, '*' + ALIGN_SUFFIX)))
        # a set, so that the same path given twice is removed only once
        for f in set([query_file, target_file, outputfile]):
            if f in tempfiles:
                os.remove(f)
    else:
        for record in data.values():
            record['file_q'] = query_file
            record['file_t'] = target_file
            record['file_out'] = outputfile

    return data
def get_tup(s, prefix=None):
    """Split a 'key: value' string into a (key, value) tuple.

    s      - string of the form 'key: value'; surrounding whitespace is
             ignored and '-' in the key is replaced with '_'
    prefix - optional string; if given the key becomes '<prefix>_<key>'

    Raises ValueError if s contains no ':'.
    """
    # Split on the first colon only, so values that themselves contain
    # a colon (times, paths, ...) are kept whole.
    key, val = s.strip().split(':', 1)
    key = key.replace('-', '_')
    if prefix:
        key = '%s_%s' % (prefix, key)
    return key, val.strip()
def parseNeedle(instr):
    """Extracts various data from output of needle with
    -aformat3 markx10

    Returns a dict of dicts, one per alignment, keyed by the sorted
    (name, name) tuple of the two stripped sequence names.

    Each dict D holds the header fields, the query fields prefixed with
    'q_' and the target fields prefixed with 't_', plus align_num,
    q_name, t_name, q_al_str, t_al_str and the values set by
    add_calculated_values().  Every value is passed through sequtil.cast.

    Also adds keys q_al_enc, t_al_enc to each dict D
    such that the following is true:

    aligned = D['q_al_str']
    degapped = aligned.replace('-','')
    aligned == sequtil.decode_aln(degapped, eval(D['q_al_enc']))

    (eval is shown for illustration only; do not eval data from an
    untrusted source.)

    Raises AssertionError if instr is not markx10 output, if the
    alignments are not numbered consecutively from 1, or if the same
    pair of names occurs twice.
    """
    # Explicit raises rather than assert statements so the checks still
    # run under "python -O"; the exception type is unchanged.

    # do we have the correct format?
    if 'Align_format: markx10' not in instr:
        raise AssertionError('input is not needle output in markx10 format')

    # remove footer: everything from the second-to-last '#' onwards
    instr, _, _ = instr.rsplit('#', 2)

    # remove header: the text before the first '>>>'
    datablocks = instr.split('>>>')[1:]

    outputData = {}
    for i, block in enumerate(datablocks):
        # lop off the commented data and run parameters
        block = block.split('#=====')[0]
        block = block.split('>>#')[1]

        # consumes the first line, which holds the alignment number
        align_no, block = block.split('\n', 1)
        if int(align_no) != i + 1:
            raise AssertionError(
                'expected alignment number %s, found %s' % (i + 1, align_no))

        header, query, target = block.split('>')

        # get the sequence names
        q_name, query = query.split('..', 1)
        t_name, target = target.split('..', 1)
        this_key = tuple(sorted([q_name.strip(), t_name.strip()]))

        ## process the header info
        d = dict([get_tup(e) for e in header.split(';') if e.strip()])

        ## process the query and target
        q_data = dict([get_tup(e, 'q') for e in query.split(';') if e.strip()])
        t_data = dict([get_tup(e, 't') for e in target.split(';') if e.strip()])

        d['align_num'] = i

        ## add data
        d['q_name'] = q_name
        d.update(q_data)
        d['t_name'] = t_name
        d.update(t_data)

        # the last field of each sequence section is followed by the
        # aligned sequence itself, so split it off at the first newline
        d['q_al_display_start'], q_al_str = d['q_al_display_start'].split('\n', 1)
        d['t_al_display_start'], t_al_str = d['t_al_display_start'].split('\n', 1)

        d['q_al_str'] = q_al_str.replace('\n', '').upper()
        d['t_al_str'] = t_al_str.replace('\n', '').upper()

        d['q_al_enc'] = repr(sequtil.encode_aln(d['q_al_str'], gapchar='-', self_check=True))
        d['t_al_enc'] = repr(sequtil.encode_aln(d['t_al_str'], gapchar='-', self_check=True))

        if this_key in outputData:
            raise AssertionError('duplicate alignment for %s' % repr(this_key))

        d = dict((k, cast(v)) for k, v in d.items())
        add_calculated_values(d)

        outputData[this_key] = d

    return outputData
def add_calculated_values(d):
    """Add trim coordinates and identity values to alignment dict d.

    d - dict containing the aligned strings under 'q_al_str' and
        't_al_str'; it is modified in place and nothing is returned.

    Keys added: trim_start and trim_stop (the coordinates returned by
    find_end_gaps) and pid_m1, pid_m2, pid_m3 (the results of the sequtil
    functions of the same names applied to the trimmed alignment).
    """
    q_al_str, t_al_str = d['q_al_str'], d['t_al_str']
    start, stop = find_end_gaps(q_al_str, t_al_str)

    # the pid functions are handed lists of characters with end gaps removed
    q_list = list(q_al_str)[start:stop]
    t_list = list(t_al_str)[start:stop]

    d['trim_start'] = start
    d['trim_stop'] = stop
    d['pid_m1'] = pid_m1(q_list, t_list)
    d['pid_m2'] = pid_m2(q_list, t_list)
    d['pid_m3'] = pid_m3(q_list, t_list)
# Captures the run of gap characters at the start ('leading') and at the
# end ('trailing') of an aligned string; either group is None if absent.
trim_rexp = re.compile(r'(?P<leading>^-+)?.+?(?P<trailing>-+)?$')


def find_end_gaps(q_al_str, t_al_str):
    """
    Find the coordinates that trim end gaps from align strings:

    -----XXXXXXXXXXXXXXXXXXXXXXX
    YYYYYYYYYYYYYYYYYYYY--------

    becomes

    XXXXXXXXXXXXXXX
    YYYYYYYYYYYYYYY

    return startpos, endpos

    note that the trimmed string is obtained with, eg
    q_al_str[startpos:endpos]

    Two empty strings give (0, 0).  Raises AssertionError if the two
    strings differ in length.
    """
    if len(q_al_str) != len(t_al_str):
        raise AssertionError('aligned strings differ in length')
    if not q_al_str:
        # the pattern needs at least one character to match
        return 0, 0

    q_lead, q_trail = trim_rexp.match(q_al_str).groups()
    t_lead, t_trail = trim_rexp.match(t_al_str).groups()

    # Compare gap lengths explicitly (a missing group counts as 0) instead
    # of relying on how None compares against a string.
    startpos = max(len(q_lead or ''), len(t_lead or ''))
    endpos = len(q_al_str) - max(len(q_trail or ''), len(t_trail or ''))
    return startpos, endpos
def trim_align(seqlist, align_data):
    """Return a new list of Seq objects trimmed to their aligned region.

    seqlist    - Seq objects providing getName(), slicing and setSeq()
    align_data - dict keyed by seq.getName() whose values provide
                 'al_start', 'al_stop' and 'fa_frame'

    Each sequence found in align_data is deep-copied and the copy is set
    to seq[al_start - 1:al_stop], reverse-complemented when fa_frame is
    'r'.  Sequences without an entry in align_data are logged and skipped.
    """
    trimmed_seqs = []
    for seq in seqlist:
        name = seq.getName()
        if name not in align_data:
            log.info('the sequence %(name)s was not found in the alignment data' % {'name': name})
            continue

        these_results = align_data[name]
        start, stop = these_results['al_start'], these_results['al_stop']
        # al_start is used as a 1-based coordinate here, hence the -1
        substr = seq[start - 1:stop]
        log.debug('name: %(name)s start: %(start)s stop: %(stop)s'
                  % {'name': name, 'start': start, 'stop': stop})

        frame = these_results['fa_frame']
        if frame == 'r':
            # NOTE(review): no import of `Seq` is visible in this module;
            # confirm where reverse_complement is meant to come from.
            substr = Seq.reverse_complement(substr)
            log.debug('seq %s frame=%s, reverse complementing seq:\n%s' % (name, frame, substr))

        newseq = copy.deepcopy(seq)
        newseq.setSeq(substr)
        trimmed_seqs.append(newseq)
    return trimmed_seqs
def print_align(q_al_str, t_al_str, width):
    """Print two aligned strings in interleaved chunks.

    Both strings are split with wrap(..., width, 'list'); for each pair of
    chunks the query chunk, the target chunk and an empty line are printed.
    """
    q_rows = wrap(q_al_str, width, 'list')
    t_rows = wrap(t_al_str, width, 'list')
    for q_row, t_row in zip(q_rows, t_rows):
        print(q_row)
        print(t_row)
        print('')
def show_record(row, width=60, align=True):
    """Print the contents of one alignment dict.

    row   - dict of alignment data
    width - chunk width passed on to print_align
    align - if True, also print the aligned strings, first in full and
            then restricted to row['trim_start']:row['trim_stop']

    Every key not ending in '_al_str' is printed as 'key : value', in
    sorted key order.
    """
    for col in sorted(row.keys()):
        if not col.endswith('_al_str'):
            print('%s : %s' % (col, row[col]))

    if not align:
        return

    start, stop = row['trim_start'], row['trim_stop']
    q_al_str, t_al_str = row['q_al_str'], row['t_al_str']

    print('aligned strings')
    print_align(q_al_str, t_al_str, width)

    print('q_aln_str[%(start)s:%(stop)s], t_aln_str[%(start)s:%(stop)s]'
          % {'start': start, 'stop': stop})
    print_align(q_al_str[start:stop], t_al_str[start:stop], width)
def show_records(records, width=60, align=True):
    """Print every alignment in records, ordered by its 'align_num' value.

    records - dict of alignment dicts; each key is printed, followed by
              the output of show_record for its value
    width, align - passed through to show_record
    """
    def by_align_num(item):
        # item is a (key, row) pair
        return item[1]['align_num']

    for key, row in sorted(records.items(), key=by_align_num):
        print(key)
        show_record(row, width, align)
def main():
    """Demonstration run: align the first sequence in
    testfiles/10patients.fasta against the remaining ones and print the
    results.  Both the test file and data/EDNAFULL are resolved against
    the current directory.
    """
    logging.basicConfig(level=logging.DEBUG, format='%(lineno)s %(levelname)s %(message)s', stream=sys.stdout)

    test_input = os.path.abspath('testfiles/10patients.fasta')
    print('reading fasta format file %s' % test_input)
    infile = open(test_input)
    try:
        seqlist = io_fasta.read(infile.read())
    finally:
        infile.close()

    query, target = seqlist[0], seqlist[1:]
    pairs = needle(query, target, cleanup=True, submat='data/EDNAFULL')

    # list() so that slicing and indexing do not depend on keys()/values()
    # returning real lists
    print('master dict keyed by: %s ...' % repr(list(pairs.keys())[:10]))
    if pairs:
        first = list(pairs.values())[0]
        print('each dict keyed by: ' + repr(list(first.keys())))

    for key in sorted(pairs):
        show_record(pairs[key], align=True)
        print('*' * 30)


if __name__ == '__main__':
    main()