forked from argriffing/xgcode
/
20090805a.py
370 lines (348 loc) · 15.7 KB
/
20090805a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
"""Do a tree reconstruction simulation for the research paper.
For each of N sequence lengths, sample K trees;
from each tree sample a distance matrix;
for each distance matrix, reconstruct the tree using different methods.
"""
from StringIO import StringIO
import time
import math
import random
import optparse
import numpy as np
from SnippetUtil import HandlingError
import MatrixUtil
import BranchLenSampler
import TreeSampler
import BuildTreeTopology
import Xtree
import JC69
import Euclid
import Form
import FormOut
import Progress
# Column names of the R table, in output order.
# Attribute arrays in this module are indexed by position in this list.
g_headers = [
    # the sequence length described by a row
    'sequence.length',
    # counts of accepted and rejected sampling attempts
    'nsamples.accepted',
    'nsamples.accepted.atteson',
    'nsamples.rejected.zero',
    'nsamples.rejected.inf',
    'nsamples.rejected.fail',
    # joint outcome of the two reconstruction methods
    'nsuccesses.both',
    'nsuccesses.neither',
    'nsuccesses.nj.only',
    'nsuccesses.topdown.only',
    # classification of the first spectral split
    'first.split.informative',
    'first.split.uninformative',
    'first.split.invalid',
]
class InfiniteDistanceError(Exception):
    """
    Raised by sample_distance_matrix when a pair of sequences
    mismatches at three quarters or more of the sites.
    """
class ZeroDistanceError(Exception):
    """
    Raised by sample_distance_matrix when a pair of sequences
    has no mismatches at all.
    """
class TimeoutError(Exception):
    """
    Raised inside process when the elapsed time exceeds the time limit.
    """
def get_form():
    """
    Describe the user-adjustable inputs of the simulation.
    @return: the body of a form
    """
    # integer inputs
    ntaxa_field = Form.Integer('ntaxa',
            'number of taxa for distance matrix sampling',
            20, low=4, high=20)
    nlengths_field = Form.Integer('nlengths',
            'number of sequence lengths to consider (must be odd)',
            9, low=3, high=99)
    nsamples_field = Form.Integer('nsamples',
            'number of samples per sequence length',
            5, low=1, high=99)
    # the branch length distribution; the first item is passed True
    branch_length_items = [
            Form.RadioItem('pachter_length',
                str(BranchLenSampler.Pachter()), True),
            Form.RadioItem('exponential_length',
                str(BranchLenSampler.Exponential())),
            Form.RadioItem('uniform_length_a',
                str(BranchLenSampler.UniformA())),
            Form.RadioItem('uniform_length_b',
                str(BranchLenSampler.UniformB()))]
    branch_length_group = Form.RadioGroup(
            'tree_sampling', 'branch length distribution',
            branch_length_items)
    # the way successive distance matrices are built; the first item is passed True
    successor_items = [
            Form.RadioItem('pruning_like',
                'go through the Laplacian, like Felsenstein pruning',
                True),
            Form.RadioItem('nj_like',
                'directly use distances, like neighbor joining')]
    successor_group = Form.RadioGroup(
            'distance_options', 'recursive matrix construction method',
            successor_items)
    return [
            ntaxa_field,
            nlengths_field,
            nsamples_field,
            branch_length_group,
            successor_group]
def get_form_out():
    """
    @return: a newly created FormOut.RTable object
    """
    output_format = FormOut.RTable()
    return output_format
def sample_distance_matrix(xtree_root, sequence_length):
    """
    Sample sequences on a tree and estimate their pairwise distances.
    Each distance is computed from the fraction of mismatching sites
    as -b*log(1 - p/b) with b = 0.75.
    @param xtree_root: passed through to JC69.sample_xtree_sequences
    @param sequence_length: the number of sites in each sampled sequence
    @return: a symmetric numpy matrix of distances with a zero diagonal
    @raise ZeroDistanceError: when some pair of sequences is identical
    @raise InfiniteDistanceError: when some pair mismatches at 3/4 or more of the sites
    """
    sequences = JC69.sample_xtree_sequences(xtree_root, sequence_length)
    nsequences = len(sequences)
    D = np.zeros((nsequences, nsequences))
    b = 0.75
    for i, sa in enumerate(sequences):
        for j, sb in enumerate(sequences):
            # each unordered pair is visited once
            if j <= i:
                continue
            # count the sites at which the two sequences differ
            nmismatches = 0
            for a, b_site in zip(sa, sb):
                if a != b_site:
                    nmismatches += 1
            if not nmismatches:
                raise ZeroDistanceError()
            # the logarithm below needs a mismatch proportion under 3/4
            if nmismatches * 4 >= sequence_length * 3:
                raise InfiniteDistanceError()
            d_raw = nmismatches / float(sequence_length)
            d_mle = -b*math.log(1 - d_raw/b)
            D[i][j] = d_mle
            D[j][i] = d_mle
    return D
def get_response_content(fs):
    """
    Run a short simulation using the submitted options.
    @param fs: an object whose attributes are named after the form items
    (presumably the parsed form submission -- confirm against the caller)
    @return: the R table text followed by a newline
    @raise HandlingError: when the number of sequence lengths is even
    or when no branch length distribution is selected
    """
    # allow only two seconds for web access
    nseconds = 2
    # read the options
    ntaxa = fs.ntaxa
    nlengths = fs.nlengths
    nsamples = fs.nsamples
    nj_like = fs.nj_like
    # do extra validation
    if nlengths % 2 == 0:
        raise HandlingError('the number of sequence lengths must be odd')
    # define the branch length sampler
    if fs.pachter_length:
        branch_length_sampler = BranchLenSampler.Pachter()
    elif fs.exponential_length:
        branch_length_sampler = BranchLenSampler.Exponential()
    elif fs.uniform_length_a:
        branch_length_sampler = BranchLenSampler.UniformA()
    elif fs.uniform_length_b:
        branch_length_sampler = BranchLenSampler.UniformB()
    else:
        # without this branch the sampler name would be unbound below
        raise HandlingError('no branch length distribution was selected')
    # get the response without a progress bar
    response_text = process(ntaxa, nseconds, nlengths, nsamples,
            nj_like, branch_length_sampler, False)
    return response_text + '\n'
def gen_sequence_lengths_helper(n, low, high):
    """
    Yield integer lengths that are nearly evenly spaced on a log scale.
    The first and last values are exactly low and high;
    the interior values are truncated to integers.
    @param n: the number of sequence lengths to generate
    @param low: the smallest sequence length
    @param high: the highest sequence length
    """
    yield low
    # the constant ratio between consecutive untruncated lengths
    ratio = float(high) / float(low)
    step = ratio ** (1.0 / (n - 1))
    for exponent in range(1, n - 1):
        interior = low * (step ** exponent)
        yield int(interior)
    yield high
def get_sequence_lengths(nlengths):
    """
    Each length is a positive integer between 100 and 10000.
    The lengths 100, 1000, and 10000 are always included.
    The ratios of consecutive lengths are roughly constant.
    @param nlengths: the number of sequence lengths; must be odd and at least 3
    @return: a list of sequence lengths
    """
    # the number of lengths must be odd and at least 3
    assert nlengths % 2
    assert 3 <= nlengths
    # Each of the two decades gets this many lengths, counting both endpoints.
    # Floor division keeps the count an integer even under true division.
    nper_decade = (nlengths + 1) // 2
    first_lengths = list(gen_sequence_lengths_helper(nper_decade, 100, 1000))
    second_lengths = list(gen_sequence_lengths_helper(nper_decade, 1000, 10000))
    # the length 1000 ends the first list and begins the second; keep one copy
    lengths = first_lengths + second_lengths[1:]
    return lengths
def incr_attribute(attribute_array, attribute):
    """
    Add one to the element of an attribute array named by a header.
    The motivation of this approach is the convenience of numpy array addition.
    @param attribute_array: a numpy array conformant to the global header list
    @param attribute: an element of the global header list
    @return: the same attribute array, after the in-place increment
    """
    positions = dict(zip(g_headers, range(len(g_headers))))
    position = positions[attribute]
    attribute_array[position] += 1
    return attribute_array
def get_attribute(attribute_array, attribute):
    """
    Look up the element of an attribute array named by a header.
    The motivation of this approach is the convenience of numpy array addition.
    @param attribute_array: an array conformant to the global header list
    @param attribute: an element of the global header list
    @return: an element of the attribute array
    """
    positions = dict(zip(g_headers, range(len(g_headers))))
    position = positions[attribute]
    return attribute_array[position]
def set_attribute(attribute_array, attribute, value):
    """
    Overwrite the element of an attribute array named by a header.
    The motivation of this approach is the convenience of numpy array addition.
    @param attribute_array: an array conformant to the global header list
    @param attribute: an element of the global header list
    @param value: the value to set
    """
    positions = dict(zip(g_headers, range(len(g_headers))))
    position = positions[attribute]
    attribute_array[position] = value
def get_sample_results(sequence_length, ntaxa, nj_like, branch_length_sampler):
    """
    Sample one tree and one distance matrix, then score the reconstructions.
    The returned array holds counts only;
    its 'sequence.length' element is left at zero.
    @param sequence_length: the length of each sequence in the sampled alignment
    @param ntaxa: the number of sequences in the sampled tree
    @param nj_like: True to create subsequent distance matrices using a generalized neighbor-joining-like approach
    @param branch_length_sampler: the length of each branch is independently sampled by this function
    @return: a numpy array conformant to the global header list
    """
    # initialize the array that will be returned;
    # the builtin int is used because recent numpy no longer provides np.int
    attribute_array = np.zeros((len(g_headers),), dtype=int)
    # first sample a tree and get its set of informative splits
    tree = TreeSampler.sample_agglomerated_tree(ntaxa)
    true_splits = tree.get_nontrivial_splits()
    # sample the branch lengths
    for branch in tree.get_branches():
        branch.length = branch_length_sampler()
    # sample a distance matrix; a rejected sample is reported immediately
    try:
        D = sample_distance_matrix(tree, sequence_length)
    except InfiniteDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.inf')
    except ZeroDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.zero')
    except BuildTreeTopology.InvalidSpectralSplitException:
        return incr_attribute(attribute_array, 'nsamples.rejected.fail')
    # see if the top down reconstruction was successful
    try:
        splitter = BuildTreeTopology.split_using_eigenvector_with_nj_fallback
        if nj_like:
            updater = BuildTreeTopology.update_generalized_nj
        else:
            updater = BuildTreeTopology.update_using_laplacian
        all_spectral_splits = BuildTreeTopology.get_splits(D, splitter, updater)
        top_down_success = (all_spectral_splits == true_splits)
    except BuildTreeTopology.InvalidSpectralSplitException:
        return incr_attribute(attribute_array, 'nsamples.rejected.fail')
    # at this point the sample is accepted
    incr_attribute(attribute_array, 'nsamples.accepted')
    # determine whether or not the distance matrix is Atteson with respect to the tree
    if BuildTreeTopology.is_atteson(tree, D):
        incr_attribute(attribute_array, 'nsamples.accepted.atteson')
    # see if the bottom up reconstruction was successful
    nj_splits = BuildTreeTopology.get_splits(
            D, BuildTreeTopology.split_nj, BuildTreeTopology.update_nj)
    nj_success = (nj_splits == true_splits)
    # note the joint results of the two reconstruction methods
    if top_down_success and nj_success:
        incr_attribute(attribute_array, 'nsuccesses.both')
    elif (not top_down_success) and (not nj_success):
        incr_attribute(attribute_array, 'nsuccesses.neither')
    elif top_down_success and (not nj_success):
        incr_attribute(attribute_array, 'nsuccesses.topdown.only')
    elif (not top_down_success) and nj_success:
        incr_attribute(attribute_array, 'nsuccesses.nj.only')
    # characterize the result of the first spectral split
    try:
        eigensplit = BuildTreeTopology.split_using_eigenvector(D)
        if eigensplit in true_splits:
            incr_attribute(attribute_array, 'first.split.informative')
        else:
            incr_attribute(attribute_array, 'first.split.invalid')
    except BuildTreeTopology.DegenerateSplitException:
        incr_attribute(attribute_array, 'first.split.uninformative')
    # return the attribute array
    return attribute_array
def process(ntaxa, nseconds, nlengths, nsamples, nj_like, branch_length_sampler, use_pbar):
    """
    Run the simulation and report the accumulated counts as an R table.
    Counts from an iteration over the sequence lengths are added to the
    totals only when that whole iteration completes, so an interrupted
    iteration is discarded.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds; a false value means no time limit
    @param nlengths: use this many different sequence lengths
    @param nsamples: stop after this many samples per sequence length
    @param nj_like: True to use a generalized neighbor-joining-like method of computing successive distance matrices
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # define the sequence lengths
    lengths = get_sequence_lengths(nlengths)
    nheaders = len(g_headers)
    # initialize the accumulation matrix;
    # the builtin int is used because recent numpy no longer provides np.int
    accum = np.zeros((nlengths, nheaders), dtype=int)
    for i, sequence_length in enumerate(lengths):
        set_attribute(accum[i], 'sequence.length', sequence_length)
    # Repeatedly analyze samples from each sequence length.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = None
    if use_pbar:
        pbar = Progress.Bar(nsamples)
    try:
        for sample_index in range(nsamples):
            # Reset the accumulation matrix for this iteration.
            # It is an integer matrix so that it can be added in place
            # to the integer totals without a float-to-int cast.
            single_iteration_accum = np.zeros((nlengths, nheaders), dtype=int)
            # accumulate attributes of sampling attempts for each sequence length
            for sequence_length_index, sequence_length in enumerate(lengths):
                # keep trying to get an accepted sample
                while True:
                    # check the time
                    if nseconds and time.time() - start_time > nseconds:
                        raise TimeoutError()
                    # get counts of attributes of a sample
                    sample_result = get_sample_results(
                            sequence_length, ntaxa, nj_like,
                            branch_length_sampler)
                    single_iteration_accum[sequence_length_index] += sample_result
                    # if the sample was accepted then we are done looking
                    if get_attribute(sample_result, 'nsamples.accepted'):
                        break
            # finish the iteration
            accum += single_iteration_accum
            if pbar:
                pbar.update(sample_index + 1)
        else:
            # this branch belongs to the for loop: every iteration completed
            termination_reason = 'the requested number of samples per sequence length was attained'
    except KeyboardInterrupt:
        termination_reason = 'keyboard interrupt'
    except TimeoutError:
        termination_reason = 'time limit expired'
    if pbar:
        pbar.finish()
    # define the matrix successor creation method explanatory string
    if nj_like:
        matrix_successor_explanation = 'like neighbor joining'
    else:
        matrix_successor_explanation = 'like Felsenstein pruning'
    # define the time limit string
    if nseconds:
        time_limit_string = '%d seconds' % nseconds
    else:
        time_limit_string = '(no time limit)'

    def comment(*items):
        # format one commented line of the table preamble
        return ' '.join(str(item) for item in ('#',) + items)

    # create the results in convenient R table form
    out_lines = [
            comment('R usage: mytable <- read.table(\'this.filename\')'),
            comment(time.time() - start_time, 'elapsed seconds'),
            comment('the simulation was limited to', time_limit_string),
            comment('the simulation was limited to', nsamples,
                'samples per sequence length'),
            comment('reason for termination:', termination_reason),
            comment('matrix successor creation method:',
                matrix_successor_explanation),
            comment(ntaxa, 'taxa per tree'),
            comment(branch_length_sampler),
            '\t'.join(g_headers)]
    # each data row starts with a one-based row label not named in the header
    for i, row in enumerate(accum):
        out_lines.append('\t'.join(str(x) for x in [i+1] + row.tolist()))
    # return the results
    return '\n'.join(out_lines).strip()
def main(options):
    """
    Run the simulation from the command line and print the R table.
    @param options: parsed command line options with the attributes
    ntaxa, nseconds, nlengths, nsamples, and nj_like
    """
    # validate the options
    assert options.nseconds >= 0
    assert 4 <= options.ntaxa <= 30
    assert options.nlengths >= 3
    assert options.nlengths % 2
    assert options.nsamples >= 1
    # the command line always uses this branch length distribution
    # and always shows a progress bar
    sampler = BranchLenSampler.UniformB()
    show_progress = True
    try:
        table_text = process(
                options.ntaxa, options.nseconds,
                options.nlengths, options.nsamples,
                options.nj_like, sampler, show_progress)
        print(table_text)
    except HandlingError as e:
        print('Error: ' + str(e))
if __name__ == '__main__':
    # describe the command line interface
    parser = optparse.OptionParser()
    # integer options as (flag, destination, default, help text)
    integer_options = [
            ('--ntaxa', 'ntaxa', 20,
                'number of taxa in each sampled tree topology'),
            ('--nseconds', 'nseconds', 0,
                'seconds to run or 0 to run until ctrl-c'),
            ('--nlengths', 'nlengths', 29,
                'number of sequence lengths'),
            ('--nsamples', 'nsamples', 5,
                'number of samples per sequence length')]
    for flag, dest, default, help_text in integer_options:
        parser.add_option(flag, dest=dest, type='int',
                default=default, help=help_text)
    parser.add_option('--nj-like', action='store_true', dest='nj_like',
            default=False,
            help='use a generalized NJ-like way to create successor distance matrices')
    # parse the command line and run the simulation
    options, args = parser.parse_args()
    main(options)