-
Notifications
You must be signed in to change notification settings - Fork 0
/
CML.py
416 lines (383 loc) · 14.5 KB
/
CML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
import sys
from math import *
import numpy as np
import sympy.mpmath as sy
import scipy.misc as fac
import scipy.special as sp
from scipy.optimize import minimize
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
class CML(object):
    """Composite marginal likelihood model over distance classes.

    For every distance class ``i`` the model computes

        phi_i  = f(dist_i) / (2 * k * pi * sigma**2 * density + g0)
        r_i    = (phi_i - phi_bar) / (1 - phi_bar)
        pIBD_i = fhat + (1 - fhat) * r_i

    where ``phi_bar`` is the ``sz``-weighted mean of ``phi`` and ``fhat`` is
    ``sum(data) / sum(sz)``.  The composite log-likelihood is the sum over
    classes of ``data_i * log(pIBD_i) + (sz_i - data_i) * log(1 - pIBD_i)``.

    ``f`` is either an infinite sum evaluated numerically (full model) or a
    truncated series / Bessel ``K0`` pair (approximate model).  The
    approximate model calls ``self.t_series`` and ``self.bessel``, which this
    class does not define; they must be supplied by a subclass.

    The code is written to run on both Python 2 and Python 3.
    """

    # Value handed back to the optimiser for inadmissible parameters.
    # ``sys.maxint`` only exists on Python 2; ``sys.maxsize`` is the fallback.
    _PENALTY = getattr(sys, "maxint", sys.maxsize)

    # Distance classes farther than this many sigmas use the Bessel form.
    _SPLIT_SIGMAS = 6

    def __init__(self, mu, sigma, density, dataIn,
                 distIn, szIn, ploidy, approx=True):
        """Store the parameters, load the data and pick the model variant.

        Args:
            mu: rate entering the model through ``z = exp(-2 * mu)``.
            sigma: initial dispersal parameter.
            density: initial density parameter.
            dataIn: per-distance-class counts (the binomial successes).
            distIn: distance of every class, same length as ``dataIn``.
            szIn: per-class sample sizes, or one scalar used for all classes.
            ploidy: multiplier ``k`` in ``2 * k * pi * sigma**2 * density``.
            approx: use the approximate model when true, else the full one.
        """
        self.k = ploidy
        self.mu = mu
        self.mu2 = -2.0 * self.mu
        self.s = sigma
        self.ss = sigma * sigma
        self.de = density
        self.z = exp(self.mu2)
        self.sqrz = sqrt(1 - self.z)
        self.g0 = log(1 / float(self.sqrz))
        # Placeholders; set_data() below fills every one of them.
        self.ndc = 0
        self.tsz = 0
        self.fhat = 0
        self.dist = np.empty(0, dtype=float)
        self.dist2 = np.empty(0, dtype=float)
        self.data = np.empty(0, dtype=int)
        self.sz = np.empty(0, dtype=int)
        self.set_data(dataIn, distIn, szIn)
        self.ml = np.zeros(2, dtype=float)
        self.approx = approx
        # Switch between the full model and the approximated model.
        if approx:
            self.update = self.apx_update
            self.likelihood = self.apx_likelihood
            self.bootstrap_helper = self.apx_bootstrap_helper
            self.gen_data = self.apx_gen_data
        else:
            self.update = self.full_update
            self.likelihood = self.full_likelihood
            self.bootstrap_helper = self.full_bootstrap_helper
            self.gen_data = self.full_gen_data

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _denominator(self, ss, density):
        """Return ``2 * k * pi * ss * density + g0``."""
        return 2.0 * self.k * pi * ss * density + self.g0

    def _find_split(self, sigma):
        """Return the index of the first class farther than 6 * sigma.

        Returns ``self.ndc`` when no class is that far away, so the series
        form is then used for every class.
        """
        cutoff = self._SPLIT_SIGMAS * sigma
        for i, d in enumerate(self.dist):
            if d > cutoff:
                return i
        return self.ndc

    def _apx_phi(self, split, denom):
        """Return phi for every class under the approximate model."""
        phi = np.zeros(self.ndc)
        for i in range(self.ndc):
            d = self.dist[i]
            if i >= split:
                p = self.bessel(d)
            elif d == 0:
                p = self.g0
            else:
                p = self.t_series(d)
            phi[i] = p / denom
        return phi

    def _full_phi(self, ss, denom, steps):
        """Return phi for every class under the full model.

        Non-zero classes are evaluated with ``sy.nsum`` over t = 1..inf of
        ``exp(-2 * mu * t) * exp(-d2 / (4 * ss * t)) / (2 * t)`` where ``d2``
        is the squared distance.
        """
        phi = np.zeros(self.ndc)
        for i, d2 in enumerate(self.dist2):
            if d2 == 0:
                p = self.g0
            else:
                p = sy.nsum(
                    lambda t, d2=d2: exp(self.mu2 * t) *
                    exp(-d2 / (4.0 * ss * t)) / (2.0 * t),
                    [1, sy.inf], error=False, verbose=False,
                    method='euler-maclaurin', steps=[steps])
            phi[i] = p / denom
        return phi

    def _composite_loglik(self, phi):
        """Return the composite log-likelihood for the given phi values.

        Classes whose probability is not positive are reported and left out
        of the sum.
        """
        phi_bar = np.dot(phi, self.sz) / float(self.tsz)
        cml = 0
        for i in range(self.ndc):
            r = (phi[i] - phi_bar) / (1.0 - phi_bar)
            p_ibd = self.fhat + (1 - self.fhat) * r
            if p_ibd <= 0:
                print(("WARNING: Probability of IBD has fallen "
                       "below zero for distance class {}.").format(
                           self.dist[i]))
                print("This marginal likelihood will not be "
                      "included in composite likelihood.")
                continue
            cml += (self.data[i] * log(p_ibd) +
                    (self.sz[i] - self.data[i]) * log(1 - p_ibd))
        return cml

    def _class_sizes(self, nSamples):
        """Return per-class sample sizes as an array of length ``ndc``.

        Raises:
            ValueError: if an array is given whose length is not ``ndc``.
        """
        if np.isscalar(nSamples):
            return np.repeat(nSamples, self.ndc)
        sizes = np.asarray(nSamples)
        if len(sizes) != self.ndc:
            raise ValueError(
                "ERROR: data and distance class "
                "arrays are not equal length")
        return sizes

    def _simulate(self, phi, sizes, fbar, nSamples, shape):
        """Draw binomial counts for the given phi values."""
        phi_bar = np.dot(phi, sizes) / float(np.sum(sizes))
        r = (phi - phi_bar) / (1.0 - phi_bar)
        p_ibd = np.array(fbar + (1.0 - fbar) * r, dtype=float)
        return np.random.binomial(nSamples, p_ibd, shape)

    @staticmethod
    def _ci_bounds(stat, alpha):
        """Return the lower/upper percentile entries of a sorted array.

        Indices are clamped to the last element so ``alpha == 0`` does not
        index one past the end.
        """
        n = len(stat)
        lower = min(int((alpha / 2.0) * n), n - 1)
        upper = min(int((1 - alpha / 2.0) * n), n - 1)
        return stat[lower], stat[upper]

    def _bootstrap_fill(self, stat, samples, dClass, sz,
                        sigma, density, fail_msg=None):
        """Fit every replicate and store its statistic in a free slot.

        A free slot is the first entry of ``stat`` that is still zero.
        Returns ``(stat, number_of_failed_fits)``.
        """
        fail = 0
        for r, d, s in zip(samples, dClass, sz):
            # set_data() sorts the classes and refreshes dist2, tsz, ndc and
            # fhat, so the likelihood sees one consistent replicate.
            self.set_data(r, d, s)
            if not self.max_likelihood(sigma, density).success:
                if fail_msg is not None:
                    print(fail_msg)
                fail += 1
                continue
            stat[np.where(stat == 0)[0][0]] = self.get_nb()
        return stat, fail

    # ------------------------------------------------------------------
    # objective functions
    # ------------------------------------------------------------------
    def apx_update(self, ar):
        """Objective for the approximate model: negative log-likelihood.

        Args:
            ar: sequence ``(sigma, density)``.

        Returns:
            ``-likelihood()`` for the new parameters, or a very large
            penalty when either parameter is not positive.  The stored
            parameters are left untouched in the penalty case.
        """
        sigma, density = ar[0], ar[1]
        if sigma <= 0 or density <= 0:
            print("negative arguments")
            return self._PENALTY
        self.s = sigma
        self.de = density
        self.ss = sigma * sigma
        # Recomputed on every call so a previous sigma cannot leave a stale
        # split point behind.
        self.split = self._find_split(sigma)
        return -self.likelihood()

    def full_update(self, ar):
        """Objective for the full model: negative log-likelihood.

        Args:
            ar: sequence ``(sigma, density)``.

        Returns:
            ``-likelihood()`` for the new parameters, or a very large
            penalty when either parameter is not positive.  The stored
            parameters are left untouched in the penalty case.
        """
        sigma, density = ar[0], ar[1]
        if sigma <= 0 or density <= 0:
            print("negative arguments")
            return self._PENALTY
        self.s = sigma
        self.de = density
        self.ss = sigma * sigma
        return -self.likelihood()

    def apx_likelihood(self):
        """Return the composite log-likelihood of the approximate model.

        Classes before ``self.split`` use ``g0`` (distance 0) or
        ``self.t_series``; the remaining classes use ``self.bessel``.
        """
        denom = self._denominator(self.ss, self.de)
        return self._composite_loglik(self._apx_phi(self.split, denom))

    def full_likelihood(self):
        """Return the composite log-likelihood of the full model."""
        denom = self._denominator(self.ss, self.de)
        return self._composite_loglik(self._full_phi(self.ss, denom, 100))

    # ------------------------------------------------------------------
    # data handling
    # ------------------------------------------------------------------
    def raw_to_dc(self, rawData):
        """Collapse a sequence of raw values into per-distance-class counts.

        Every pair of non-NaN entries is assigned to class
        ``min(|j - i|, n - |j - i|)``, i.e. positions are treated as lying on
        a ring of ``n`` sites.  A pair counts as a match when the two values
        are equal after truncation to ``int``.

        Args:
            rawData: sequence of numbers; NaN entries are skipped.

        Returns:
            ``(ibd, sz)``: two float arrays of length ``n // 2`` holding the
            matching pairs and the total pairs of every class.
        """
        n = len(rawData)
        half = n // 2
        ibd = np.zeros(half)
        sz = np.zeros(half)
        valid = [i for i in range(n) if not np.isnan(rawData[i])]
        for pos, i in enumerate(valid):
            left = int(rawData[i])
            for j in valid[pos + 1:]:
                k = j - i
                if k > half:
                    k = n - k
                if left == int(rawData[j]):
                    ibd[k - 1] += 1
                sz[k - 1] += 1
        return ibd, sz

    def sort_data(self, data, dc, sz):
        """Store the three arrays sorted by distance class.

        Only ``self.data``, ``self.dist`` and ``self.sz`` are written; use
        ``set_data`` to refresh the derived quantities as well.
        """
        ordered = sorted(zip(dc, data, sz))
        self.data = np.array([row[1] for row in ordered], dtype=int)
        self.dist = np.array([row[0] for row in ordered], dtype=float)
        self.sz = np.array([row[2] for row in ordered], dtype=int)

    def set_data(self, newData, dc, sz):
        """Load a data set and refresh every derived quantity.

        Args:
            newData: per-class counts.
            dc: per-class distances.
            sz: per-class sample sizes, or one scalar used for all classes.

        Raises:
            ValueError: if the three arrays differ in length.
        """
        if np.isscalar(sz):
            sz = [sz] * len(newData)
        if not (len(newData) == len(dc) == len(sz)):
            raise ValueError(
                "ERROR: data and distance class arrays are not equal length")
        self.sort_data(newData, dc, sz)
        self.dist2 = self.dist ** 2
        self.tsz = np.sum(self.sz)
        self.ndc = len(self.dist)
        self.fhat = np.sum(self.data) / float(self.tsz)

    def get_nb(self):
        """Return ``2 * k * pi * sigma**2 * density`` at the stored optimum."""
        s = self.ml[0]
        d = self.ml[1]
        return 2 * self.k * pi * s * s * d

    # ------------------------------------------------------------------
    # estimation
    # ------------------------------------------------------------------
    def max_likelihood(self, startSig, startDen, max_iter=10000,
                       tol=0.0001, verbose=False):
        """Minimise ``self.update`` with bounded TNC.

        Args:
            startSig: starting value for sigma.
            startDen: starting value for density.
            max_iter: iteration limit handed to the optimiser.
            tol: tolerance handed to the optimiser.
            verbose: forwarded as the optimiser's ``disp`` option.

        Returns:
            The ``scipy.optimize`` result object; its ``x`` is also stored
            in ``self.ml``.
        """
        start = np.array([startSig, startDen])
        # minimum density is 0.1, otherwise we get probabilities less than 0
        bnds = ((2 ** (-52), None), (0.1, None))
        x = minimize(self.update, start, options={
            'maxiter': max_iter, 'disp': verbose},
            tol=tol, bounds=bnds, method='TNC')
        self.ml = x.x
        return x

    def jackknife_CI(self, rawData, alpha, sigma, density, verbose=False,
                     fhat_range=(0.27, 0.33)):
        """Leave-one-out confidence interval for ``get_nb()``.

        Args:
            rawData: raw values, converted with ``raw_to_dc``.
            alpha: the interval covers ``1 - alpha``.
            sigma: starting sigma for every fit.
            density: starting density for every fit.
            verbose: forwarded to the fit on the complete data.
            fhat_range: ``(low, high)``; the data set is rejected when
                ``fhat`` lies outside it.  ``None`` disables the filter.
                The default reproduces the previously hard-coded bounds.

        Returns:
            ``[lower, estimate, upper, replicates]``, or ``False`` when the
            data set is rejected, the fit on the complete data fails, or no
            replicate could be fitted.
        """
        raw = np.array(rawData, dtype=float)
        org_data, org_sz = self.raw_to_dc(raw)
        org_dc = np.arange(1, len(org_data) + 1)
        self.set_data(org_data, org_dc, org_sz)
        if fhat_range is not None:
            low, high = fhat_range
            if self.fhat < low or self.fhat > high:
                return False
        ml = self.max_likelihood(sigma, density, verbose=verbose)
        if not ml.success:
            return False
        org_nb = self.get_nb()
        n = len(raw)
        stat = np.zeros(n)
        for i in range(n):
            jack = raw.copy()
            jack[i] = np.nan  # NaN entries are skipped by raw_to_dc
            jack_data, jack_sz = self.raw_to_dc(jack)
            self.set_data(jack_data, org_dc, jack_sz)
            if not self.max_likelihood(sigma, density).success:
                print("JACKKNIIFE FAIL")
                stat[i] = np.nan
                continue
            stat[i] = self.get_nb()
        stat = stat[~np.isnan(stat)]
        stat.sort()
        # Put the complete data set back, including the derived fields.
        self.set_data(org_data, org_dc, org_sz)
        self.ml = ml.x
        if len(stat) == 0:
            return False
        lower, upper = self._ci_bounds(stat, alpha)
        return [lower, org_nb, upper, stat]

    def apx_bootstrap_helper(self, stat, samples, dClass,
                             sz, sigma, density, verbose):
        """Fit bootstrap replicates with the approximate model.

        The approximate model needs the distance classes sorted to find the
        cutoff point; loading each replicate through ``set_data`` does that.

        Returns:
            ``(stat, fail)`` where ``fail`` counts the replicates whose fit
            did not succeed.
        """
        return self._bootstrap_fill(
            stat, samples, dClass, sz, sigma, density)

    def full_bootstrap_helper(self, stat, samples, dClass,
                              sz, sigma, density, verbose):
        """Fit bootstrap replicates with the full model.

        Returns:
            ``(stat, fail)`` where ``fail`` counts the replicates whose fit
            did not succeed; every failure is reported on stdout.
        """
        return self._bootstrap_fill(
            stat, samples, dClass, sz, sigma, density, fail_msg="BOOTFAIL")

    def bootstrap_CI(self, nSamples, alpha, sigma, density, verbose=False):
        """Bootstrap confidence interval for ``get_nb()``.

        Distance classes are resampled with replacement; failed fits are
        redrawn until ``nSamples`` replicates have succeeded.

        Args:
            nSamples: number of successful replicates wanted.
            alpha: the interval covers ``1 - alpha``.
            sigma: starting sigma for every fit.
            density: starting density for every fit.
            verbose: forwarded to the fit on the original data.

        Returns:
            ``[lower, estimate, upper, sigma_hat, density_hat, fhat,
            replicates]``, or ``False`` when the fit on the original data
            fails.
        """
        # remember original data
        org_data = self.data
        org_dc = self.dist
        org_sz = self.sz
        ml = self.max_likelihood(sigma, density, verbose=verbose)
        if not ml.success:
            return False
        org_nb = self.get_nb()
        sd = self.ml
        fbar = self.fhat
        n = len(self.data)
        stat = np.zeros(nSamples)
        go = nSamples
        while go:
            # make indexes for sampling with replacement
            # carry over the distance classes as well
            idx = np.random.randint(0, n, (go, n))
            stat, go = self.bootstrap_helper(
                stat, org_data[idx], org_dc[idx], org_sz[idx],
                sigma, density, verbose)
        stat.sort()
        # return data (and every derived field) to the original values
        self.set_data(org_data, org_dc, org_sz)
        self.ml = ml.x
        lower, upper = self._ci_bounds(stat, alpha)
        return [lower, org_nb, upper, sd[0], sd[1], fbar, stat]

    def landscape_plot(self, res=0.1, sigLow=0.1, sigUp=4.0,
                       denLow=0.1, denUp=6.0, fileName=None):
        """Contour plot of ``-log(update(sigma, density))`` on a grid.

        Intended to be run after ``max_likelihood``; the stored optimum is
        marked with a star.

        Args:
            res: grid step for both axes.
            sigLow, sigUp: sigma range (upper end excluded).
            denLow, denUp: density range (upper end excluded).
            fileName: when given, the figure is also written there as PDF.
        """
        sig = np.arange(sigLow, sigUp, res)
        den = np.arange(denLow, denUp, res)
        X, Y = np.meshgrid(sig, den)
        # update() overwrites the working parameters; put them back after
        # the grid has been evaluated.
        saved = (self.s, self.de, self.ss)
        try:
            Z = np.array([[-log(self.update(np.array([i, j])))
                           for i in sig] for j in den])
        finally:
            self.s, self.de, self.ss = saved
        plt.rcParams['xtick.direction'] = 'out'
        plt.rcParams['ytick.direction'] = 'out'
        plt.figure(figsize=(10, 10))
        CS = plt.contour(X, Y, Z)
        plt.clabel(CS, inline=1, fontsize=10)
        plt.title(r'Likelihood Landscape of $\sigma$ vs Density')
        plt.xlabel(r'$\sigma$')
        plt.ylabel('density')
        plt.plot(self.ml[0], self.ml[1], "*k", markersize=20)
        # Save before show(): once the window is closed the figure may be
        # gone and the file would come out empty.
        if fileName is not None:
            plt.savefig(fileName, format='pdf')
        plt.show()

    # ------------------------------------------------------------------
    # simulation
    # ------------------------------------------------------------------
    def apx_gen_data(self, fbar, nSamples, sigma, density, nreps):
        """Simulate counts from the approximate model.

        Args:
            fbar: value used in place of ``fhat``.
            nSamples: sample size per class, scalar or array of length
                ``ndc``.
            sigma: sigma to simulate under.
            density: density to simulate under.
            nreps: number of simulated data sets.

        Returns:
            Integer array of shape ``(nreps, ndc)``.

        Raises:
            ValueError: if ``nSamples`` is an array of the wrong length.
        """
        sizes = self._class_sizes(nSamples)
        denom = self._denominator(sigma * sigma, density)
        # t_series() and bessel() read self.s, so point it at the requested
        # sigma while phi is evaluated and restore it afterwards.
        saved_s = self.s
        self.s = sigma
        try:
            phi = self._apx_phi(self._find_split(sigma), denom)
        finally:
            self.s = saved_s
        # simulate values from binomial distribution
        return self._simulate(phi, sizes, fbar, nSamples,
                              (nreps, self.ndc))

    def full_gen_data(self, fbar, nSamples, sigma, density, nreps=None):
        """Simulate counts from the full model.

        Args:
            fbar: value used in place of ``fhat``.
            nSamples: sample size per class, scalar or array of length
                ``ndc``.
            sigma: sigma to simulate under.
            density: density to simulate under.
            nreps: number of simulated data sets; ``None`` (the default)
                returns a single data set as a 1-D array.

        Returns:
            Integer array of shape ``(ndc,)`` or ``(nreps, ndc)``.

        Raises:
            ValueError: if ``nSamples`` is an array of the wrong length.
        """
        sizes = self._class_sizes(nSamples)
        ss = sigma * sigma
        phi = self._full_phi(ss, self._denominator(ss, density), 1000)
        shape = None if nreps is None else (nreps, self.ndc)
        # simulate values from binomial distribution
        return self._simulate(phi, sizes, fbar, nSamples, shape)
class ApxCML(CML):
    """CML variant that uses the approximate model.

    Supplies the two distance functions the approximate likelihood calls:
    a truncated power series (``t_series``) and a Bessel ``K0`` form
    (``bessel``).
    """

    def __init__(self, mu, sigma, density, dataIn,
                 distIn, szIn, ploidy, n_terms):
        """Build the model and precompute the series coefficients.

        Args:
            mu, sigma, density, dataIn, distIn, szIn, ploidy: forwarded
                unchanged to ``CML.__init__``.
            n_terms: number of terms kept in ``t_series``.
        """
        # Explicit base call (not super()) keeps this usable on Python 2
        # regardless of whether CML is a new-style class.
        CML.__init__(
            self, mu, sigma, density, dataIn, distIn, szIn, ploidy, True)
        # Until an update moves it, every class uses the series form.
        self.split = self.ndc
        self.n_t = n_terms
        # plog[i] is the polylogarithm of order i + 1 evaluated at z.
        self.plog = np.array([sy.polylog(i + 1, self.z)
                              for i in range(self.n_t)])

    def t_series(self, x):
        """Evaluate the truncated alternating series at distance ``x``.

        Computes, for ``t = 0 .. n_t - 1``::

            sum (-1)**t * plog[t] * (x / s)**(2t) / ((2t)!! * 2**(t + 1))

        The power of ``x / s`` and the double factorial are carried from one
        term to the next rather than rebuilt, and ``x`` and ``s`` are divided
        before being raised to a power, so the two powers cannot overflow
        separately.
        """
        ratio_sq = (x / float(self.s)) ** 2
        total = 0.0
        ratio_pow = 1.0   # (x / s) ** (2 t)
        dbl_fact = 1      # (2 t)!!  (exact integer)
        pow2 = 1          # becomes 2 ** (t + 1) inside the loop
        for t in range(self.n_t):
            pow2 <<= 1
            if t:
                dbl_fact *= 2 * t
            term = (self.plog[t] * ratio_pow) / (dbl_fact * pow2)
            if t % 2 == 0:
                total += term
            else:
                total -= term
            ratio_pow *= ratio_sq
        return total

    def bessel(self, x):
        """Return ``K0(x / s * sqrt(1 - z))``."""
        return sp.k0((x / float(self.s)) * self.sqrz)
class FullCML(CML):
    """CML variant that always uses the full (non-approximated) model."""

    def __init__(self, mu, sigma, density, dataIn, distIn, szIn, ploidy):
        """Forward every argument to ``CML.__init__`` with approx disabled."""
        use_approximation = False
        CML.__init__(self, mu, sigma, density, dataIn, distIn, szIn,
                     ploidy, use_approximation)