-
Notifications
You must be signed in to change notification settings - Fork 1
/
20100607a.py
183 lines (170 loc) · 5.74 KB
/
20100607a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Compute the Tracy-Widom statistic given a .hud file.
Following Patterson et al. each row of the input .hud file
gives an OTU name followed by presence or absence of each SNP.
"""
from StringIO import StringIO
import math
import os
import argparse
import numpy as np
from SnippetUtil import HandlingError
import Form
import FormOut
import hud
import EigUtil
# categorization tags for this snippet
g_tags = ['pca:compute']
# default example input in .hud format:
# each row is an OTU name followed by 0/1 SNP presence values
g_default_hud_string = """
IC31 1 1 0 0
IC32 1 1 1 0
IC33 1 0 1 1
IC34 0 0 1 0
""".strip()
def get_corrected_structure(crit, eigenvalues, m, n_prime):
    """
    Find significant axes of variation by repeated Tracy-Widom tests.
    Starting from the full spectrum, test the leading eigenvalue of the
    remaining spectrum, peeling off one eigenvalue per iteration,
    until a test statistic fails to exceed the critical value.
    @param crit: critical value of the Tracy-Widom statistic
    @param eigenvalues: all eigenvalues sorted in descending order
        (any sequence of numbers; converted to a float numpy array)
    @param m: the number of OTUs
    @param n_prime: the effective number of SNPs
    @return: significant statistics, one insignificant statistic
    @raise ValueError: if the number of eigenvalues does not equal m
    """
    if len(eigenvalues) != m:
        msg_a = 'expected the number of eigenvalues to equal '
        msg_b = 'the number of OTUs'
        raise ValueError(msg_a + msg_b)
    # Accept plain lists as well as numpy arrays; the original code
    # required a numpy array because it called w.sum().  The float dtype
    # also protects the proportion from Python 2 integer division.
    w = np.asarray(eigenvalues, dtype=float)
    significant_xs = []
    insignificant_x = None
    while insignificant_x is None:
        m_curr = len(w)
        proportion = w[0] / w.sum()
        L_prime = (m_curr-1)*proportion
        x_prime = get_tracy_widom_statistic(m_curr, n_prime, L_prime)
        if x_prime > crit:
            significant_xs.append(x_prime)
        else:
            insignificant_x = x_prime
        # drop the leading eigenvalue and retest the remainder
        w = w[1:]
    return significant_xs, insignificant_x
def get_tracy_widom_statistic(m, n, L):
    """
    Standardize a normalized eigenvalue into a Tracy-Widom statistic.
    The interpretation of the parameters is purposely vague.
    It depends on whether you are doing linkage correction.
    @param m: like the number of OTUs
    @param n: like the number of SNPs
    @param L: like a normalized principal eigenvalue
    @return: the Tracy-Widom statistic
    """
    root_n = math.sqrt(n-1)
    root_m = math.sqrt(m)
    alpha = root_n + root_m
    # centering and scaling constants of the approximation
    mu = (alpha*alpha) / n
    cube_root = (1/root_n + 1/root_m)**(1./3.)
    sigma = (alpha / n) * cube_root
    return (L - mu) / sigma
def process(args, hud_lines):
    """
    Run the Tracy-Widom population-structure analysis on .hud data.
    @param args: an object whose boolean attributes sum_to_n and sum_to_1
        select how the reported eigenvalues are normalized
    @param hud_lines: lines of a .hud file
    @return: results in convenient text form
    """
    out = StringIO()
    # get the ordered names from the .hud file
    names, data = hud.decode(hud_lines)
    # create the floating point count matrix
    C_full = np.array(data)
    m_full, n_full = C_full.shape
    # remove invariant columns; only informative SNPs are analyzed
    C = np.vstack([v for v in C_full.T if len(set(v))>1]).T
    # get the shape of the matrix
    m, n = C.shape
    # get the column means
    u = C.mean(axis=0)
    # get the centered and normalized counts matrix
    # NOTE(review): presumably the binomial-variance normalization of
    # Patterson et al.; invariant columns were removed above, so each
    # mean is strictly between 0 and 1 and the denominator is nonzero
    M = (C - u) / np.sqrt(u * (1 - u))
    # construct the sample covariance matrix
    X = np.dot(M, M.T) / n
    # get the eigendecomposition of the covariance matrix
    evals, evecs = EigUtil.eigh(X)
    # first and second moments of the eigenvalue spectrum
    L1 = evals.sum()
    L2 = np.dot(evals, evals)
    proportion = evals[0] / L1
    # compute the relative size of the first eigenvalue
    L = m*proportion
    # compute the Tracy-Widom statistic
    x = get_tracy_widom_statistic(m, n, L)
    # do linkage correction
    # NOTE(review): presumably the effective marker count n' estimate
    # from Patterson et al. -- confirm against the paper
    n_prime = ((m+1)*L1*L1) / ((m-1)*L2 - L1*L1)
    # detect additional structure using alpha level of 0.05
    crit = 0.9794
    if n_prime < n:
        L_prime = (m-1)*proportion
        x_prime = get_tracy_widom_statistic(m, n_prime, L_prime)
        sigs, insig = get_corrected_structure(crit, evals, m, n_prime)
    else:
        # the correction is unusable when n' is not below n,
        # so fall back to the uncorrected SNP count
        sigs, insig = get_corrected_structure(crit, evals, m, n)
    # print some infos
    print >> out, 'number of isolates:'
    print >> out, m_full
    print >> out
    print >> out, 'total number of SNPs:'
    print >> out, n_full
    print >> out
    print >> out, 'number of informative SNPs:'
    print >> out, n
    print >> out
    print >> out, 'effective number of linkage-corrected SNPs:'
    if n_prime < n:
        print >> out, n_prime
    else:
        print >> out, '[sample is too degenerate for estimation]'
    print >> out
    print >> out, 'Tracy-Widom statistic (linkage-naive):'
    print >> out, x
    print >> out
    print >> out, 'Tracy-Widom statistic (linkage-corrected):'
    if n_prime < n:
        print >> out, x_prime
    else:
        print >> out, '[sample is too degenerate for estimation]'
    print >> out
    print >> out, 'proportion of variance explained by principal axis:'
    print >> out, proportion
    print >> out
    print >> out, 'number of significant axes of variation:'
    print >> out, len(sigs)
    print >> out
    print >> out, 'significant Tracy-Widom statistics:'
    for sig in sigs:
        print >> out, sig
    print >> out
    print >> out, 'first insignificant Tracy-Widom statistic:'
    print >> out, insig
    print >> out
    # report each OTU's loading on the principal axis, sorted by loading
    print >> out, 'principal axis projection:'
    for loading, name in sorted(zip(evecs[0] * evals[0], names)):
        print >> out, '\t'.join([name, str(loading)])
    print >> out
    # evals should sum to the number of OTUs
    evals_sum = sum(evals)
    if args.sum_to_n:
        print >> out, 'eigenvalues normalized to sum to the number of OTUs:'
        for w in evals:
            print >> out, m_full * w / float(evals_sum)
    elif args.sum_to_1:
        print >> out, 'eigenvalues normalized to sum to 1.0:'
        for w in evals:
            print >> out, w / float(evals_sum)
    return out.getvalue().rstrip()
def get_form():
    """
    Build the list of web form input objects for this snippet.
    @return: the body of a form
    """
    # the .hud file contents are pasted into a multiline text area
    hud_input = Form.MultiLine('hud',
            'contents of a .hud file',
            g_default_hud_string)
    # the user picks exactly one eigenvalue normalization
    normalization_choice = Form.RadioGroup(
            'normalization', 'eigenvalue normalization', [
            Form.RadioItem('sum_to_1', 'sum to 1.0', True),
            Form.RadioItem('sum_to_n', 'sum to the number of OTUs')])
    return [hud_input, normalization_choice]
def get_form_out():
    """
    Specify the output format of this snippet.
    @return: a report output object named 'out'
    """
    report = FormOut.Report('out')
    return report
def get_response_content(fs):
    """
    Respond to a form submission.
    @param fs: a form submission object with .hud text and
        normalization flags
    @return: the analysis report text, terminated by a newline
    """
    hud_lines = fs.hud.splitlines()
    report = process(fs, hud_lines)
    return report + '\n'