forked from theJohnnyBrown/permanova
/
permanova.py
305 lines (222 loc) · 10 KB
/
permanova.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2010 - 2012, University of New Orleans
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Author: Johnny Brown
import numpy as np
import random as r
from itertools import permutations, product, chain
from scipy import stats
# Per the original author's note, scipy's own stats.ss chokes on generators
# (might be worth fixing upstream), so the attribute is rebound here to a
# plain-Python sum of squares that accepts any iterable.
def _sum_of_squares(values):
    """Return the sum of ``v * v`` over every ``v`` in the iterable ``values``."""
    return sum(v * v for v in values)


stats.ss = _sum_of_squares
def above_diagonal(n):
    """Yield every index pair ``(i, j)`` with ``0 <= i < j < n``.

    These are the coordinates of the cells strictly above the main diagonal
    of an ``n`` x ``n`` matrix, produced row by row (``i`` ascending, then
    ``j`` ascending).  Nothing is yielded when ``n`` is less than 2.
    """
    # ``range`` rather than the Python-2-only ``xrange`` so the generator
    # runs unchanged on both Python 2 and Python 3.
    for i in range(n):
        for j in range(i + 1, n):
            yield i, j
def permanova_oneway(dm, levels, permutations=200):
    """
    Performs one-way permutational ANOVA on the given distance matrix.

    One-way permanova tests the null hypothesis that distances between levels
    of the variable are not significantly different from distances within
    levels of the variable.

    The test-statistic is a multivariate analogue to Fisher's F-ratio and is
    calculated directly from any symmetric distance or dissimilarity matrix.
    P-values are then obtained using permutations.

    In the case of a Euclidean distance matrix calculated from only one
    variable, the F-value is the same as the traditional parametric
    univariate F-statistic.

    Parameters
    ----------
    dm : array_like
        The distance matrix of observations x observations. Represents a
        symmetric n x n matrix with zeros on the diagonal.
    levels : array_like
        An array indicating the levels of the variable at each observation,
        such that levels[i] == levels[j] means that dm[i] and dm[j] are rows
        corresponding to observations in the same level or treatment.
    permutations : int
        The number of random shuffles of ``levels`` used to compute the
        p-value. Default is 200. Each shuffle is drawn independently with
        ``random.shuffle``; repeated orderings are not filtered out.

    Returns
    -------
    F-value : float
        The F-value computed by ``f_oneway`` for the unpermuted levels.
    p-value : float
        The fraction of the shuffled label assignments whose F-value is
        greater than or equal to the observed F-value.

    Raises
    ------
    ValueError
        If ``permutations`` is smaller than 1 (the p-value would otherwise
        be a division by zero or a negative number).

    Notes
    -----
    Balanced designs (every level occurring the same number of times, n) are
    the case this code was written for. For example, if the values in levels
    are placebo, 5mg, and 10mg, then each value occurs n times and
    3n == len(levels) == len(dm) == len(dm[i]) (for all 0 <= i < len(dm))

    The algorithm is from Anderson[2]

    References
    ----------
    .. [1] Lowry, Richard. "Concepts and Applications of Inferential
       Statistics". Chapter 14. http://faculty.vassar.edu/lowry/ch14pt1.html
    .. [2] Anderson, Marti. A new method for non-parametric multivariate
       analysis of variance. 2001.
       http://www.entsoc.org/PDF/MUVE/6_NewMethod_MANOVA1_2.pdf
    .. [3] Jones, James. Lecture notes for Concepts of Statistics
       http://people.richland.edu/james/lecture/m170/ch13-2wy.html
    """
    # Validate first so a bad count fails fast, before any F computation.
    if permutations < 1:
        raise ValueError("permutations must be a positive integer")

    levels = list(levels)
    bigf = f_oneway(dm, levels)

    # TODO: sample from the set of *unique* permutations instead of shuffling
    # blindly (the original author's stated intent, never implemented).
    shuffledlevels = list(levels)  # private copy; shuffled in place below
    above = 0
    for _ in range(permutations):
        r.shuffle(shuffledlevels)
        if f_oneway(dm, shuffledlevels) >= bigf:
            above += 1

    p = above / float(permutations)
    return (bigf, p)
#FIXME the right way to reuse code for one-way and n-way is to add an arg f such that
#f(levels[i],levels[j]) returns True iff dm[i][j] should be included in the
#desired sum of squares
def f_oneway(dm, levels):
    """
    Compute the one-way pseudo F-statistic from a distance matrix.

    Parameters
    ----------
    dm : array_like
        Square distance matrix, one row/column per observation. Only the
        cells strictly above the diagonal are read.
    levels : iterable of hashable
        Group label of each observation; ``levels[i]`` labels row ``i``.

    Returns
    -------
    float
        ``(SSA / (a - 1)) / (SSW / (N - a))`` where ``N`` is the number of
        observations, ``a`` the number of distinct levels, ``SST`` the sum of
        squared above-diagonal distances divided by ``N``, ``SSW`` the sum
        over groups of (squared within-group distances / group size), and
        ``SSA = SST - SSW``.

    Raises
    ------
    ValueError
        If ``dm`` is not an ``N`` x ``N`` matrix with ``N == len(levels)``.

    Notes
    -----
    Each group's within-sum is divided by that group's own size, so unequal
    group sizes are handled; for equal group sizes this is the same as
    dividing the pooled within-sum by the common size.

    No guard is made against a single level (``a == 1``) or against every
    observation being its own level (``N == a``); both lead to a division
    by zero in the final ratio.
    """
    levels = list(levels)
    bign = len(levels)  # number of observations
    dm = np.asarray(dm, dtype=float)  # distance matrix
    # Explicit exception rather than ``assert`` so the check survives ``-O``.
    if dm.shape != (bign, bign):
        raise ValueError(
            "dm must be a square matrix with one row per entry of levels")

    # Row indices of each level, in ascending order.
    groups = {}
    for index, level in enumerate(levels):
        groups.setdefault(level, []).append(index)
    a = len(groups)  # number of levels

    # Squared distances above the diagonal; everything else zeroed out.
    squared = np.triu(dm, 1) ** 2

    # total sum of squared distances
    sst = squared.sum() / float(bign)

    # sum of within-group squares: because ``members`` is ascending, the
    # sub-matrix picked by np.ix_ keeps exactly the i < j pairs of the group.
    ssw = 0.0
    for members in groups.values():
        ssw += squared[np.ix_(members, members)].sum() / float(len(members))

    ssa = sst - ssw
    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    return fstat
def permanova_twoway(dm, levels, permutations=200):
    """
    Performs two-way permutational ANOVA on the given distance matrix.

    Three null hypotheses are tested: no interaction between the two
    variables, no effect of the first variable (a), and no effect of the
    second variable (b).

    The test-statistics are multivariate analogues to Fisher's F-ratio and
    are calculated directly from any symmetric distance or dissimilarity
    matrix. P-values are then obtained using permutations.

    Parameters
    ----------
    dm : array_like
        The distance matrix of observations x observations. Represents a
        symmetric n x n matrix with zeros on the diagonal.
    levels : array_like
        An array of pairs ``(a_level, b_level)`` giving the levels of the two
        variables at each observation, such that levels[i] == levels[j] means
        that dm[i] and dm[j] are rows corresponding to observations in the
        same combination of levels.
    permutations : int
        The number of random shuffles used for each of the three p-values.
        Default is 200. Shuffles are drawn independently with
        ``random.shuffle``; repeated orderings are not filtered out.

    Returns
    -------
    F-values : 3-tuple
        The computed F-values of each test. F-values[0] corresponds to the
        interaction of the two variables, F-values[1] corresponds to the
        effect of variable a, and F-values[2] corresponds to the effect of
        variable b.
    p-values : 3-tuple
        For each test, the fraction of shuffles whose F-value is greater than
        or equal to the observed one, ordered (interaction, a, b) as with
        the F-values.

    Raises
    ------
    ValueError
        If ``permutations`` is smaller than 1.

    Notes
    -----
    All sample groups must be the same size, n.
    For example, if the variables are drug and dose, with levels 5mg, 10mg,
    15mg and drug1, drug2, then each combination of drug type and dose level
    must be of size n, i.e. (5mg, drug1) must occur n times in the levels
    array and 6n == len(levels) == len(dm)

    Permutation scheme: the interaction test shuffles whole (a, b) pairs; the
    a test shuffles only the a labels against the fixed b labels; the b test
    shuffles only the b labels against the fixed a labels. The original
    author flagged this scheme as possibly incorrect -- treat the p-values
    with care until it has been checked against Anderson[2].

    The algorithm is from Anderson[2]

    References
    ----------
    .. [1] Lowry, Richard. "Concepts and Applications of Inferential
       Statistics". Chapter 14. http://faculty.vassar.edu/lowry/ch14pt1.html
    .. [2] Anderson, Marti. A new method for non-parametric multivariate
       analysis of variance. 2001.
       http://stg-entsoc.bivings.com/PDF/MUVE/6_NewMethod_MANOVA1_2.pdf
    .. [3] Jones, James. Lecture notes for Concepts of Statistics
       http://people.richland.edu/james/lecture/m170/ch13-2wy.html
    """
    # Validate first so a bad count fails fast, before any F computation.
    if permutations < 1:
        raise ValueError("permutations must be a positive integer")

    levels = list(levels)
    bigf_i, bigf_a, bigf_b = f_twoway(dm, levels)

    # Unshuffled label columns, built once instead of on every iteration.
    fixed_a = [pair[0] for pair in levels]
    fixed_b = [pair[1] for pair in levels]

    # A permuted statistic counts as "at least as extreme" when it is >= the
    # observed one, the same rule permanova_oneway applies.

    # Interaction: shuffle complete (a, b) pairs.
    shuffledlevels = list(levels)  # private copy; shuffled in place below
    above_i = 0
    for _ in range(permutations):
        r.shuffle(shuffledlevels)
        if f_twoway(dm, shuffledlevels)[0] >= bigf_i:
            above_i += 1

    # Effect of a: shuffle the a labels, keep the b labels in place.
    # list(zip(...)) because f_twoway needs len(); Python 3's zip is lazy.
    a_levels = list(fixed_a)
    above_a = 0
    for _ in range(permutations):
        r.shuffle(a_levels)
        if f_twoway(dm, list(zip(a_levels, fixed_b)))[1] >= bigf_a:
            above_a += 1

    # Effect of b: shuffle the b labels, keep the a labels in place.
    b_levels = list(fixed_b)
    above_b = 0
    for _ in range(permutations):
        r.shuffle(b_levels)
        if f_twoway(dm, list(zip(fixed_a, b_levels)))[2] >= bigf_b:
            above_b += 1

    p_i, p_a, p_b = [count / float(permutations)
                     for count in (above_i, above_a, above_b)]
    return ((bigf_i, bigf_a, bigf_b), (p_i, p_a, p_b))
def f_twoway(dm, levels):
    """
    Compute the three two-way pseudo F-statistics from a distance matrix.

    Parameters
    ----------
    dm : array_like
        Square distance matrix, one row/column per observation. Only the
        cells strictly above the diagonal contribute.
    levels : iterable of pairs
        ``(a_level, b_level)`` for each observation; ``levels[i]`` labels
        row ``i``. Any iterable is accepted (it is copied into a list), so
        the result of ``zip`` can be passed directly.

    Returns
    -------
    3-tuple of float
        ``(f_interaction, f_a, f_b)``; each is the corresponding sum of
        squares over its degrees of freedom, divided by the residual sum of
        squares over ``N - a*b``.

    Raises
    ------
    ValueError
        If ``dm`` is not an ``N`` x ``N`` matrix with ``N == len(levels)``.

    Notes
    -----
    The per-cell size is taken as ``N / (a*b)``, i.e. every (a, b)
    combination is assumed to occur equally often.
    """
    levels = list(levels)
    bign = len(levels)  # number of observations
    dm = np.asarray(dm)  # distance matrix
    if dm.shape != (bign, bign):
        raise ValueError(
            "dm must be a square matrix with one row per entry of levels")

    a = len(set(pair[0] for pair in levels))  # number of a-levels
    b = len(set(pair[1] for pair in levels))  # number of b-levels
    n = bign / float(a * b)  # number of observations per (a, b) cell

    # total: squared distances above the diagonal, over N
    sst = np.sum(np.triu(dm, 1) ** 2) / float(bign)
    # same level of both a and b (error, within-group)
    ssr = select_ss(dm, levels, lambda p, q: p == q) / float(n)
    # same level of a
    sswa = select_ss(dm, levels, lambda p, q: p[0] == q[0]) / float(b * n)
    # same level of b
    sswb = select_ss(dm, levels, lambda p, q: p[1] == q[1]) / float(a * n)

    ssa = sst - sswa  # effect of a
    ssb = sst - sswb  # effect of b
    ssab = sst - ssa - ssb - ssr  # interaction sum-of-squares

    # Residual mean square, the shared denominator of all three ratios.
    ms_residual = ssr / float(bign - a * b)
    f_interaction = (ssab / float((a - 1) * (b - 1))) / ms_residual
    f_a = (ssa / float(a - 1)) / ms_residual
    f_b = (ssb / float(b - 1)) / ms_residual
    return (f_interaction, f_a, f_b)
def select_ss(dm, levels, included):
    """
    Sum the squared distances of the selected above-diagonal pairs.

    Parameters
    ----------
    dm : sequence of sequences
        Distance matrix, indexed as ``dm[i][j]``; ``len(dm)`` fixes the
        range of indices visited.
    levels : iterable
        Label of each observation. Copied into a list, so a one-shot
        iterable such as the result of ``zip`` is accepted.
    included : callable
        ``included(levels[i], levels[j])`` is called for every ``i < j``;
        the pair contributes ``dm[i][j] ** 2`` when it returns a true value.

    Returns
    -------
    number
        The sum of the selected squared distances (``0`` when no pair is
        selected).
    """
    levels = list(levels)
    selected = (dm[i][j] for i, j in above_diagonal(len(dm))
                if included(levels[i], levels[j]))
    # Built-in sum instead of the module-level override of ``stats.ss`` so
    # this function no longer depends on patching scipy at import time.
    return sum(d * d for d in selected)