forked from david-zwicker/python-functions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stats_functions.py
339 lines (248 loc) · 9.85 KB
/
stats_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
'''
Created on Nov 21, 2014
@author: zwicker
'''
from __future__ import division
import numpy as np
from scipy import integrate, misc, optimize, special, stats
class StatisticsAccumulator(object):
    """ class that can be used to calculate statistics of sets of arbitrarily
    shaped data sets. This uses an online (Welford-style) algorithm, allowing
    the data to be added one after another without storing the full data set.
    """

    def __init__(self, ddof=0, shape=None, dtype=np.double, ret_cov=False):
        """ initialize the accumulator

        `ddof` is the delta degrees of freedom, which is used in the formula
            for the standard deviation.
        `shape` is the shape of the data. If omitted it will be read from the
            first value
        `dtype` is the numpy dtype of the internal accumulator
        `ret_cov` determines whether the covariances are also calculated
        """
        self.count = 0
        self.ddof = ddof
        self.dtype = dtype
        self.ret_cov = ret_cov

        if shape is None:
            # lazy initialization: state is set up on the first `add`
            self.shape = None
            self._mean = None
            self._M2 = None
        else:
            self.shape = shape
            size = np.prod(shape)
            # internal state is kept as flat arrays; reshaped on access
            self._mean = np.zeros(size, dtype=dtype)
            if ret_cov:
                self._M2 = np.zeros((size, size), dtype=dtype)
            else:
                self._M2 = np.zeros(size, dtype=dtype)

    @property
    def mean(self):
        """ return the mean of all added values """
        if self.shape is None:
            return self._mean
        else:
            return self._mean.reshape(self.shape)

    @property
    def cov(self):
        """ return the covariance matrix

        Raises RuntimeError if fewer than `ddof + 1` values were added and
        ValueError if the accumulator was created with `ret_cov=False`.
        """
        if self.count <= self.ddof:
            raise RuntimeError('Too few items to calculate variance')
        elif not self.ret_cov:
            raise ValueError('The covariance matrix was not calculated')
        else:
            return self._M2 / (self.count - self.ddof)

    @property
    def var(self):
        """ return the variance """
        if self.count <= self.ddof:
            raise RuntimeError('Too few items to calculate variance')
        elif self.ret_cov:
            # variances are the diagonal of the covariance accumulator
            var = np.diag(self._M2) / (self.count - self.ddof)
        else:
            var = self._M2 / (self.count - self.ddof)

        if self.shape is None:
            return var
        else:
            return var.reshape(self.shape)

    @property
    def std(self):
        """ return the standard deviation """
        return np.sqrt(self.var)

    def _initialize(self, value):
        """ initialize the internal state with a given value """
        # state needs to be initialized
        self.count = 1
        if hasattr(value, '__iter__'):
            # make a copy with the internal dtype; using `np.array` (instead
            # of `np.asarray` + `np.ravel`) guarantees that the accumulator
            # does not alias the caller's array, which the in-place updates
            # in `add` would otherwise silently modify
            value_arr = np.array(value, self.dtype)
            self.shape = value_arr.shape
            size = value_arr.size
            # store 1d version of it
            self._mean = value_arr.ravel()
            if self.ret_cov:
                self._M2 = np.zeros((size, size), self.dtype)
            else:
                self._M2 = np.zeros(size, self.dtype)
        else:
            self.shape = None
            self._mean = value
            self._M2 = 0

    def add(self, value):
        """ add a value to the accumulator """
        if self._mean is None:
            self._initialize(value)

        else:
            # update internal state with Welford's online algorithm
            if self.shape is not None:
                value = np.ravel(value)
            self.count += 1
            delta = value - self._mean          # deviation from old mean
            self._mean += delta / self.count    # update the running mean
            if self.ret_cov:
                # accumulate sum of outer(delta_old, delta_new); dividing by
                # (count - ddof) in `cov` then yields the covariance matrix
                self._M2 += np.outer(delta, value - self._mean)
            else:
                # scalar/elementwise analog of the covariance update
                self._M2 += delta * (value - self._mean)

    def add_many(self, arr_or_iter):
        """ adds many values from an array or an iterator """
        for value in arr_or_iter:
            self.add(value)
def mean_std_online(arr_or_iter, ddof=0, shape=None):
    """ calculates the mean and the standard deviation of the given data.
    If the data is an iterator, the values are calculated memory efficiently
    with an online algorithm, which does not store all the intermediate data.

    `ddof` is the delta degrees of freedom, which is used in the formula for
    the standard deviation.

    Returns a pair (mean, std); both are `numpy.nan` when too few values were
    supplied to compute them.
    """
    acc = StatisticsAccumulator(ddof=ddof, shape=shape)
    acc.add_many(arr_or_iter)

    if acc.mean is None:
        mean = np.nan
    else:
        mean = acc.mean

    try:
        std = acc.std
    except (RuntimeError, ValueError):
        # RuntimeError is raised when fewer than `ddof + 1` values were added
        # (e.g. an empty iterator); fall back to NaN in that case
        std = np.nan

    return mean, std
def mean_std_frequency_table(frequencies, values=None, ddof=0):
    """ calculates the mean and the standard deviation of the given data.

    `frequencies` is an array with which `values` were observed. If `values`
        is None, it is set to `values = np.arange(len(frequencies))`.
    `ddof` is the delta degrees of freedom, which is used in the formula for
        the standard deviation.

    Returns a pair (mean, std).
    """
    # coerce the inputs so plain python sequences are also accepted
    frequencies = np.asarray(frequencies)
    if values is None:
        values = np.arange(len(frequencies))
    else:
        values = np.asarray(values)

    count = frequencies.sum()
    sum1 = (values * frequencies).sum()
    sum2 = (values**2 * frequencies).sum()

    mean = sum1 / count
    # shortcut formula for the variance: E[x^2] - E[x]^2, with ddof correction
    var = (sum2 - sum1**2 / count) / (count - ddof)
    std = np.sqrt(var)

    return mean, std
def lognorm_mean_var_to_mu_sigma(mean, variance, definition='scipy'):
    """ determines the parameters of the log-normal distribution such that the
    distribution yields a given mean and variance. The optional parameter
    `definition` can be used to choose a definition of the resulting parameters
    that is suitable for the given software package ('scipy' or 'numpy'). """
    mean_sq = mean ** 2
    # `scale_param` is scipy's `scale`; numpy expects its logarithm instead
    scale_param = mean_sq / np.sqrt(mean_sq + variance)
    width_param = np.sqrt(np.log(1 + variance / mean_sq))

    if definition == 'scipy':
        return scale_param, width_param
    if definition == 'numpy':
        return np.log(scale_param), width_param
    raise ValueError('Unknown definition `%s`' % definition)
def lognorm_mean_var(mean, variance):
    """ returns a scipy lognormal distribution object parameterized by its
    mean and its variance. """
    params = lognorm_mean_var_to_mu_sigma(mean, variance, 'scipy')
    return stats.lognorm(scale=params[0], s=params[1])
def sampling_distribution_std(x, std, num):
    """
    Sampling distribution of the standard deviation, see
    http://en.wikipedia.org/wiki/Chi_distribution
    """
    # the density vanishes for negative arguments
    if x < 0:
        return 0

    # rescale to the standard chi distribution and evaluate its pdf
    factor = np.sqrt(num) / std
    z = x * factor
    density = (2 ** (1 - 0.5 * num) * z ** (num - 1)
               * np.exp(-0.5 * z * z) / special.gamma(0.5 * num))

    # transform the density back to the original variable
    return density * factor
def sampling_distribution_cv(x, cv, num):
    """
    Sampling distribution of the coefficient of variation, see
    W. A. Hendricks and K. W. Robey. The Sampling Distribution of the
    Coefficient of Variation. Ann. Math. Statist. 7, 129-132 (1936).

    `x` is the point at which the density is evaluated, `cv` is the true
    coefficient of variation, and `num` is the number of samples.
    """
    # the density vanishes for negative arguments
    if x < 0:
        return 0

    x2 = x*x
    # calculate the sum; `scipy.misc.factorial` was deprecated and removed
    # from scipy, so we use `scipy.special.factorial` instead
    factorial = special.factorial
    res = sum(
        factorial(num - 1) * special.gamma(0.5*(num - i)) * num**(0.5*i) / (
            factorial(num - 1 - i) * factorial(i) * 2**(0.5*i) * cv**i
            * (1 + x2)**(0.5*i)
        )
        # the sum runs over every other index, starting at 0 or 1 such that
        # the parity matches the formula in the reference
        for i in range(1 - num%2, num, 2)
    )

    # multiply by the other terms
    res *= 2./(np.sqrt(np.pi)*special.gamma(0.5*(num-1)))
    res *= x**(num-2)/((1 + x2)**(0.5*num))
    res *= np.exp(-0.5*num/(cv**2)*x2/(1 + x2))

    return res
def confidence_interval(value, distribution=None, args=None, guess=None,
                        confidence=0.95):
    """ returns the half-width of the symmetric confidence interval of `value`
    under the sampling distribution `distribution(x, value, *args)`.

    `distribution` defaults to `sampling_distribution_std`, in which case
        `args` must supply the number of observations.
    `args` are extra positional arguments passed to `distribution`.
    `guess` is the initial guess for the root search; defaults to 0.1*value.
    `confidence` is the probability mass the interval should contain.
    """
    if guess is None:
        guess = 0.1*value
    if distribution is None:
        distribution = sampling_distribution_std
    if args is None:
        # fix: unpacking `*args` below fails with a TypeError on None
        args = ()

    # create partial function
    distr = lambda y: distribution(y, value, *args)

    def rhs(x):
        """ deviation of the contained probability mass from `confidence` """
        return confidence - integrate.quad(distr, value-x, value+x)[0]

    # find the half-width where the interval holds the requested mass
    res = optimize.newton(rhs, guess, tol=1e-4)
    return res
def confidence_interval_mean(std, num, confidence=0.95):
    """ calculates the confidence interval of the mean given a standard
    deviation and a number of observations, assuming a normal distribution """
    # estimate of the standard error of the mean
    standard_error = std / np.sqrt(num)
    # two-sided quantile of the student-t distribution with num-1 dof
    quantile = 0.5 + 0.5 * confidence
    t_factor = stats.t(num - 1).ppf(quantile)
    return t_factor * standard_error
def confidence_interval_std(std, num, confidence=0.95):
    """ calculates the confidence interval of the standard deviation given a
    standard deviation and a number of observations, assuming a normal
    distribution """
    dof = num - 1                 # degrees of freedom
    chi_dist = stats.chi(dof)
    half = 0.5 * confidence
    # bounds follow from inverting the chi distribution of sqrt(dof)*s/sigma
    lower_bound = np.sqrt(dof) * std / chi_dist.ppf(0.5 + half)
    upper_bound = np.sqrt(dof) * std / chi_dist.ppf(0.5 - half)
    # report half the width of the (asymmetric) interval
    return 0.5 * (upper_bound - lower_bound)
def estimate_mean(data, confidence=0.95):
    """ estimate mean and the associated standard error of the mean of a
    data set """
    # use the unbiased sample standard deviation for the error estimate
    sample_std = data.std(ddof=1)
    error = confidence_interval_mean(sample_std, len(data), confidence)
    return data.mean(), error
def estimate_std(data, confidence=0.95):
    """ estimate the standard deviation and the associated error of a
    data set """
    std_value = data.std()
    error = confidence_interval_std(std_value, len(data), confidence)
    return std_value, error
def estimate_cv(data, confidence=0.95):
    """ estimate the coefficient of variation and the associated error """
    # coefficient of variation: std relative to the mean
    cv_value = data.std() / data.mean()
    error = confidence_interval(cv_value, sampling_distribution_cv,
                                (len(data),))
    return cv_value, error
# this module only provides library functions; running it directly does nothing
if __name__ == '__main__':
    print("This file is intended to be used as a module only.")