/
matustats.py
122 lines (111 loc) · 3.78 KB
/
matustats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
from scipy import stats
__all__=['lognorm','gamma','weibull','exgaus']
def latinSquare(N=4):
    """Return the 2**N x N matrix of all N-bit binary (0/1) combinations.

    Column i toggles with period 2**(N-1-i), so the rows enumerate the
    N-bit binary numbers 0 .. 2**N - 1 in ascending order.

    N : number of binary factors (columns); default 4 gives a 16 x 4 design.

    NOTE(review): despite the name this is a two-level full-factorial
    design matrix, not a Latin square in the combinatorial sense.
    """
    U = np.zeros((2 ** N, N), dtype=int)
    for i in range(N):
        # Integer floor division; the original Py2-style `/` produced
        # floats under Python 3 and relied on truncation during the
        # assignment into the int array to give the same values.
        U[:, i] = np.mod(np.arange(U.shape[0]) // 2 ** (N - 1 - i), 2)
    return U
def lognorm(mu=1, sigma=1, phi=0):
    """Shifted log-normal distribution: the law of exp(X) - phi
    where X ~ Normal(mu, sigma).

    mu    - mean of the underlying normal X
    sigma - standard deviation of X
    phi   - left shift of the support (support starts at -phi)

    Returns a frozen scipy.stats distribution.
    """
    return stats.lognorm(s=sigma, scale=np.exp(mu), loc=-phi)
def gamma(mu=1, sigma=1, phi=0):
    """Gamma distribution parametrized by its mean and standard deviation.

    mu    - mean of the (unshifted) distribution
    sigma - standard deviation
    phi   - left shift of the support (support starts at -phi)

    Returns a frozen scipy.stats distribution with shape (mu/sigma)**2
    and scale sigma**2/mu, so that mean = mu - phi and std = sigma.
    """
    shape = (mu / sigma) ** 2
    scale = sigma ** 2 / mu
    return stats.gamma(a=shape, scale=scale, loc=-phi)
def weibull(scale=1, shape=1, loc=0):
    """Weibull (minimum) distribution shifted left by `loc`.

    Returns the frozen law of W - loc with W ~ Weibull_min(shape, scale),
    i.e. scipy.stats.weibull_min with shape parameter `shape`, scale
    `scale` and location -loc.  With shape=1, loc=0 this reduces to an
    Exponential distribution with mean `scale`.
    """
    frozen = stats.weibull_min(c=shape, loc=-loc, scale=scale)
    return frozen
from scipy.special import erfc
def exgaus(x, mu, sigma, lamda):
    """Exponentially modified Gaussian probability density.

    Density at x of G + E with G ~ Normal(mu, sigma) and
    E ~ Exponential(rate=lamda):

        f(x) = lamda * exp(lamda*(mu + lamda*sigma**2/2 - x))
                     * Phi((x - mu - lamda*sigma**2) / sigma)

    x     - evaluation point(s), scalar or ndarray
    mu    - mean of the Gaussian component
    sigma - standard deviation of the Gaussian component
    lamda - rate of the exponential component

    Returns the pdf value(s) at x.
    """
    # BUG FIX: the original set l = lamda/2 and so evaluated the density
    # of an EMG with rate lamda/2, contradicting the documented meaning
    # of `lamda`.  The factor 1/2 belongs only to the erfc form of the
    # pdf, not to the norm.cdf form used here.
    lam = lamda
    return lam * np.exp(lam * (mu + lam * sigma ** 2 / 2. - x)) \
        * stats.norm.cdf((x - mu - sigma ** 2 * lam) / sigma)
def pcaEIG(A, highdim=None):
    """ performs principal components analysis
        (PCA) on the n-by-p data matrix A
        Rows of A correspond to observations, columns to features/attributes.

        highdim : if True, use the n-by-n (Gram matrix) eigendecomposition
            trick, which is efficient when n < m; if False, decompose the
            m-by-m covariance directly; if None (default), choose
            automatically based on the shape of A.

        Returns :
        coeff :
            is a p-by-p matrix, each column contains coefficients
            for one principal component.
        score :
            the principal component scores ie the representation
            of A in the principal component space. Rows of SCORE
            correspond to observations, columns to components.
        latent :
            a vector containing the normalized eigenvalues (percent variance explained)
            of the covariance matrix of A.
        Reference: Bishop, C. (2006) PRML, Chap. 12.1
    """
    A = np.array(A)
    n, m = A.shape
    if highdim is None:
        # BUG FIX: the original overwrote the caller-supplied value
        # unconditionally (highdim = n < m), making the parameter dead.
        highdim = n < m
    assert n != m  # square input makes the auto-detection ambiguous
    # NOTE(review): centers each ROW (observation), not each column
    # (feature) -- unusual for PCA; confirm this is the intended contract.
    M = (A - A.mean(1)[:, np.newaxis])  # mean-center data
    if highdim:
        # eigendecompose the n-by-n covariance of observations, then map
        # the eigenvectors back into feature space
        [latent, coeff] = np.linalg.eigh(np.cov(M))
        coeff = M.T.dot(coeff)
        denom = np.sqrt((A.shape[1] - 1) * latent[np.newaxis, :])
        coeff /= denom  # make unit vector length
    else:
        [latent, coeff] = np.linalg.eigh(np.cov(M.T))
    score = M.dot(coeff)
    latent /= latent.sum()  # normalize to fraction of variance explained
    # sort components by decreasing explained variance
    indx = np.argsort(latent)[::-1]
    latent = latent[indx]
    coeff = coeff[:, indx]
    score = score[:, indx]
    assert np.allclose(np.linalg.norm(coeff, axis=0), 1)
    return coeff, score, latent
def pcaNIPALS(K=5,tol=1e-4,verbose=False):
    ''' Iterative PCA via the NIPALS algorithm.

        Extracts the first K principal components one at a time by power
        iteration on an externally stored data matrix, deflating it after
        each component.  Operates entirely through helpers that are NOT
        defined in this file (XmeanCenter, XgetColumn, Xleftmult,
        XminusOuterProduct) and a global path `inpath`; confirm these are
        in scope before calling.  Results are written to disk with
        np.save; nothing is returned.

        K       - number of principal components to extract
        tol     - convergence threshold on the eigenvalue change
        verbose - print progress information

        Reference:
        Section 2.2 in Andrecut, M. (2009).
        Parallel GPU implementation of iterative PCA algorithms.
        Journal of Computational Biology, 16(11), 1593-1599.
        TODO - replace custom linear algebra (e.g. XmeanCenter) with
        numpy algebra
    '''
    if verbose: print('Mean centering columns')
    XmeanCenter(1)  # external helper: presumably mean-centers the stored data matrix
    latent=[]  # collected eigenvalue estimates, one per component
    for k in range(K):
        lam0=0;lam1=np.inf  # previous / current eigenvalue estimate
        T=np.matrix(XgetColumn(k))  # initialize score vector from column k of the data
        if verbose: print('Computing PC ',k)
        h=0  # iteration counter; hard cap of 100 iterations below
        while abs(lam1-lam0)>tol and h<100:
            P=Xleftmult(T,True)  # presumably P = X^T T (loadings step); helper not visible
            P=P/np.linalg.norm(P)  # normalize loadings to unit length
            T=Xleftmult(P)  # presumably T = X P (scores step)
            lam0=lam1
            lam1=np.linalg.norm(T)  # eigenvalue estimate = |T|
            if verbose: print('\t Iteration '+str(h)+', Convergence =', abs(lam1-lam0))
            h+=1
        latent.append(lam1)
        XminusOuterProduct(T,P)  # deflate: presumably X <- X - T P^T
        #np.save(inpath+'T%02d'%k,T)
        np.save(inpath+'coeffT%d'%k,P.T)  # persist loadings of component k
    np.save(inpath+'latent',latent)  # persist all eigenvalue estimates
def invdigamma(x):
    """Inverse of the digamma function via Newton iteration.

    Uses the initialization from Minka (2000), "Estimating a Dirichlet
    distribution" (appendix): y0 = exp(x) + 0.5 for x >= -2.22 and
    y0 = -1/(x + EulerGamma) otherwise (digamma(1) == -EulerGamma),
    then refines with Newton steps y <- y - (digamma(y) - x)/polygamma(1, y).

    x : scalar or array-like of reals
    Returns an ndarray y with digamma(y) ~= x (to ~1e-8).

    Example::
        x = np.linspace(0.1, 10, 11)
        np.allclose(invdigamma(digamma(x)), x)
    """
    from scipy.special import digamma, polygamma
    x = np.atleast_1d(np.asarray(x, dtype=float))  # accept scalars/lists too
    m = x >= -2.22
    # piecewise initial guess (Minka 2000)
    y = m * (np.exp(x) + 0.5) - (1 - m) / (x - digamma(1))
    y[np.isnan(y)] = 1.0
    resid = digamma(y) - x
    # BUG FIX: the original looped while np.min(resid) > 1e-8, which stops
    # as soon as ANY component's (signed) residual drops below threshold,
    # leaving the rest unconverged; iterate on the worst-case absolute
    # residual instead.  (A leftover debug print(y) was also removed.)
    it = 0
    while np.max(np.abs(resid)) > 1e-8 and it < 100:
        y = y - resid / polygamma(1, y)
        resid = digamma(y) - x
        it += 1
    return y