# ridge.py -- forked from alexhuth/ridge
import numpy as np
import logging
from utils import mult_diag, counter
import random
import itertools as itools
from mpi4py import MPI
import time
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import scikits.cuda.linalg as linalg
import scikits.cuda.misc as misc
linalg.init()
import splitdot
zs = lambda v: (v-v.mean(0))/v.std(0) ## z-score function
ridge_logger = logging.getLogger("ridge_corr")
def ridge(stim, resp, alpha, singcutoff=1e-10, normalpha=False):
"""Uses ridge regression to find a linear transformation of [stim] that approximates
[resp]. The regularization parameter is [alpha].
Parameters
----------
stim : array_like, shape (T, N)
Stimuli with T time points and N features.
resp : array_like, shape (T, M)
Responses with T time points and M separate responses.
alpha : float or array_like, shape (M,)
Regularization parameter. Can be given as a single value (which is applied to
all M responses) or separate values for each response.
    singcutoff : float
        Singular values of [stim] smaller than this are truncated before the weights
        are computed, for speed and numerical stability.
normalpha : boolean
Whether ridge parameters should be normalized by the largest singular value of stim. Good for
comparing models with different numbers of parameters.
Returns
-------
wt : array_like, shape (N, M)
Linear regression weights.
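
    Notes
    -----
    A minimal usage sketch (shapes here are illustrative assumptions, and the
    weight solve runs on the GPU, so a working pycuda/scikits.cuda setup is
    required)::

        stim = np.random.randn(1000, 200)    # T=1000 time points, N=200 features
        resp = np.random.randn(1000, 50)     # M=50 responses
        wt = ridge(stim, resp, alpha=10.0)   # one alpha shared across responses
        pred = np.dot(stim, wt)              # wt has shape (N, M) = (200, 50)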
"""
try:
# TODO determine if this should be a GPU op
# stim is TRxN (~1000x200 or 5000x15000)
# stim_gpu = gpuarray.to_gpu(stim)
# U_gpu, S_gpu, Vh_gpu = linalg.svd(stim, jobvt="O", jobu="O")
# U = U_gpu.get()
# S = S_gpu.get()
# del S_gpu
# del U_gpu
U,S,Vh = np.linalg.svd(stim, full_matrices=False)
    except np.linalg.LinAlgError:
        ridge_logger.info("NORMAL SVD FAILED, trying more robust dgesvd..")
from svd_dgesvd import svd_dgesvd
U,S,Vh = svd_dgesvd(stim, full_matrices=False)
    ## Truncate tiny singular values for speed and stability
    origsize = S.shape[0]
    ngoodS = np.sum(S > singcutoff)
    nbad = origsize - ngoodS
    U = U[:, :ngoodS]
    S = S[:ngoodS]
    Vh = Vh[:ngoodS]
# EXAMPLE FOR RUNNING GPU LINALG OPERATIONS
# Export data to GPU
# U_gpu = gpuarray.to_gpu(U)
# Do a transpose op on GPU
# UT_gpu = linalg.transpose(U_gpu)
# Export more data to GPU
# resp_gpu = gpuarray.to_gpu(np.nan_to_num(resp))
# Run a dot product
# UR_gpu = linalg.dot(UT_gpu, resp_gpu)
# Fetch data from the GPU
# UR = UR_gpu.get()
# The above GPU code can replace the following line:
UR = np.dot(U.T, np.nan_to_num(resp))
# TODO determine if this should be a GPU op
# U is output from SVD, I think TRxTR (~1000x1000 or 5000x5000)
# resp is TRxM (~1000x3000 or 5000x30000)
# Expand alpha to a collection if it's just a single value
    if np.isscalar(alpha):
        alpha = np.ones(resp.shape[1]) * alpha
# Normalize alpha by the LSV norm
norm = S[0]
if normalpha:
nalphas = alpha * norm
else:
nalphas = alpha
# Compute weights for each alpha
ualphas = np.unique(nalphas)
wt = np.zeros((stim.shape[1], resp.shape[1]), order='F') # Make wt column major
Vh_gpu = gpuarray.to_gpu(np.copy(Vh, order='F'))
for ua in ualphas:
selvox = np.nonzero(nalphas==ua)[0] # list of indices equal to ua
# TODO determine if this should be a GPU op
# Vh is output from SVD, i think NxN (~200x200 or 15000x15000)
# TODO determine how reduce works
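        # Ridge solution via the SVD: wt = Vh.T * diag(S/(S**2 + alpha**2)) * U.T * resp,
        # i.e. each singular component is shrunk by a factor S**2/(S**2 + alpha**2)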
Sd = S/(S**2+ua**2)
Sd_gpu = gpuarray.to_gpu(Sd)
UR_gpu = gpuarray.to_gpu(np.copy(UR[:,selvox], order='F'))
linalg.dot_diag(Sd_gpu, UR_gpu, overwrite=True)
del Sd_gpu
        if selvox.shape[0] > 5000:
            # Split very wide products into four column blocks to limit GPU memory use
            N = selvox.shape[0] // 4
            bounds = [0, N, 2 * N, 3 * N, selvox.shape[0]]
            for lo, hi in zip(bounds[:-1], bounds[1:]):
                inter_gpu = linalg.dot(Vh_gpu, UR_gpu[:, lo:hi], transa='T')
                wt[:, selvox[lo:hi]] = inter_gpu.get()
                del inter_gpu
else:
awt_gpu = linalg.dot(Vh_gpu, UR_gpu, transa='T')
wt[:,selvox] = awt_gpu.get()
del UR_gpu
del Vh_gpu
return wt
def ridge_corr(Rstim, Pstim, Rresp, Presp, alphas, normalpha=False, corrmin=0.2,
singcutoff=1e-10, use_corr=True, logger=ridge_logger):
"""Uses ridge regression to find a linear transformation of [Rstim] that approximates [Rresp],
then tests by comparing the transformation of [Pstim] to [Presp]. This procedure is repeated
for each regularization parameter alpha in [alphas]. The correlation between each prediction and
each response for each alpha is returned. The regression weights are NOT returned, because
computing the correlations without computing regression weights is much, MUCH faster.
Parameters
----------
Rstim : array_like, shape (TR, N)
Training stimuli with TR time points and N features. Each feature should be Z-scored across time.
Pstim : array_like, shape (TP, N)
Test stimuli with TP time points and N features. Each feature should be Z-scored across time.
Rresp : array_like, shape (TR, M)
Training responses with TR time points and M responses (voxels, neurons, what-have-you).
Each response should be Z-scored across time.
Presp : array_like, shape (TP, M)
Test responses with TP time points and M responses.
alphas : list or array_like, shape (A,)
Ridge parameters to be tested. Should probably be log-spaced. np.logspace(0, 3, 20) works well.
normalpha : boolean
Whether ridge parameters should be normalized by the largest singular value (LSV) norm of
Rstim. Good for comparing models with different numbers of parameters.
corrmin : float in [0..1]
Purely for display purposes. After each alpha is tested, the number of responses with correlation
greater than corrmin minus the number of responses with correlation less than negative corrmin
will be printed. For long-running regressions this vague metric of non-centered skewness can
give you a rough sense of how well the model is working before it's done.
singcutoff : float
The first step in ridge regression is computing the singular value decomposition (SVD) of the
stimulus Rstim. If Rstim is not full rank, some singular values will be approximately equal
to zero and the corresponding singular vectors will be noise. These singular values/vectors
should be removed both for speed (the fewer multiplications the better!) and accuracy. Any
singular values less than singcutoff will be removed.
use_corr : boolean
If True, this function will use correlation as its metric of model fit. If False, this function
will instead use variance explained (R-squared) as its metric of model fit. For ridge regression
this can make a big difference -- highly regularized solutions will have very small norms and
will thus explain very little variance while still leading to high correlations, as correlation
is scale-free while R**2 is not.
Returns
-------
Rcorrs : array_like, shape (A, M)
The correlation between each predicted response and each column of Presp for each alpha.
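
    Notes
    -----
    A minimal calling sketch (the train/test split and shapes are assumptions
    for illustration, not taken from this module)::

        alphas = np.logspace(0, 3, 20)
        Rcorrs = np.array(ridge_corr(Rstim, Pstim, Rresp, Presp, alphas))
        best_alphas = alphas[Rcorrs.argmax(axis=0)]  # best alpha per response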
"""
## Calculate SVD of stimulus matrix
logger.info("Doing SVD...")
try:
U,S,Vh = np.linalg.svd(Rstim, full_matrices=False)
    except np.linalg.LinAlgError:
        logger.info("NORMAL SVD FAILED, trying more robust dgesvd..")
from svd_dgesvd import svd_dgesvd
U,S,Vh = svd_dgesvd(Rstim, full_matrices=False)
## Truncate tiny singular values for speed
origsize = S.shape[0]
ngoodS = np.sum(S>singcutoff)
nbad = origsize-ngoodS
U = U[:,:ngoodS]
S = S[:ngoodS]
Vh = Vh[:ngoodS]
logger.info("Dropped %d tiny singular values.. (U is now %s)"%(nbad, str(U.shape)))
## Normalize alpha by the LSV norm
norm = S[0]
logger.info("Training stimulus has LSV norm: %0.03f"%norm)
if normalpha:
nalphas = alphas * norm
else:
nalphas = alphas
## Precompute some products for speed
# TODO determine if this should be a GPU
# U is svd output. I think TRxTR (~1000x1000 or 5000x5000)
# Rresp is TRxM (~1000x3000 or 5000x30000)
UR = np.dot(U.T, Rresp) ## Precompute this matrix product for speed
# TODO determine if this should be a GPU op
# Pstim is TPxN (~200x200 or 1000x15000)
# Vh is output from SVD, I think NxN (~200x200 or 15000x15000)
PVh = np.dot(Pstim, Vh.T) ## Precompute this matrix product for speed
#Prespnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) ## Precompute test response norms
zPresp = zs(Presp)
Prespvar = Presp.var(0)
Rcorrs = [] ## Holds training correlations for each alpha
for na, a in zip(nalphas, alphas):
#D = np.diag(S/(S**2+a**2)) ## Reweight singular vectors by the ridge parameter
D = S/(S**2+na**2) ## Reweight singular vectors by the (normalized?) ridge parameter
# TODO determine if this should be a GPU op
# mult_diag is diagonal matrix.
# UR is TRxM (~1000x3000 or 5000x30000)
pred = np.dot(mult_diag(D, PVh, left=False), UR) ## Best (1.75 seconds to prediction in test)
# pred = np.dot(mult_diag(D, np.dot(Pstim, Vh.T), left=False), UR) ## Better (2.0 seconds to prediction in test)
# pvhd = reduce(np.dot, [Pstim, Vh.T, D]) ## Pretty good (2.4 seconds to prediction in test)
# pred = np.dot(pvhd, UR)
# wt = reduce(np.dot, [Vh.T, D, UR]).astype(dtype) ## Bad (14.2 seconds to prediction in test)
# wt = reduce(np.dot, [Vh.T, D, U.T, Rresp]).astype(dtype) ## Worst
# pred = np.dot(Pstim, wt) ## Predict test responses
if use_corr:
#prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) ## Compute predicted test response norms
#Rcorr = np.array([np.corrcoef(Presp[:,ii], pred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]) ## Slowly compute correlations
#Rcorr = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*Prespnorms) ## Efficiently compute correlations
Rcorr = (zPresp*zs(pred)).mean(0)
else:
## Compute variance explained
resvar = (Presp-pred).var(0)
Rcorr = np.clip(1-(resvar/Prespvar), 0, 1)
Rcorr[np.isnan(Rcorr)] = 0
Rcorrs.append(Rcorr)
        log_template = "Training: alpha=%0.3f, mean corr=%0.5f, max corr=%0.5f, over-under(%0.2f)=%d"
        log_msg = log_template % (a,
                                  np.mean(Rcorr),
                                  np.max(Rcorr),
                                  corrmin,
                                  (Rcorr > corrmin).sum() - (-Rcorr > corrmin).sum())
        logger.info(log_msg)
return Rcorrs
def bootstrap_ridge(Rstim, Rresp, Pstim, Presp, alphas, nboots, chunklen, nchunks,
corrmin=0.2, joined=None, singcutoff=1e-10, normalpha=False, single_alpha=False,
use_corr=True, logger=ridge_logger, test_bootstrap=False):
"""Uses ridge regression with a bootstrapped held-out set to get optimal alpha values for each response.
[nchunks] random chunks of length [chunklen] will be taken from [Rstim] and [Rresp] for each regression
run. [nboots] total regression runs will be performed. The best alpha value for each response will be
averaged across the bootstraps to estimate the best alpha for that response.
If [joined] is given, it should be a list of lists where the STRFs for all the voxels in each sublist
will be given the same regularization parameter (the one that is the best on average).
Parameters
----------
Rstim : array_like, shape (TR, N)
Training stimuli with TR time points and N features. Each feature should be Z-scored across time.
Rresp : array_like, shape (TR, M)
Training responses with TR time points and M different responses (voxels, neurons, what-have-you).
Each response should be Z-scored across time.
Pstim : array_like, shape (TP, N)
Test stimuli with TP time points and N features. Each feature should be Z-scored across time.
Presp : array_like, shape (TP, M)
Test responses with TP time points and M different responses. Each response should be Z-scored across
time.
alphas : list or array_like, shape (A,)
Ridge parameters that will be tested. Should probably be log-spaced. np.logspace(0, 3, 20) works well.
nboots : int
The number of bootstrap samples to run. 15 to 30 works well.
chunklen : int
On each sample, the training data is broken into chunks of this length. This should be a few times
longer than your delay/STRF. e.g. for a STRF with 3 delays, I use chunks of length 10.
nchunks : int
The number of training chunks held out to test ridge parameters for each bootstrap sample. The product
of nchunks and chunklen is the total number of training samples held out for each sample, and this
product should be about 20 percent of the total length of the training data.
corrmin : float in [0..1]
Purely for display purposes. After each alpha is tested for each bootstrap sample, the number of
responses with correlation greater than this value will be printed. For long-running regressions this
can give a rough sense of how well the model works before it's done.
joined : None or list of array_like indices
If you want the STRFs for two (or more) responses to be directly comparable, you need to ensure that
the regularization parameter that they use is the same. To do that, supply a list of the response sets
that should use the same ridge parameter here. For example, if you have four responses, joined could
be [np.array([0,1]), np.array([2,3])], in which case responses 0 and 1 will use the same ridge parameter
        (which will be the parameter that is best on average for those two), and likewise for responses 2 and 3.
singcutoff : float
The first step in ridge regression is computing the singular value decomposition (SVD) of the
stimulus Rstim. If Rstim is not full rank, some singular values will be approximately equal
to zero and the corresponding singular vectors will be noise. These singular values/vectors
should be removed both for speed (the fewer multiplications the better!) and accuracy. Any
singular values less than singcutoff will be removed.
normalpha : boolean
Whether ridge parameters (alphas) should be normalized by the largest singular value (LSV)
norm of Rstim. Good for rigorously comparing models with different numbers of parameters.
single_alpha : boolean
Whether to use a single alpha for all responses. Good for identification/decoding.
use_corr : boolean
If True, this function will use correlation as its metric of model fit. If False, this function
will instead use variance explained (R-squared) as its metric of model fit. For ridge regression
this can make a big difference -- highly regularized solutions will have very small norms and
will thus explain very little variance while still leading to high correlations, as correlation
is scale-free while R**2 is not.
    test_bootstrap : boolean
        If True, each bootstrap sample is seeded with its global bootstrap index
        (via random.seed), making the held-out chunk selection reproducible across runs.
Returns
-------
wt : array_like, shape (N, M)
Regression weights for N features and M responses.
corrs : array_like, shape (M,)
Validation set correlations. Predicted responses for the validation set are obtained using the regression
weights: pred = np.dot(Pstim, wt), and then the correlation between each predicted response and each
column in Presp is found.
alphas : array_like, shape (M,)
The regularization coefficient (alpha) selected for each voxel using bootstrap cross-validation.
bootstrap_corrs : array_like, shape (A, M, B)
Correlation between predicted and actual responses on randomly held out portions of the training set,
for each of A alphas, M voxels, and B bootstrap samples.
valinds : array_like, shape (TH, B)
The indices of the training data that were used as "validation" for each bootstrap sample.
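
    Notes
    -----
    This function uses MPI: every rank runs the same script and the bootstrap
    samples are divided across ranks, so it should be launched with something
    like ``mpirun -n 4 python your_script.py`` (the script name is a placeholder).
    A minimal calling sketch with assumed shapes and settings::

        wt, corrs, valphas, allRcorrs, valinds = bootstrap_ridge(
            Rstim, Rresp, Pstim, Presp, alphas=np.logspace(0, 3, 20),
            nboots=16, chunklen=40, nchunks=20)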
"""
nresp, nvox = Rresp.shape
bestalphas = np.zeros((nboots, nvox)) # Will hold the best alphas for each voxel
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
    # Keep it simple: if the number of bootstraps is not divisible by the
    # number of workers, the bootstrap count is truncated to the nearest multiple
    local_boots = nboots // size
    if local_boots * size != nboots:
        logger.info("Number of workers does not cleanly divide requested bootstrap "
                    "count. Doing %d bootstraps total instead" % (local_boots * size,))
if test_bootstrap:
k = rank
else:
k = None
Rcmats = []
valinds = [] # Will hold the indices into the validation data for each bootstrap
    for i in range(local_boots):
        if k is None or k < nboots:
logger.info("Rank " + str(rank) + " running bootstrap " + str(i+1) +
"/"+ str(local_boots) + " with seed " + str(k))
if test_bootstrap:
random.seed(k)
logger.info("Selecting held-out test set..")
            allinds = list(range(nresp))
            indchunks = list(zip(*[iter(allinds)] * chunklen))
random.shuffle(indchunks)
logger.info(str(indchunks[0:3]))
heldinds = list(itools.chain(*indchunks[:nchunks]))
notheldinds = list(set(allinds)-set(heldinds))
RRstim = Rstim[notheldinds,:]
PRstim = Rstim[heldinds,:]
RRresp = Rresp[notheldinds,:]
PRresp = Rresp[heldinds,:]
# Run ridge regression using this test set
Rcmat = ridge_corr(RRstim, PRstim, RRresp, PRresp, alphas,
corrmin=corrmin, singcutoff=singcutoff,
normalpha=normalpha, use_corr=use_corr,
logger=logger)
else:
Rcmat = None
heldinds = None
Rcmat = np.array(Rcmat)
# Allocate an empty numpy array to hold MPI collected data
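        # Each rank contributes a (len(alphas), nvox) block; Allgather stacks
        # the blocks row-wise in rank order, and np.split below recovers them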
recv_Rcmats = np.empty((size*len(alphas), nvox), dtype=np.float64)
comm.barrier()
comm.Allgather(Rcmat, recv_Rcmats)
# Split recv'd data into 'size' separate arrays (from each worker)
Rcmats += np.split(recv_Rcmats, size)
comm.barrier()
valinds += comm.allgather(heldinds)
comm.barrier()
if test_bootstrap:
k += size
    valinds = [x for x in valinds if x is not None]
    valinds = np.array(valinds)
# Find best alphas
if nboots>0:
        Rcmats = [x for x in Rcmats if x is not None]
allRcorrs = np.dstack(Rcmats)
else:
allRcorrs = None
if not single_alpha:
if nboots==0:
raise ValueError("You must run at least one cross-validation step to assign "
"different alphas to each response.")
logger.info("Finding best alpha for each voxel..")
if joined is None:
# Find best alpha for each voxel
meanbootcorrs = allRcorrs.mean(2)
bestalphainds = np.argmax(meanbootcorrs, 0)
            valphas = np.asarray(alphas)[bestalphainds]
else:
# Find best alpha for each group of voxels
valphas = np.zeros((nvox,))
for jl in joined:
# Mean across voxels in the set, then mean across bootstraps
jcorrs = allRcorrs[:,jl,:].mean(1).mean(1)
bestalpha = np.argmax(jcorrs)
valphas[jl] = alphas[bestalpha]
else:
logger.info("Finding single best alpha..")
if nboots==0:
if len(alphas)==1:
bestalphaind = 0
bestalpha = alphas[0]
else:
raise ValueError("You must run at least one cross-validation step "
"to choose best overall alpha, or only supply one"
"possible alpha value.")
else:
meanbootcorr = allRcorrs.mean(2).mean(1)
bestalphaind = np.argmax(meanbootcorr)
bestalpha = alphas[bestalphaind]
valphas = np.array([bestalpha]*nvox)
logger.info("Best alpha = %0.3f"%bestalpha)
# Find weights
logger.info("Computing weights for each response using entire training set..")
wt = ridge(Rstim, Rresp, valphas, singcutoff=singcutoff, normalpha=normalpha)
# Predict responses on prediction set
logger.info("Predicting responses for predictions set..")
#pred = np.dot(Pstim, wt)
pred = splitdot.left_dot_col_major_gpu(Pstim, wt)
# Find prediction correlations
nnpred = np.nan_to_num(pred)
corrs = np.nan_to_num(np.array([np.corrcoef(Presp[:,ii], nnpred[:,ii].ravel())[0,1]
for ii in range(Presp.shape[1])]))
return wt, corrs, valphas, allRcorrs, valinds