def ridge_corr(Rstim, Pstim, Rresp, Presp, U, S, Vh, alphas, filename,
               dtype=np.single, corrmin=0.2, use_corr=True):
    """Score ridge predictions of [Presp] for every alpha, given SVD factors.

    For each value in [alphas] the test responses are predicted from
    [Pstim] and compared against [Presp]. A table of
    ``[alpha, mean score]`` rows is written to [filename] with ``np.save``
    and the per-response scores are returned.

    Parameters
    ----------
    Rstim : unused
        Accepted for signature compatibility; not read by this function.
    Pstim : array_like
        Test stimuli; multiplied by ``Vh.T``.
    Rresp : array_like
        Training responses; multiplied by ``U.T``.
    Presp : ndarray
        Test responses. Must provide ``.var`` and support subtraction.
    U, S, Vh : ndarray
        Presumably the (truncated) SVD factors of the training stimuli --
        TODO confirm against the caller.
    alphas : iterable of float
        Ridge parameters to evaluate.
    filename : str or file-like
        Destination handed to ``np.save``.
    dtype : unused
        Not read by this function.
    corrmin : unused
        Not read by this function.
    use_corr : bool
        If True the score is ``(zs(Presp) * zs(pred)).mean(0)``; otherwise
        it is ``1 - resvar / Presp.var(0)`` clipped to ``[0, 1]``.

    Returns
    -------
    Rcorrs : list of ndarray
        One score array per alpha, in the order of [alphas].
    """
    # Alpha-independent products, computed once up front.
    train_proj = np.dot(U.T, Rresp)
    test_proj = np.dot(Pstim, Vh.T)
    z_test_resp = zs(Presp)
    test_resp_var = Presp.var(0)

    Rcorrs = []   # per-alpha score arrays (returned)
    summary = []  # [alpha, mean score] rows (saved to disk)
    for alpha in alphas:
        # Ridge shrinkage applied to each singular value.
        shrink = S / (S ** 2 + alpha ** 2)
        pred = np.dot(mult_diag(shrink, test_proj, left=False), train_proj)

        if not use_corr:
            residual_var = (Presp - pred).var(0)
            score = np.clip(1 - (residual_var / test_resp_var), 0, 1)
        else:
            score = (z_test_resp * zs(pred)).mean(0)

        # Replace NaN/inf (e.g. from a zero-variance division) with finite values.
        score = np.nan_to_num(score)
        Rcorrs.append(score)
        summary.append([alpha, np.mean(score)])

    np.save(filename, np.array(summary))
    return Rcorrs
def ridge_corr(Rstim, Pstim, Rresp, Presp, alphas, normalpha=False, corrmin=0.2,
               singcutoff=1e-10, use_corr=True, logger=ridge_logger):
    """Evaluate ridge regression on held-out data for each alpha in [alphas].

    A linear map from [Rstim] to [Rresp] is fit implicitly through the SVD
    of [Rstim]; the resulting prediction for [Pstim] is then compared with
    [Presp]. Regression weights are never formed or returned, because
    computing the predictions directly from the SVD factors is much faster.

    Parameters
    ----------
    Rstim : array_like, shape (TR, N)
        Training stimuli with TR time points and N features. Each feature
        should be Z-scored across time.
    Pstim : array_like, shape (TP, N)
        Test stimuli with TP time points and N features. Each feature
        should be Z-scored across time.
    Rresp : array_like, shape (TR, M)
        Training responses with TR time points and M responses (voxels,
        neurons, what-have-you). Each response should be Z-scored across
        time.
    Presp : array_like, shape (TP, M)
        Test responses with TP time points and M responses.
    alphas : list or array_like, shape (A,)
        Ridge parameters to be tested. Should probably be log-spaced;
        np.logspace(0, 3, 20) works well. When [normalpha] is True this
        value is multiplied by a scalar, so it must support that.
    normalpha : boolean
        If True, each alpha is multiplied by the largest singular value
        (LSV) of [Rstim] before use. Good for comparing models with
        different numbers of parameters.
    corrmin : float in [0..1]
        Purely for display purposes. After each alpha is tested, the
        number of responses with score above corrmin minus the number with
        score below negative corrmin is logged, as a rough progress signal
        for long-running regressions.
    singcutoff : float
        Singular values of [Rstim] that are not greater than this cutoff
        are discarded, together with their singular vectors, for both
        speed and accuracy when [Rstim] is not full rank.
    use_corr : boolean
        If True, the score is the correlation between prediction and
        response. If False, a signed square root of variance explained,
        ``sign(Rsq) * sqrt(|Rsq|)``, is used instead, where the response
        variance is taken as the average of 1 and the actual variance.
        Highly regularized solutions have small norms and so explain
        little variance while still correlating well, since correlation is
        scale-free and R**2 is not.
    logger : logging.Logger-like
        Object whose ``info`` method receives progress messages.

    Returns
    -------
    Rcorrs : list of A arrays, each of shape (M,)
        The score for each response, one array per alpha.
    """
    ## Decompose the training stimuli
    logger.info("Doing SVD...")
    try:
        U, S, Vh = np.linalg.svd(Rstim, full_matrices=False)
    except np.linalg.LinAlgError:
        logger.info("NORMAL SVD FAILED, trying more robust dgesvd..")
        from text.regression.svd_dgesvd import svd_dgesvd
        U, S, Vh = svd_dgesvd(Rstim, full_matrices=False)

    ## Keep only the components whose singular value exceeds the cutoff
    n_total = S.shape[0]
    n_keep = np.sum(S > singcutoff)
    n_dropped = n_total - n_keep
    U, S, Vh = U[:, :n_keep], S[:n_keep], Vh[:n_keep]
    logger.info("Dropped %d tiny singular values.. (U is now %s)" % (n_dropped, str(U.shape)))

    ## Optionally rescale the alphas by the largest singular value
    lsv_norm = S[0]
    logger.info("Training stimulus has LSV norm: %0.03f" % lsv_norm)
    scaled_alphas = alphas * lsv_norm if normalpha else alphas

    ## Alpha-independent matrix products, computed once
    train_proj = np.dot(U.T, Rresp)
    test_proj = np.dot(Pstim, Vh.T)
    z_test_resp = zs(Presp)

    ## The variance used for R**2 is the midpoint between 1 and the actual variance
    actual_var = Presp.var(0)
    assumed_var = (np.ones_like(actual_var) + actual_var) / 2.0
    logger.info("Average difference between actual & assumed Prespvar: %0.3f" % (actual_var - assumed_var).mean())

    summary_fmt = "Training: alpha=%0.3f, mean corr=%0.5f, max corr=%0.5f, over-under(%0.2f)=%d"
    Rcorrs = []  ## One score array per alpha
    for scaled_alpha, alpha in zip(scaled_alphas, alphas):
        ## Ridge shrinkage of each singular value
        shrink = S / (S ** 2 + scaled_alpha ** 2)

        ## Scaling test_proj first and multiplying by train_proj afterwards was
        ## the fastest ordering the original author timed (1.75 s, versus
        ## 14.2 s when the weight matrix was built explicitly).
        pred = np.dot(mult_diag(shrink, test_proj, left=False), train_proj)

        if not use_corr:
            ## Signed square root of variance explained
            residual_var = (Presp - pred).var(0)
            rsq = 1 - (residual_var / assumed_var)
            score = np.sqrt(np.abs(rsq)) * np.sign(rsq)
        else:
            score = (z_test_resp * zs(pred)).mean(0)

        score[np.isnan(score)] = 0
        Rcorrs.append(score)

        n_over = (score > corrmin).sum()
        n_under = (-score > corrmin).sum()
        logger.info(summary_fmt % (alpha, np.mean(score), np.max(score), corrmin, n_over - n_under))

    return Rcorrs
else: nalphas = alphas ## Precompute some products for speed UR = np.dot(U.T, Rresp) ## Precompute this matrix product for speed PVh = np.dot(Pstim, Vh.T) ## Precompute this matrix product for speed #Prespnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) ## Precompute test response norms zPresp = zs(Presp) Prespvar = Presp.var(0) Rcorrs = [] ## Holds training correlations for each alpha for na, a in zip(nalphas, alphas): #D = np.diag(S/(S**2+a**2)) ## Reweight singular vectors by the ridge parameter D = S/(S**2+na**2) ## Reweight singular vectors by the (normalized?) ridge parameter pred = np.dot(mult_diag(D, PVh, left=False), UR) ## Best (1.75 seconds to prediction in test) # pred = np.dot(mult_diag(D, np.dot(Pstim, Vh.T), left=False), UR) ## Better (2.0 seconds to prediction in test) # pvhd = reduce(np.dot, [Pstim, Vh.T, D]) ## Pretty good (2.4 seconds to prediction in test) # pred = np.dot(pvhd, UR) # wt = reduce(np.dot, [Vh.T, D, UR]).astype(dtype) ## Bad (14.2 seconds to prediction in test) # wt = reduce(np.dot, [Vh.T, D, U.T, Rresp]).astype(dtype) ## Worst # pred = np.dot(Pstim, wt) ## Predict test responses if use_corr: #prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) ## Compute predicted test response norms #Rcorr = np.array([np.corrcoef(Presp[:,ii], pred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]) ## Slowly compute correlations #Rcorr = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*Prespnorms) ## Efficiently compute correlations Rcorr = (zPresp*zs(pred)).mean(0) else:
# Pstim is TPxN (~200x200 or 1000x15000) # Vh is output from SVD, I think NxN (~200x200 or 15000x15000) PVh = np.dot(Pstim, Vh.T) ## Precompute this matrix product for speed #Prespnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) ## Precompute test response norms zPresp = zs(Presp) Prespvar = Presp.var(0) Rcorrs = [] ## Holds training correlations for each alpha for na, a in zip(nalphas, alphas): #D = np.diag(S/(S**2+a**2)) ## Reweight singular vectors by the ridge parameter D = S/(S**2+na**2) ## Reweight singular vectors by the (normalized?) ridge parameter # TODO determine if this should be a GPU op # mult_diag is diagonal matrix. # UR is TRxM (~1000x3000 or 5000x30000) pred = np.dot(mult_diag(D, PVh, left=False), UR) ## Best (1.75 seconds to prediction in test) # pred = np.dot(mult_diag(D, np.dot(Pstim, Vh.T), left=False), UR) ## Better (2.0 seconds to prediction in test) # pvhd = reduce(np.dot, [Pstim, Vh.T, D]) ## Pretty good (2.4 seconds to prediction in test) # pred = np.dot(pvhd, UR) # wt = reduce(np.dot, [Vh.T, D, UR]).astype(dtype) ## Bad (14.2 seconds to prediction in test) # wt = reduce(np.dot, [Vh.T, D, U.T, Rresp]).astype(dtype) ## Worst # pred = np.dot(Pstim, wt) ## Predict test responses if use_corr: #prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) ## Compute predicted test response norms #Rcorr = np.array([np.corrcoef(Presp[:,ii], pred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]) ## Slowly compute correlations #Rcorr = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*Prespnorms) ## Efficiently compute correlations Rcorr = (zPresp*zs(pred)).mean(0) else:
def ridge_corr(Rstim, Pstim, Rresp, Presp, alphas, normalpha=False, corrmin=0.2,
               singcutoff=1e-10, use_corr=True, logger=ridge_logger):
    """Uses ridge regression to find a linear transformation of [Rstim] that
    approximates [Rresp], then tests by comparing the transformation of
    [Pstim] to [Presp]. This procedure is repeated for each regularization
    parameter alpha in [alphas]. The score between each prediction and each
    response for each alpha is returned. The regression weights are NOT
    returned, because computing the scores without computing regression
    weights is much, MUCH faster.

    Parameters
    ----------
    Rstim : array_like, shape (TR, N)
        Training stimuli with TR time points and N features. Each feature
        should be Z-scored across time.
    Pstim : array_like, shape (TP, N)
        Test stimuli with TP time points and N features. Each feature
        should be Z-scored across time.
    Rresp : array_like, shape (TR, M)
        Training responses with TR time points and M responses (voxels,
        neurons, what-have-you). Each response should be Z-scored across
        time.
    Presp : array_like, shape (TP, M)
        Test responses with TP time points and M responses.
    alphas : list or array_like, shape (A,)
        Ridge parameters to be tested. Should probably be log-spaced.
        np.logspace(0, 3, 20) works well.
    normalpha : boolean
        Whether ridge parameters should be scaled by the largest singular
        value (LSV) norm of Rstim. Good for comparing models with different
        numbers of parameters.
    corrmin : float in [0..1]
        Purely for display purposes. After each alpha is tested, the number
        of responses with score greater than corrmin minus the number of
        responses with score less than negative corrmin is logged. For
        long-running regressions this vague metric can give a rough sense
        of how well the model is working before it's done.
    singcutoff : float
        Singular values of Rstim that are not greater than singcutoff are
        removed along with their singular vectors, both for speed and for
        accuracy when Rstim is not full rank.
    use_corr : boolean
        If True, correlation is used as the metric of model fit. If False,
        a signed square root of variance explained is used instead, with
        the response variance taken as the average of 1 and the actual
        variance. Highly regularized solutions have very small norms and
        explain little variance while still correlating well, as
        correlation is scale-free while R**2 is not.
    logger : logging.Logger-like
        Object whose ``info`` method receives progress messages.

    Returns
    -------
    Rcorrs : list of A arrays, each of shape (M,)
        The score between each predicted response and each column of Presp,
        one array per alpha.
    """
    ## Calculate SVD of stimulus matrix
    logger.info("Doing SVD...")
    try:
        U, S, Vh = np.linalg.svd(Rstim, full_matrices=False)
    except np.linalg.LinAlgError:
        logger.info("NORMAL SVD FAILED, trying more robust dgesvd..")
        ## The fallback must operate on Rstim; the previous code referenced an
        ## undefined name here and raised NameError instead of retrying.
        logger.info("Rstim shape: %s" % str(np.shape(Rstim)))
        ## NOTE(review): dgesvd is expected to be imported at file level -- confirm.
        U, S, Vh = dgesvd(Rstim, full_matrices=False, algo='svd')

    ## Truncate tiny singular values for speed
    origsize = S.shape[0]
    ngoodS = np.sum(S > singcutoff)
    nbad = origsize - ngoodS
    U = U[:, :ngoodS]
    S = S[:ngoodS]
    Vh = Vh[:ngoodS]
    logger.info("Dropped %d tiny singular values.. (U is now %s)" % (nbad, str(U.shape)))

    ## Normalize alpha by the LSV norm
    norm = S[0]
    logger.info("Training stimulus has LSV norm: %0.03f" % norm)
    if normalpha:
        nalphas = alphas * norm
    else:
        nalphas = alphas

    ## Precompute the alpha-independent matrix products for speed
    UR = np.dot(U.T, Rresp)
    PVh = np.dot(Pstim, Vh.T)
    zPresp = zs(Presp)

    ## The variance used for R**2 is the midpoint between 1 and the actual variance
    Prespvar_actual = Presp.var(0)
    Prespvar = (np.ones_like(Prespvar_actual) + Prespvar_actual) / 2.0
    logger.info("Average difference between actual & assumed Prespvar: %0.3f" % (Prespvar_actual - Prespvar).mean())

    Rcorrs = []  ## Holds the scores for each alpha
    log_template = "Training: alpha=%0.3f, mean corr=%0.5f, max corr=%0.5f, over-under(%0.2f)=%d"
    for na, a in zip(nalphas, alphas):
        ## Reweight singular vectors by the (normalized?) ridge parameter
        D = S / (S ** 2 + na ** 2)

        ## Scaling PVh first and multiplying by UR afterwards was the fastest
        ## ordering the original author timed (1.75 s, versus 14.2 s when the
        ## weight matrix was built explicitly).
        pred = np.dot(mult_diag(D, PVh, left=False), UR)

        if use_corr:
            Rcorr = (zPresp * zs(pred)).mean(0)
        else:
            ## Compute signed square root of variance explained
            resvar = (Presp - pred).var(0)
            Rsq = 1 - (resvar / Prespvar)
            Rcorr = np.sqrt(np.abs(Rsq)) * np.sign(Rsq)

        Rcorr[np.isnan(Rcorr)] = 0
        Rcorrs.append(Rcorr)

        log_msg = log_template % (a,
                                  np.mean(Rcorr),
                                  np.max(Rcorr),
                                  corrmin,
                                  (Rcorr > corrmin).sum() - (-Rcorr > corrmin).sum())
        logger.info(log_msg)

    return Rcorrs
PVh = np.dot(Pstim, Vh.T) ## Precompute this matrix product for speed #Prespnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) ## Precompute test response norms zPresp = zs(Presp) Prespvar = Presp.var(0) Rcorrs = [] ## Holds training correlations for each alpha for na, a in zip(nalphas, alphas): #D = np.diag(S/(S**2+a**2)) ## Reweight singular vectors by the ridge parameter D = S / ( S**2 + na**2 ) ## Reweight singular vectors by the (normalized?) ridge parameter # TODO determine if this should be a GPU op # mult_diag is diagonal matrix. # UR is TRxM (~1000x3000 or 5000x30000) pred = np.dot(mult_diag(D, PVh, left=False), UR) ## Best (1.75 seconds to prediction in test) # pred = np.dot(mult_diag(D, np.dot(Pstim, Vh.T), left=False), UR) ## Better (2.0 seconds to prediction in test) # pvhd = reduce(np.dot, [Pstim, Vh.T, D]) ## Pretty good (2.4 seconds to prediction in test) # pred = np.dot(pvhd, UR) # wt = reduce(np.dot, [Vh.T, D, UR]).astype(dtype) ## Bad (14.2 seconds to prediction in test) # wt = reduce(np.dot, [Vh.T, D, U.T, Rresp]).astype(dtype) ## Worst # pred = np.dot(Pstim, wt) ## Predict test responses if use_corr: #prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) ## Compute predicted test response norms #Rcorr = np.array([np.corrcoef(Presp[:,ii], pred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]) ## Slowly compute correlations #Rcorr = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*Prespnorms) ## Efficiently compute correlations Rcorr = (zPresp * zs(pred)).mean(0)