def lanczos_iteration(Fvp_fn, dim, k=20):
    v = torch.FloatTensor(dim).uniform_()
    v /= torch.norm(v, 2)

    diag = []
    diag_adj = []

    w = Fvp_fn(v)
    alpha = w.dot(v)
    w -= alpha * v
    diag.append(alpha)

    for i in range(k - 1):
        beta = torch.norm(w, 2)
        if beta == 0:
            break
        v_prev = v.clone()
        v = w / beta
        w = Fvp_fn(v)
        alpha = w.dot(v)
        diag.append(alpha)
        diag_adj.append(beta)
        w = w - alpha * v - beta * v_prev

    diag, diag_adj = np.array(diag), np.array(diag_adj)
    # print ("Lanc diag: ", diag)
    # print ("Lanc diag_adj: ", diag_adj)
    w = eigvalsh_tridiagonal(diag, diag_adj)
    return w
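# Usage sketch for lanczos_iteration above (hypothetical, not from the original
# source): approximate the extreme eigenvalues of a small SPD matrix by passing a
# matrix-vector product closure. Assumes torch, numpy and
# scipy.linalg.eigvalsh_tridiagonal are imported as in the function body.
import numpy as np
import torch
from scipy.linalg import eigvalsh_tridiagonal

if __name__ == "__main__":
    torch.manual_seed(0)
    dim = 50
    B = torch.randn(dim, dim)
    A = B @ B.t() + dim * torch.eye(dim)  # symmetric positive definite

    ritz_values = lanczos_iteration(lambda v: A @ v, dim, k=20)
    true_eigs = torch.linalg.eigvalsh(A).numpy()

    # The largest Ritz value should be close to the largest true eigenvalue.
    print("largest Ritz value:", ritz_values.max())
    print("largest eigenvalue:", true_eigs.max())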
def jacobi_sampler_tridiag(M_1, M_2, N, beta=2):
    """
    .. seealso::

        :cite:`KiNe04` Theorem 2
    """
    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    # c_odd = c_1, c_3, ..., c_2N-1
    c_odd = np.random.beta(
        a=0.5 * beta * np.arange(M_1, M_1 - N, step=-1),
        b=0.5 * beta * np.arange(M_2, M_2 - N, step=-1))

    # c_even = c_0, c_2, ..., c_2N-2
    c_even = np.zeros(N)
    c_even[1:] = np.random.beta(
        a=0.5 * beta * np.arange(N - 1, 0, step=-1),
        b=0.5 * beta * np.arange(M_1 + M_2 - N, M_1 + M_2 - 2 * N + 1, step=-1))

    # xi_odd = xi_2i-1 = (1 - c_2i-2) * c_2i-1
    xi_odd = (1 - c_even) * c_odd

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    # xi_2i = (1 - c_2i-1) * c_2i
    xi_even = np.zeros(N)
    xi_even[1:] = (1 - c_odd[:-1]) * c_even[1:]

    # alpha_i = xi_2i-2 + xi_2i-1, with xi_0 = 0
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
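# Usage sketch for jacobi_sampler_tridiag above (hypothetical): the Jacobi
# beta-ensemble has all of its points in (0, 1); the construction requires
# M_1 >= N and M_2 >= N. Assumes `import numpy as np` and
# `from scipy import linalg as la`.
import numpy as np
from scipy import linalg as la

if __name__ == "__main__":
    np.random.seed(0)
    eigs = jacobi_sampler_tridiag(M_1=200, M_2=150, N=100, beta=2)
    print(eigs.shape)                       # (100,)
    print(eigs.min() > 0, eigs.max() < 1)   # expected: True True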
def laguerre_sampler_tridiag(M, N, beta=2):
    """
    .. seealso::

        :cite:`DuEd02` III-B
    """
    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    # Requires M >= N
    # xi_odd = xi_1, ..., xi_2N-1
    xi_odd = np.random.chisquare(beta * np.arange(M, M - N, step=-1))

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    xi_even = np.zeros(N)
    xi_even[1:] = np.random.chisquare(beta * np.arange(N - 1, 0, step=-1))

    # alpha_i = xi_2i-2 + xi_2i-1
    # alpha_1 = xi_0 + xi_1 = xi_1
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def _levels(Ec, EJ, ng=0.0, gridSize=51, select_range=(0, 10)):
    n = np.arange(gridSize) - gridSize // 2
    w = eigvalsh_tridiagonal(
        4 * Ec * (n - ng)**2,
        -EJ / 2 * np.ones(gridSize - 1),
        select='i',
        select_range=select_range)
    return w
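# Usage sketch for _levels above (hypothetical parameter values): compute the
# lowest transmon levels for a typical EJ/Ec ratio and print the 0->1 transition
# energy and the anharmonicity. Assumes `import numpy as np` and
# `from scipy.linalg import eigvalsh_tridiagonal`.
import numpy as np
from scipy.linalg import eigvalsh_tridiagonal

if __name__ == "__main__":
    Ec, EJ = 0.25, 12.5                      # e.g. GHz, EJ/Ec = 50
    levels = _levels(Ec, EJ, ng=0.0)
    f01 = levels[1] - levels[0]              # qubit transition energy
    anharmonicity = (levels[2] - levels[1]) - f01
    print("f01 =", f01, "anharmonicity =", anharmonicity)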
def _generate_GOE_tridiagonal_direct(size: int = 100, seed: int = None,
                                     dowarn: bool = True) -> ndarray:
    """See: Edelman, A., Sutton, B. D., & Wang, Y. (2014). Random matrix theory,
    numerical computation and applications. Modern Aspects of Random Matrix
    Theory, 72, 53.
    """
    if dowarn:
        warn(
            "While this method is fast, and uses the least memory, it appears that "
            "`eigvalsh_tridiagonal` is considerably less precise, and will result "
            "in significant deviations from the expected values for the long range "
            "spectral observables (e.g. spectral rigidity, level number variance)."
        )
    if seed is not None:
        np.random.seed(seed)

    size = size + 2
    chi_range = size - 1 - np.arange(size - 1)
    chi = np.sqrt(np.random.chisquare(chi_range))
    diagonal = np.random.normal(0, np.sqrt(2), size) / np.sqrt(2)
    eigs = eigvalsh_tridiagonal(
        diagonal,
        chi,
        # select="a",
        check_finite=False,
        select="i",
        select_range=(1, size - 2),
        lapack_driver="stebz",
        tol=4 * np.finfo(np.float64).eps,
    )
    return eigs
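# Usage sketch for _generate_GOE_tridiagonal_direct above (hypothetical): draw
# the spectrum of a size-1000 GOE matrix from its tridiagonal beta=1 model and
# check that it is roughly centred. Assumes numpy, scipy and the `warn` helper
# are imported as in the function body.
import numpy as np

if __name__ == "__main__":
    eigs = _generate_GOE_tridiagonal_direct(size=1000, seed=42, dowarn=False)
    print(len(eigs))        # 1000 eigenvalues
    print(np.mean(eigs))    # expected to be close to 0
    # After rescaling, the empirical distribution should approximate
    # Wigner's semicircle law.
    print(eigs.min(), eigs.max())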
def mu_ref_normal_sampler_tridiag(loc=0.0, scale=1.0, beta=2, size=10,
                                  random_state=None):
    """Implementation of the tridiagonal model to sample from

    .. math::

        \\Delta(x_{1}, \\dots, x_{N})^{\\beta}
        \\prod_{n=1}^{N} \\exp\\left(-\\frac{(x_n - \\mu)^2}{2\\sigma^2}\\right) dx_n

    .. seealso::

        :cite:`DuEd02` II-C
    """
    rng = check_random_state(random_state)

    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    # beta/2 * [N-1, N-2, ..., 1]
    b_2_Ni = 0.5 * beta * np.arange(size - 1, 0, step=-1)

    alpha_coef = rng.normal(loc=loc, scale=scale, size=size)
    beta_coef = rng.gamma(shape=b_2_Ni, scale=scale**2)

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def mu_ref_gamma_sampler_tridiag(shape=1.0, scale=1.0, beta=2, size=10):
    """
    .. seealso::

        :cite:`DuEd02` III-B
    """
    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    # beta/2 * [N-1, N-2, ..., 1, 0]
    b_2_Ni = 0.5 * beta * np.arange(size - 1, -1, step=-1)

    # xi_odd = xi_1, ..., xi_2N-1
    xi_odd = np.random.gamma(shape=b_2_Ni + shape, scale=scale)  # odd

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    xi_even = np.zeros(size)
    xi_even[1:] = np.random.gamma(shape=b_2_Ni[:-1], scale=scale)  # even

    # alpha_i = xi_2i-2 + xi_2i-1
    # alpha_1 = xi_0 + xi_1 = xi_1
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def jacobi_sampler_tridiag(M_1, M_2, N, beta=2):
    """
    .. seealso::

        :cite:`KiNe04` Theorem 2
    """
    # c_odd = c_1, c_3, ..., c_2N-1
    c_odd = np.random.beta(
        0.5 * beta * np.arange(M_1, M_1 - N, step=-1),
        0.5 * beta * np.arange(M_2, M_2 - N, step=-1))

    # c_even = c_0, c_2, ..., c_2N-2
    c_even = np.zeros(N)
    c_even[1:] = np.random.beta(
        0.5 * beta * np.arange(N - 1, 0, step=-1),
        0.5 * beta * np.arange(M_1 + M_2 - N, M_1 + M_2 - 2 * N + 1, step=-1))

    # xi_odd = xi_2i-1 = (1 - c_2i-2) * c_2i-1
    xi_odd = (1 - c_even) * c_odd

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    # xi_2i = (1 - c_2i-1) * c_2i
    xi_even = np.zeros(N)
    xi_even[1:] = (1 - c_odd[:-1]) * c_even[1:]

    # alpha_i = xi_2i-2 + xi_2i-1
    # alpha_1 = xi_0 + xi_1 = xi_1
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def mu_ref_beta_sampler_tridiag(a, b, beta=2, size=10):
    """
    .. seealso::

        :cite:`KiNe04` Theorem 2
    """
    # beta/2 * [N-1, N-2, ..., 1, 0]
    b_2_Ni = 0.5 * beta * np.arange(size - 1, -1, step=-1)

    # c_odd = c_1, c_3, ..., c_2N-1
    c_odd = np.random.beta(b_2_Ni + a, b_2_Ni + b)

    # c_even = c_0, c_2, ..., c_2N-2
    c_even = np.zeros(size)
    c_even[1:] = np.random.beta(b_2_Ni[:-1], b_2_Ni[1:] + a + b)

    # xi_odd = xi_2i-1 = (1 - c_2i-2) * c_2i-1
    xi_odd = (1 - c_even) * c_odd

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    # xi_2i = (1 - c_2i-1) * c_2i
    xi_even = np.zeros(size)
    xi_even[1:] = (1 - c_odd[:-1]) * c_even[1:]

    # alpha_i = xi_2i-2 + xi_2i-1
    # alpha_1 = xi_0 + xi_1 = xi_1
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
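# Usage sketch for mu_ref_beta_sampler_tridiag above (hypothetical): sample a
# beta-ensemble with a Beta(a, b) reference measure; all points should fall in
# the open interval (0, 1). Assumes `import numpy as np` and
# `from scipy import linalg as la`.
import numpy as np
from scipy import linalg as la

if __name__ == "__main__":
    np.random.seed(1)
    points = mu_ref_beta_sampler_tridiag(a=2.0, b=5.0, beta=2, size=50)
    print(points.shape)                        # (50,)
    print(points.min() > 0, points.max() < 1)  # expected: True True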
def hermite_sampler_tridiag(N, beta=2):
    """
    .. seealso::

        :cite:`DuEd02` II-C
    """
    alpha_coef = np.sqrt(2) * np.random.randn(N)
    beta_coef = np.random.chisquare(beta * np.arange(N - 1, 0, step=-1))

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def hermite_sampler_tridiag(N, beta=2):
    """
    .. seealso::

        :cite:`DuEd02` II-C
    """
    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    alpha_coef = np.sqrt(2) * np.random.randn(N)
    beta_coef = np.random.chisquare(beta * np.arange(N - 1, 0, step=-1))

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
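# Usage sketch for hermite_sampler_tridiag above (hypothetical): sample the
# Hermite (Gaussian) beta-ensemble and rescale; the empirical distribution
# should approximately follow Wigner's semicircle law with support near
# [-sqrt(2), sqrt(2)]. Assumes `import numpy as np` and
# `from scipy import linalg as la`.
import numpy as np
from scipy import linalg as la

if __name__ == "__main__":
    np.random.seed(0)
    N, beta = 500, 2
    eigs = hermite_sampler_tridiag(N, beta=beta)
    scaled = eigs / np.sqrt(beta * N)
    print(scaled.min(), scaled.max())  # expected to be close to +/- sqrt(2)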
def muref_normal_sampler_tridiag(loc=0.0, scale=1.0, beta=2, size=10):
    """
    .. seealso::

        :cite:`DuEd02` II-C
    """
    # beta/2 * [N-1, N-2, ..., 1]
    b_2_Ni = 0.5 * beta * np.arange(size - 1, 0, step=-1)

    alpha_coef = np.random.normal(loc=loc, scale=scale, size=size)
    beta_coef = np.random.gamma(shape=b_2_Ni, scale=scale**2)

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def mu_ref_normal_sampler_tridiag(loc=0.0, scale=1.0, beta=2, size=10):
    """
    .. seealso::

        :cite:`DuEd02` II-C
    """
    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    # beta/2 * [N-1, N-2, ..., 1]
    b_2_Ni = 0.5 * beta * np.arange(size - 1, 0, step=-1)

    alpha_coef = np.random.normal(loc=loc, scale=scale, size=size)
    beta_coef = np.random.gamma(shape=b_2_Ni, scale=scale**2)

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def mu_ref_beta_sampler_tridiag(a, b, beta=2, size=10, random_state=None):
    """Implementation of the tridiagonal model given by Theorem 2 of
    :cite:`KiNe04` to sample from

    .. math::

        \\Delta(x_{1}, \\dots, x_{N})^{\\beta}
        \\prod_{n=1}^{N} x_n^{a-1} (1 - x_n)^{b-1} dx_n

    .. seealso::

        :cite:`KiNe04` Theorem 2
    """
    rng = check_random_state(random_state)

    if not (beta > 0):
        raise ValueError('`beta` must be positive. Given: {}'.format(beta))

    # beta/2 * [N-1, N-2, ..., 1, 0]
    b_2_Ni = 0.5 * beta * np.arange(size - 1, -1, step=-1)

    # c_odd = c_1, c_3, ..., c_2N-1
    c_odd = rng.beta(b_2_Ni + a, b_2_Ni + b)

    # c_even = c_0, c_2, ..., c_2N-2
    c_even = np.zeros(size)
    c_even[1:] = rng.beta(b_2_Ni[:-1], b_2_Ni[1:] + a + b)

    # xi_odd = xi_2i-1 = (1 - c_2i-2) * c_2i-1
    xi_odd = (1 - c_even) * c_odd

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    # xi_2i = (1 - c_2i-1) * c_2i
    xi_even = np.zeros(size)
    xi_even[1:] = (1 - c_odd[:-1]) * c_even[1:]

    # alpha_i = xi_2i-2 + xi_2i-1
    # alpha_1 = xi_0 + xi_1 = xi_1
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
def laguerre_sampler_tridiag(M, N, beta=2):
    """
    .. seealso::

        :cite:`DuEd02` III-B
    """
    # Requires M >= N
    # xi_odd = xi_1, ..., xi_2N-1
    xi_odd = np.random.chisquare(beta * np.arange(M, M - N, step=-1))  # odd

    # xi_even = xi_0=0, xi_2, ..., xi_2N-2
    xi_even = np.zeros(N)
    xi_even[1:] = np.random.chisquare(beta * np.arange(N - 1, 0, step=-1))  # even

    # alpha_i = xi_2i-2 + xi_2i-1
    # alpha_1 = xi_0 + xi_1 = xi_1
    alpha_coef = xi_even + xi_odd

    # beta_i+1 = xi_2i-1 * xi_2i
    beta_coef = xi_odd[:-1] * xi_even[1:]

    return la.eigvalsh_tridiagonal(alpha_coef, np.sqrt(beta_coef))
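# Usage sketch for laguerre_sampler_tridiag above (hypothetical): sample the
# Laguerre (Wishart-type) beta-ensemble; with M >= N all eigenvalues should be
# strictly positive. Assumes `import numpy as np` and
# `from scipy import linalg as la`.
import numpy as np
from scipy import linalg as la

if __name__ == "__main__":
    np.random.seed(0)
    M, N = 300, 100
    eigs = laguerre_sampler_tridiag(M, N, beta=2)
    print(eigs.shape)      # (100,)
    print(eigs.min() > 0)  # expected: True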
def step(self, closure, execute_update=True):
    """Performs a single optimization step.

    Arguments:
        closure (callable): A closure used to build the curvature-vector
            product function, which accepts a vector of length equal to the
            number of model parameters and returns the Fisher-vector product.
        execute_update (bool): If True, apply the computed step to the
            parameters.
    """
    state = self.state

    # State initialization
    if len(state) == 0:
        state['step'] = 0
        # Set shrinkage to defaults, i.e. no shrinkage
        state['rho'] = 0.0
        state['diag_shrunk'] = 1.0

    state['step'] += 1

    # Get flat grad
    g = gradients_to_vector(self._params)

    if 'ng_prior' not in state:
        state['ng_prior'] = torch.zeros_like(g)

    curv_type = self._param_group['curv_type']
    if curv_type not in self.valid_curv_types:
        raise ValueError("Invalid curv_type.")

    # Create closure to pass to Lanczos and CG
    if curv_type == 'fisher':
        Fvp_theta_fn = make_fvp_fun(closure, self._params)
    elif curv_type == 'gauss_newton':
        Fvp_theta_fn = make_gnvp_fun(closure, self._params)

    shrinkage_method = self._param_group['shrinkage_method']
    lanczos_amortization = self._param_group['lanczos_amortization']
    if shrinkage_method == 'lanczos' and (state['step'] - 1) % lanczos_amortization == 0:
        # print ("Computing Lanczos shrinkage at step ", state['step'])
        w = lanczos_iteration(Fvp_theta_fn, self._numel(),
                              k=self._param_group['lanczos_iters'])
        rho, diag_shrunk = estimate_shrinkage(w, self._numel(),
                                              self._param_group['batch_size'])
        state['rho'] = rho
        state['diag_shrunk'] = diag_shrunk

    M = None
    if self._param_group['cg_precondition_empirical']:
        # Empirical Fisher is g * g
        M = (g * g + self._param_group['cg_precondition_regu_coef']
             * torch.ones_like(g))**self._param_group['cg_precondition_exp']

    # Do CG solve with hvp fn closure
    extract_tridiag = self._param_group['shrinkage_method'] == 'cg'
    cg_result = cg_solve(
        Fvp_theta_fn,
        g.data.clone(),
        x_0=self._param_group['cg_prev_init_coef'] * state['ng_prior'],
        M=M,
        cg_iters=self._param_group['cg_iters'],
        cg_residual_tol=self._param_group['cg_residual_tol'],
        shrunk=self._param_group['shrinkage_method'] is not None,
        rho=state['rho'],
        Dshrunk=state['diag_shrunk'],
        extract_tridiag=extract_tridiag)

    if extract_tridiag:
        # print ("Computing CG shrinkage at step ", state['step'])
        ng, (diag_elems, off_diag_elems) = cg_result
        w = eigvalsh_tridiagonal(diag_elems, off_diag_elems)
        rho, diag_shrunk = estimate_shrinkage(w, self._numel(),
                                              self._param_group['batch_size'])
        state['rho'] = rho
        state['diag_shrunk'] = diag_shrunk
    else:
        ng = cg_result

    state['ng_prior'] = ng.data.clone()

    # Normalize NG
    lr = self._param_group['lr']
    alpha = torch.sqrt(torch.abs(lr / (torch.dot(g, ng) + 1e-20)))

    # Unflatten grad
    vector_to_gradients(ng, self._params)

    if execute_update:
        # Apply step
        for p in self._params:
            if p.grad is None:
                continue
            d_p = p.grad.data
            p.data.add_(-alpha, d_p)

    return dict(alpha=alpha, delta=lr, natural_grad=ng)
def step(self, closure, execute_update=True):
    """Performs a single optimization step.

    Arguments:
        closure (callable): A closure used to build the curvature-vector
            product function, which accepts a vector of length equal to the
            number of model parameters and returns the Fisher-vector product.
        execute_update (bool): If True, apply the computed step to the
            parameters.
    """
    # Update theta old for all blocks first, only approx update is supported
    params_i = 0
    params_j = 0
    for gi, group in enumerate(self.param_groups):
        params = group['params']
        params_j += len(params)
        num_params = self._numel(gi, params)
        # print ("num_params: ", num_params, params_i, params_j)

        state = self.state[gi]
        if len(state) == 0:
            state['step'] = 0
            # Exponential moving average of gradient values
            state['m'] = torch.zeros(num_params)
            # Maintain adaptive preconditioner if needed
            if group['cg_precondition_empirical']:
                state['M'] = torch.zeros(num_params)
            # Set shrinkage to defaults, i.e. no shrinkage
            state['rho'] = 0.0
            state['diag_shrunk'] = 1.0
            state['lagged'] = []
            for i in range(len(params)):
                state['lagged'].append(params[i] + torch.randn(params[i].shape) * 0.0001)

        beta1, beta2 = group['betas']
        theta = parameters_to_vector(params)
        theta_old = parameters_to_vector(state['lagged'])

        # Update theta_old beta2 portion towards theta
        theta_old = beta2 * theta_old + (1 - beta2) * theta
        vector_to_parameters(theta_old, state['lagged'])
        # print (theta_old)
        # input("")

    info = {}

    # If doing block diag, perform the update for each param group
    params_i = 0
    params_j = 0
    for gi, group in enumerate(self.param_groups):
        params = group['params']
        params_j += len(params)
        num_params = self._numel(gi, params)

        # NOTE: state is initialized above
        state = self.state[gi]
        m = state['m']
        beta1, beta2 = group['betas']
        state['step'] += 1

        params_old = state['lagged']

        bias_correction1 = 1 - beta1**state['step']
        bias_correction2 = 1 - beta2**state['step']

        # Get flat grad
        g = gradients_to_vector(params)

        # Update moving average mean
        m.mul_(beta1).add_(1 - beta1, g)
        g_hat = m / bias_correction1

        if 'ng_prior' not in state:
            state['ng_prior'] = torch.zeros_like(g)

        curv_type = group['curv_type']
        if curv_type not in self.valid_curv_types:
            raise ValueError("Invalid curv_type.")

        # Now that theta_old has been updated, do CG with only theta old
        if curv_type == 'fisher':
            fvp_fn_div_beta2 = make_fvp_fun_idx(closure, params_old, params_i, params_j,
                                                bias_correction2=bias_correction2)
        elif curv_type == 'gauss_newton':
            fvp_fn_div_beta2 = make_gnvp_fun(closure, params_old,
                                             bias_correction2=bias_correction2)

        shrinkage_method = group['shrinkage_method']
        lanczos_amortization = group['lanczos_amortization']
        if shrinkage_method == 'lanczos' and (state['step'] - 1) % lanczos_amortization == 0:
            # print ("Computing Lanczos shrinkage at step ", state['step'])
            w = lanczos_iteration(fvp_fn_div_beta2, num_params, k=group['lanczos_iters'])
            rho, diag_shrunk = estimate_shrinkage(w, num_params, group['batch_size'])
            state['rho'] = rho
            state['diag_shrunk'] = diag_shrunk

        M = None
        if group['cg_precondition_empirical']:
            # Empirical Fisher is g * g
            V = state['M']
            Mt = (g * g + group['cg_precondition_regu_coef']
                  * torch.ones_like(g))**group['cg_precondition_exp']
            Vhat = V.mul(beta2).add(1 - beta2, Mt) / bias_correction2
            V = torch.max(V, Vhat)
            M = V

        extract_tridiag = group['shrinkage_method'] == 'cg'
        cg_result = cg_solve(
            fvp_fn_div_beta2,
            g_hat.data.clone(),
            x_0=group['cg_prev_init_coef'] * state['ng_prior'],
            M=M,
            cg_iters=group['cg_iters'],
            cg_residual_tol=group['cg_residual_tol'],
            shrunk=group['shrinkage_method'] is not None,
            rho=state['rho'],
            Dshrunk=state['diag_shrunk'],
            extract_tridiag=extract_tridiag)

        if extract_tridiag:
            # print ("Computing CG shrinkage at step ", state['step'])
            ng, (diag_elems, off_diag_elems) = cg_result
            w = eigvalsh_tridiagonal(diag_elems, off_diag_elems)
            rho, diag_shrunk = estimate_shrinkage(w, num_params, group['batch_size'])
            state['rho'] = rho
            state['diag_shrunk'] = diag_shrunk
        else:
            ng = cg_result
        # print ("NG: ", ng)

        state['ng_prior'] = ng.data.clone()

        # Normalize NG
        lr = group['lr']
        alpha = torch.sqrt(torch.abs(lr / (torch.dot(g_hat, ng) + 1e-20)))

        # Unflatten grad
        vector_to_gradients(ng, params)

        if execute_update:
            # Apply step
            for p in params:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                p.data.add_(-alpha, d_p)

        params_i = params_j
        info[gi] = dict(alpha=alpha, delta=lr, natural_grad=ng)

    return info
if __name__ == "__main__":
    n = 100
    P = np.random.random((100, 100))
    A = P @ P.T
    M = np.diag(A)
    Minv_mat = np.diag(1.0 / M)

    w1 = np.linalg.eigvals(A)
    w1b = np.linalg.eigvals(Minv_mat @ A)

    b = np.ones((n,))
    fvp_fn = make_fvp_fn(A)

    cg_result = cg_solve(fvp_fn, b, cg_iters=n, extract_tridiag=True)
    ng, (diag_elems, off_diag_elems) = cg_result
    w2 = eigvalsh_tridiagonal(diag_elems, off_diag_elems)

    cg_result = cg_solve(fvp_fn, b, cg_iters=n, M=M, extract_tridiag=True)
    ng, (diag_elems, off_diag_elems) = cg_result
    w3 = eigvalsh_tridiagonal(diag_elems, off_diag_elems)
    w4 = Minv_mat @ np.diag(w3)

    print("Originals: ", np.max(w1), np.linalg.norm(w1))
    print("CG no prec: ", np.max(w2), np.linalg.norm(w2),
          np.max(w1) - np.max(w2), np.linalg.norm(w1 - w2))
    print("CG w/ prec: ", np.max(w3), np.linalg.norm(w3),
          np.max(w1) - np.max(w3), np.linalg.norm(w1 - w3))
    print("CG w/ prec vs orig: ", np.max(w4), np.linalg.norm(w4),
          np.max(w1) - np.max(w4), np.linalg.norm(w1 - w4))
    print("CG w/ prec vs true MinvA: ", np.max(w4), np.linalg.norm(w4),
          np.max(w1b) - np.max(w4), np.linalg.norm(w1b - w4))
def step(self, closure, execute_update=True):
    """Performs a single optimization step.

    Arguments:
        closure (callable): A closure used to build the curvature-vector
            product function, which accepts a vector of length equal to the
            number of model parameters and returns the Fisher-vector product.
        execute_update (bool): If True, apply the computed step to the
            parameters.
    """
    state = self.state
    param_vec = parameters_to_vector(self._params)

    # State initialization
    if len(state) == 0:
        state['step'] = 0
        # Exponential moving average of gradient values
        state['m'] = torch.zeros_like(param_vec.data)
        # Maintain adaptive preconditioner if needed
        if self._param_group['cg_precondition_empirical']:
            state['M'] = torch.zeros_like(param_vec.data)
        # Set shrinkage to defaults, i.e. no shrinkage
        state['rho'] = 0.0
        state['diag_shrunk'] = 1.0

    m = state['m']
    beta1, beta2 = self._param_group['betas']
    state['step'] += 1

    bias_correction1 = 1 - beta1**state['step']
    bias_correction2 = 1 - beta2**state['step']

    # Get flat grad
    g = gradients_to_vector(self._params)

    # Update moving average mean
    m.mul_(beta1).add_(1 - beta1, g)
    g_hat = m / bias_correction1

    theta = parameters_to_vector(self._params)
    theta_old = parameters_to_vector(self._params_old)

    if 'ng_prior' not in state:
        state['ng_prior'] = torch.zeros_like(g_hat)
    if 'max_fisher_spectral_norm' not in state:
        state['max_fisher_spectral_norm'] = 0.0

    curv_type = self._param_group['curv_type']
    if curv_type not in self.valid_curv_types:
        raise ValueError("Invalid curv_type.")

    if curv_type == 'fisher':
        weighted_fvp_fn_div_beta2 = self._make_combined_fvp_fun(
            closure, self._params, self._params_old,
            bias_correction2=bias_correction2)
    elif curv_type == 'gauss_newton':
        weighted_fvp_fn_div_beta2 = self._make_combined_gnvp_fun(
            closure, self._params, self._params_old,
            bias_correction2=bias_correction2)

    fisher_norm = lanczos_iteration(weighted_fvp_fn_div_beta2, self._numel(), k=1)[0]
    is_max_norm = fisher_norm > state['max_fisher_spectral_norm'] or state['step'] == 1
    if is_max_norm:
        state['max_fisher_spectral_norm'] = fisher_norm

    if is_max_norm:
        if self._param_group['assume_locally_linear']:
            # Update theta_old beta2 portion towards theta
            theta_old = beta2 * theta_old + (1 - beta2) * theta
        else:
            # Do linesearch first to update theta_old. Then can do CG with
            # only one HVP at each iteration.
            ng = self.state['ng_prior'].clone() if state['step'] > 1 else g_hat.data.clone()
            if curv_type == 'fisher':
                weighted_fvp_fn = self._make_combined_fvp_fun(
                    closure, self._params, self._params_old)
                f = make_fvp_obj_fun(closure, weighted_fvp_fn, ng)
            elif curv_type == 'gauss_newton':
                weighted_fvp_fn = self._make_combined_gnvp_fun(
                    closure, self._params, self._params_old)
                f = make_gnvp_obj_fun(closure, weighted_fvp_fn, ng)
            xmin, fmin, alpha = randomized_linesearch(f, theta_old.data, theta.data)
            theta_old = Variable(xmin.float())

        vector_to_parameters(theta_old, self._params_old)

    # Now that theta_old has been updated, do CG with only theta old.
    # If not max norm, then this will remain the old params.
    if curv_type == 'fisher':
        fvp_fn_div_beta2 = make_fvp_fun(closure, self._params_old,
                                        bias_correction2=bias_correction2)
    elif curv_type == 'gauss_newton':
        fvp_fn_div_beta2 = make_gnvp_fun(closure, self._params_old,
                                         bias_correction2=bias_correction2)

    shrinkage_method = self._param_group['shrinkage_method']
    lanczos_amortization = self._param_group['lanczos_amortization']
    if shrinkage_method == 'lanczos' and (state['step'] - 1) % lanczos_amortization == 0:
        # print ("Computing Lanczos shrinkage at step ", state['step'])
        w = lanczos_iteration(fvp_fn_div_beta2, self._numel(),
                              k=self._param_group['lanczos_iters'])
        rho, diag_shrunk = estimate_shrinkage(w, self._numel(),
                                              self._param_group['batch_size'])
        state['rho'] = rho
        state['diag_shrunk'] = diag_shrunk

    M = None
    if self._param_group['cg_precondition_empirical']:
        # Empirical Fisher is g * g
        V = state['M']
        Mt = (g * g + self._param_group['cg_precondition_regu_coef']
              * torch.ones_like(g))**self._param_group['cg_precondition_exp']
        Vhat = V.mul(beta2).add(1 - beta2, Mt) / bias_correction2
        V = torch.max(V, Vhat)
        M = V

    extract_tridiag = self._param_group['shrinkage_method'] == 'cg'
    cg_result = cg_solve(
        fvp_fn_div_beta2,
        g_hat.data.clone(),
        x_0=self._param_group['cg_prev_init_coef'] * state['ng_prior'],
        M=M,
        cg_iters=self._param_group['cg_iters'],
        cg_residual_tol=self._param_group['cg_residual_tol'],
        shrunk=self._param_group['shrinkage_method'] is not None,
        rho=state['rho'],
        Dshrunk=state['diag_shrunk'],
        extract_tridiag=extract_tridiag)

    if extract_tridiag:
        # print ("Computing CG shrinkage at step ", state['step'])
        ng, (diag_elems, off_diag_elems) = cg_result
        w = eigvalsh_tridiagonal(diag_elems, off_diag_elems)
        rho, diag_shrunk = estimate_shrinkage(w, self._numel(),
                                              self._param_group['batch_size'])
        state['rho'] = rho
        state['diag_shrunk'] = diag_shrunk
    else:
        ng = cg_result

    self.state['ng_prior'] = ng.data.clone()

    # Normalize NG
    lr = self._param_group['lr']
    alpha = torch.sqrt(torch.abs(lr / (torch.dot(g_hat, ng) + 1e-20)))

    # Unflatten grad
    vector_to_gradients(ng, self._params)

    if execute_update:
        # Apply step
        for p in self._params:
            if p.grad is None:
                continue
            d_p = p.grad.data
            p.data.add_(-alpha, d_p)

    return dict(alpha=alpha, delta=lr, natural_grad=ng)
import numpy as np
from scipy.linalg import eigvalsh_tridiagonal

diagonal = np.array([1.5833333333333332593, -0.01259572752922188954,
                     2.3690214303404664165, 0.06024096385542132559,
                     1.9941915593928158934, 1.0058084406071843286])
subdiagonal = np.array([-2.3964673074247340168, 0.93475927884341891705,
                        -2.0788632064407330802, 6.3258425909268308882e-016,
                        -0.075991464158134569562])

eigenvalues = eigvalsh_tridiagonal(diagonal, subdiagonal)
print("Eigenvalues of matrix A:")
print(eigenvalues)
import numpy as np
from scipy.linalg import eigvalsh_tridiagonal

diagonal = np.array([1.58333, -0.0125957, 2.36902, 0.060241, 1.90646, 1.09354])
subdiagonal = np.array([-2.396467, 0.9347593, -2.078863, 1.177896e-15, -0.2911902])

result = eigvalsh_tridiagonal(diagonal, subdiagonal)
print("Eigenvalues: ", result)
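# Cross-check sketch (hypothetical, not from the original source), continuing the
# script above: build the same tridiagonal matrix densely and confirm that
# eigvalsh_tridiagonal and the dense eigvalsh solver agree to numerical precision.
from scipy.linalg import eigvalsh

T = np.diag(diagonal) + np.diag(subdiagonal, 1) + np.diag(subdiagonal, -1)
dense_result = eigvalsh(T)
print("Max difference vs dense solver:", np.max(np.abs(result - dense_result)))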
def sample_mcmc(self, N=10, nb_gibbs_passes=10,
                sample_exact_cond=False, nb_mala_steps=100,
                return_chain_of_eig_vals=False,
                return_chain_of_lambda_max=False,
                random_state=None):
    """Gibbs sampler on Jacobi matrices to sample approximately from the
    corresponding :math:`\\beta`-ensemble.

    :param N:
        Number of points/size of the :math:`\\beta`-ensemble
    :type N:
        int

    :param nb_gibbs_passes:
        Number of passes/sweeps over the variables using the Gibbs sampler
    :type nb_gibbs_passes:
        int

    :param sample_exact_cond:
        Flag to force (``True``) exact sampling from the conditionals when it
        is possible. Otherwise run MALA for ``nb_mala_steps`` to sample from
        the conditionals.
    :type sample_exact_cond:
        bool (default False)

    :param nb_mala_steps:
        Number of steps of Metropolis Adjusted Langevin Algorithm (MALA) to
        perform when the conditionals are sampled approximately
    :type nb_mala_steps:
        int (default 100)

    :param return_chain_of_eig_vals:
        Flag to return the chain of eigenvalues associated to the chain of
        Jacobi matrices.
        If ``True`` the whole chain of eigenvalues is returned.
        If ``False`` only the last sequence of eigenvalues is returned.
    :type return_chain_of_eig_vals:
        bool (default False)

    :param return_chain_of_lambda_max:
        Flag to return the chain of the **largest** eigenvalues associated to
        the chain of Jacobi matrices.
        If ``True`` the whole chain of the **largest** eigenvalues is returned.
        If ``False`` only the **largest** eigenvalue of the last Jacobi matrix
        is returned.
    :type return_chain_of_lambda_max:
        bool (default False)
    """
    rng = check_random_state(random_state)

    if sample_exact_cond:
        if self.V[3]:
            raise ValueError(
                'Sampling exactly the conditionals a_i |... from V = ... + x^3 + ... is not supported, given g_3={}. Conditionals are not log-concave, cannot use Dev12 sampler'
                .format(self.V[3]))

        if self.V.order >= 5:
            raise ValueError(
                'Sampling exactly the conditionals a_i |... from V = ... + x^5 + ... is not supported, deg(V)={}>=5. Conditionals are not log-concave, cannot use Dev12 sampler'
                .format(self.V.order))

        even_coefs_V = self.V.coef[::-1][2::2]
        if not all(even_coefs_V >= 0):
            raise ValueError('\n'.join([
                'even coefs of V are not all >=0',
                ', '.join([
                    'g_{}={}'.format(2 * (n + 1), g_2n)
                    for n, g_2n in enumerate(even_coefs_V)
                ]),
                'Conditionals are not log-concave, cannot use Dev12 sampler',
                'You may retry switching `sample_exact_cond` to False'
            ]))

    self.N = N
    self.nb_gibbs_passes = nb_gibbs_passes

    a, b = np.zeros((2, N + 3))

    if return_chain_of_eig_vals:
        eig_vals = np.zeros((N, nb_gibbs_passes))
    elif return_chain_of_lambda_max:
        lambda_max = np.zeros(nb_gibbs_passes)

    for p in range(nb_gibbs_passes):

        if (p + 1) % 50 == 0:
            print(p + 1)

        for i in range(1, N + 1):

            # a_i | ... propto exp - P_a_i
            P_a_i = 0.5 * self.beta * N * P_a_cond(i, a, b, self.V)

            if sample_exact_cond:
                a[i], _ = sampler_exact_convex_quartic(P=P_a_i, random_state=rng)
            else:
                a[i] = sampler_mala(a[i], V=P_a_i, sigma=0.01,
                                    nb_steps=nb_mala_steps, random_state=rng)

            # b_i | ... propto x^(shape-1) * exp - P_b_i
            if i < N:
                P_b_i = 0.5 * self.beta * N * P_b_cond(i, a, b, self.V)
                b[i], _ = sampler_exact_convex_quartic(
                    P=P_b_i,
                    shape=0.5 * self.beta * (N - i),
                    random_state=rng)

        if return_chain_of_eig_vals:
            eig_vals[:, p] = la.eigvalsh_tridiagonal(a[1:N + 1], np.sqrt(b[1:N]))
        elif return_chain_of_lambda_max:
            lambda_max[p] = la.eigvalsh_tridiagonal(
                a[1:N + 1], np.sqrt(b[1:N]),
                select='i', select_range=(N - 1, N - 1))[0]

    if return_chain_of_eig_vals:
        return eig_vals
    if return_chain_of_lambda_max:
        return lambda_max

    return la.eigvalsh_tridiagonal(a[1:N + 1], np.sqrt(b[1:N]))
def comp_modes(dh, N2, f0=1.0, eivec=False, wmode=False, diag=False):
    '''
    Compute eigenvalues (and eigenvectors) of the Sturm-Liouville equation

        d/dz ( f^2/N^2 d(psi)/dz ) + 1/Rd^2 psi = 0

    for a given stratification.

    The eigenvectors correspond to the matrices for the mode/layer conversion:

    mod2lay[:,0] is the barotropic mode: should be 1..1
    mod2lay[:,i] is the ith baroclinic mode

    - To convert from physical to modal:

      u_mod = np.dot(lay2mod[:,:], u_lev)                # if u_lev is 1D
      u_mod = np.einsum('ij,jkl->ikl', lay2mod, u_lev)   # if u_lev is 3D
      u_mod = np.einsum('ijkl,jkl->ikl', lay2mod, u_lev) # if u_lev is 3D and N2 variable

    - To go back to the physical space:

      u_lev = np.dot(mod2lay[:,:], u_mod)
      u_lev = np.einsum('ij,jkl->ikl', mod2lay, u_mod)   # if u_mod is 3D
      u_lev = np.einsum('ijkl,jkl->ikl', mod2lay, u_mod) # if u_mod is 3D and N2 variable

    The w_modes are related to the p_modes by

        w_modes = -1/N2 * d(p_modes)/dz

    Parameters
    ----------
    dh : array [nz]
    N2 : array [nz (,ny,nx)]
    f0 : scalar or array [(ny,nx)]
    eivec : Bool
    wmode : Bool
    diag : Bool
        Use transformation matrix to solve a symmetric matrix

    Returns
    -------
    if eivec == True
        Rd: array [nz (,ny,nx)]
        lay2mod: array [nz,nz (,ny,nx)]
        mod2lay: array [nz,nz (,ny,nx)]
    if eivec == False
        Rd: array [nz (,ny,nx)]
    '''

    N2, f0 = reshape3d(dh, N2, f0)
    nl, si_y, si_x = N2.shape

    mat_format = "dense"
    if diag:
        mat_format = "sym_diag"

    S = gamma_stretch(dh, N2, f0, wmode=wmode, squeeze=False, mat_format=mat_format)

    nlt = (N2 == 0).argmax(axis=0)
    nlt = np.where(nlt == 0, nl, nlt)

    # put variables in right format
    Ht = np.cumsum(dh)
    # Ht = np.sum(dh)
    dhi = 0.5 * (dh[1:] + dh[:-1])
    dhcol = dh[:, None]
    dhicol = dhi[:, None]

    if wmode:
        Rd = np.zeros((nl, si_y, si_x))
        if eivec:
            mod2lay = np.zeros((nl, nl, si_y, si_x))
            lay2mod = np.zeros((nl, nl, si_y, si_x))
    else:
        nlt = nlt + 1
        Rd = np.zeros((nl + 1, si_y, si_x))
        if eivec:
            mod2lay = np.zeros((nl + 1, nl + 1, si_y, si_x))
            lay2mod = np.zeros((nl + 1, nl + 1, si_y, si_x))

    for j, i in np.ndindex((si_y, si_x)):

        if eivec:
            if diag:
                iRd2, eigs = la.eigh_tridiagonal(S[1, :nlt[j, i], j, i],
                                                 S[0, 1:nlt[j, i], j, i])
                eigr = S[2, :nlt[j, i], j, i, None] * eigs  # D*w
                eigl = eigs / S[2, :nlt[j, i], j, i, None]  # w*D^-1 if eigenvectors are stored in lines, but eigl is eigl.T so we do D^-1*w
            else:
                iRd2, eigl, eigr = la.eig(S[:nlt[j, i], :nlt[j, i], j, i], left=True)
        else:
            if diag:
                iRd2 = la.eigvalsh_tridiagonal(S[1, :nlt[j, i], j, i],
                                               S[0, 1:nlt[j, i], j, i])
            else:
                iRd2 = la.eig(S[:nlt[j, i], :nlt[j, i], j, i], right=False)

        iRd2 = -iRd2.real
        idx = np.argsort(iRd2)

        iRd2 = iRd2[idx]
        with np.errstate(divide='ignore', invalid='ignore'):
            Rd_loc = 1. / np.sqrt(iRd2)

        Rd[:nlt[j, i], j, i] = Rd_loc

        if eivec:
            eigl = eigl[:, idx]
            eigr = eigr[:, idx]

            # Normalize eigenvectors
            N2col = N2[:nlt[j, i], j, i][:, None]
            cm = Rd_loc[:nlt[j, i], None] * f0[j, i]

            if wmode:
                scap = np.sum(dhi[:nlt[j, i], None] * eigr * eigr * N2col * cm.T**2, 0)
                Htt = Ht[nlt[j, i]]
            else:
                scap = np.sum(dh[:nlt[j, i], None] * eigr * eigr, 0)
                Htt = Ht[nlt[j, i] - 1]

            flip = np.sign(eigr[0, :])
            eigr = eigr * np.sqrt(Htt / scap) * flip

            # # scalar product
            # if wmode:
            #     check = np.sum(N2col.T*eigr[:,1]*eigr[:,1]*dhicol.T*(Rd_loc[1]*f0[j,i])**2)
            # else:
            #     check = np.sum(dhcol.T*eigr[:,2]*eigr[:,2])/Ht

            if diag:
                eigl = eigl / np.sqrt(Htt / scap) * flip
            else:
                scap2 = np.sum(eigl * eigr, 0)
                eigl = eigl / scap2

            lay2mod[:nlt[j, i], :nlt[j, i], j, i] = eigl.T
            mod2lay[:nlt[j, i], :nlt[j, i], j, i] = eigr

    if eivec:
        return Rd.squeeze(), lay2mod.squeeze(), mod2lay.squeeze()
    else:
        return Rd.squeeze()
def step(self, closure, execute_update=True):
    """Performs a single optimization step.

    Arguments:
        closure (callable): A closure used to build the curvature-vector
            product function, which accepts a vector of length equal to the
            number of model parameters and returns the Fisher-vector product.
        execute_update (bool): If True, apply the computed step to the
            parameters.
    """
    info = {}

    # If doing block diag, perform the update for each param group
    params_i = 0
    params_j = 0
    for gi, group in enumerate(self.param_groups):
        params = group['params']
        params_j += len(params)

        state = self.state[gi]
        if len(state) == 0:
            state['step'] = 0
            # Set shrinkage to defaults, i.e. no shrinkage
            state['rho'] = 0.0
            state['diag_shrunk'] = 1.0

        state['step'] += 1

        g = gradients_to_vector(params)

        if 'ng_prior' not in state:
            state['ng_prior'] = torch.zeros_like(g)

        curv_type = group['curv_type']
        if curv_type not in self.valid_curv_types:
            raise ValueError("Invalid curv_type.")

        # Create closure to pass to Lanczos and CG
        if curv_type == 'fisher':
            Fvp_theta_fn = make_fvp_fun_idx(closure, params, params_i, params_j)
        elif curv_type == 'gauss_newton':
            # Pass indices instead of actual params, since these params should be
            # the same as the model params anyway. Then the closure should set only
            # the subset of params and only return the tmp_params from that subset.
            # This would require that the param groups are ordered in a specific manner?
            Fvp_theta_fn = make_gnvp_fun_idx(closure, params, params_i, params_j)

        num_params = self._numel(gi, params)

        shrinkage_method = group['shrinkage_method']
        lanczos_amortization = group['lanczos_amortization']
        if shrinkage_method == 'lanczos' and (state['step'] - 1) % lanczos_amortization == 0:
            # print ("Computing Lanczos shrinkage at step ", state['step'])
            w = lanczos_iteration(Fvp_theta_fn, num_params, k=group['lanczos_iters'])
            rho, diag_shrunk = estimate_shrinkage(w, num_params, group['batch_size'])
            state['rho'] = rho
            state['diag_shrunk'] = diag_shrunk

        M = None
        if group['cg_precondition_empirical']:
            # Empirical Fisher is g * g
            M = (g * g + group['cg_precondition_regu_coef']
                 * torch.ones_like(g))**group['cg_precondition_exp']

        # Do CG solve with hvp fn closure
        extract_tridiag = group['shrinkage_method'] == 'cg'
        cg_result = cg_solve(
            Fvp_theta_fn,
            g.data.clone(),
            x_0=group['cg_prev_init_coef'] * state['ng_prior'],
            M=M,
            cg_iters=group['cg_iters'],
            cg_residual_tol=group['cg_residual_tol'],
            shrunk=group['shrinkage_method'] is not None,
            rho=state['rho'],
            Dshrunk=state['diag_shrunk'],
            extract_tridiag=extract_tridiag)

        if extract_tridiag:
            # print ("Computing CG shrinkage at step ", state['step'])
            ng, (diag_elems, off_diag_elems) = cg_result
            w = eigvalsh_tridiagonal(diag_elems, off_diag_elems)
            rho, diag_shrunk = estimate_shrinkage(w, num_params, group['batch_size'])
            state['rho'] = rho
            state['diag_shrunk'] = diag_shrunk
        else:
            ng = cg_result

        state['ng_prior'] = ng.data.clone()

        # Normalize NG
        lr = group['lr']
        alpha = torch.sqrt(torch.abs(lr / (torch.dot(g, ng) + 1e-20)))

        # Unflatten grad
        vector_to_gradients(ng, params)

        if execute_update:
            # Apply step
            for p in params:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                p.data.add_(-alpha, d_p)

        params_i = params_j
        info[gi] = dict(alpha=alpha, delta=lr, natural_grad=ng)

    return info