def test_hvp():
    fun = lambda a: np.sum(np.sin(a))
    a = npr.randn(5)
    v = npr.randn(5)
    H = hessian(fun)(a)
    hvp = make_hvp(fun)(a)[0]
    check_equivalent(np.dot(H, v), hvp(v))
def test_hessian():
    # Check Hessian of a quadratic function.
    D = 5
    H = npr.randn(D, D)
    def fun(x):
        return np.dot(np.dot(x, H), x)
    hess = hessian(fun)
    x = npr.randn(D)
    check_equivalent(hess(x), H + H.T)
print "Gap percentiles [1, 50, 99] %s" % str( np.percentile(gaps, [1, 50, 99])) ######################################### # test per mu_n function and gradient # ######################################### n = 0 lbn, lbs = make_lower_bound_MoGn(theta, n, s2min=1e-7) thn = theta[n, :D] assert np.isclose(lower_bound_MoG(theta), lbn(thn)), "per n is bad" from autograd.util import quick_grad_check, nd quick_grad_check(lbn, thn) print "Hessiandiag, numeric hessian diag" hlbn = hessian(lbn) print np.diag(hlbn(thn)) hdiag = numeric_hessian_diag(lbn, thn) print hdiag ##################################### # Test NVPI on a small, 2d example # ##################################### from vbproj.vboost import mog means = np.array([[1., 1.], [-1., -1.], [-1, 1]]) covs = np.array([2 * np.eye(2), 1 * np.eye(2), 1 * np.eye(2)]) icovs = np.array([np.linalg.inv(c) for c in covs]) lndets = np.array([np.linalg.slogdet(c)[1] for c in covs]) pis = np.ones(means.shape[0]) / float(means.shape[0]) lnpdf = lambda z: mog.mog_logprob(z, means, icovs, lndets, pis)
def draw_it(func, **kwargs):
    view = [10, 150]
    if 'view' in kwargs:
        view = kwargs['view']

    # generate input space for plotting
    w_in = np.linspace(-5, 5, 100)
    w1_vals, w2_vals = np.meshgrid(w_in, w_in)
    w1_vals.shape = (len(w_in)**2, 1)
    w2_vals.shape = (len(w_in)**2, 1)
    w_vals = np.concatenate((w1_vals, w2_vals), axis=1).T
    w1_vals.shape = (len(w_in), len(w_in))
    w2_vals.shape = (len(w_in), len(w_in))

    # compute grad vals
    grad = compute_grad(func)
    grad_vals = [grad(s) for s in w_vals.T]
    grad_vals = np.asarray(grad_vals)

    # compute hessian
    hess = hessian(func)
    hess_vals = [hess(s) for s in w_vals.T]

    # define figure
    fig = plt.figure(figsize=(9, 6))

    ### plot original function ###
    ax1 = plt.subplot2grid((3, 6), (0, 3), colspan=1, projection='3d')

    # evaluate function, reshape
    g_vals = func(w_vals)
    g_vals.shape = (len(w_in), len(w_in))

    # plot function surface
    ax1.plot_surface(w1_vals, w2_vals, g_vals, alpha=0.1, color='w', zorder=1,
                     rstride=15, cstride=15, linewidth=0.5, edgecolor='k')
    ax1.set_title(r'$g(w_1,w_2)$', fontsize=10)

    # cleanup axis
    cleanup(g_vals, view, ax1)

    ### plot first derivative functions ###
    ax2 = plt.subplot2grid((3, 6), (1, 2), colspan=1, projection='3d')
    ax3 = plt.subplot2grid((3, 6), (1, 4), colspan=1, projection='3d')

    # plot first partial derivative
    grad_vals1 = grad_vals[:, 0]
    grad_vals1.shape = (len(w_in), len(w_in))
    ax2.plot_surface(w1_vals, w2_vals, grad_vals1, alpha=0.1, color='w', zorder=1,
                     rstride=15, cstride=15, linewidth=0.5, edgecolor='k')
    ax2.set_title(r'$\frac{\partial}{\partial w_1}g(w_1,w_2)$', fontsize=10)

    # cleanup axis
    cleanup(grad_vals1, view, ax2)

    # plot second partial derivative
    grad_vals1 = grad_vals[:, 1]
    grad_vals1.shape = (len(w_in), len(w_in))
    ax3.plot_surface(w1_vals, w2_vals, grad_vals1, alpha=0.1, color='w', zorder=1,
                     rstride=15, cstride=15, linewidth=0.5, edgecolor='k')
    ax3.set_title(r'$\frac{\partial}{\partial w_2}g(w_1,w_2)$', fontsize=10)

    # cleanup axis
    cleanup(grad_vals1, view, ax3)

    ### plot second derivatives ###
    ax4 = plt.subplot2grid((3, 6), (2, 1), colspan=1, projection='3d')
    ax5 = plt.subplot2grid((3, 6), (2, 3), colspan=1, projection='3d')
    ax6 = plt.subplot2grid((3, 6), (2, 5), colspan=1, projection='3d')

    # plot first hessian entry
    hess_vals1 = np.asarray([s[0, 0] for s in hess_vals])
    hess_vals1.shape = (len(w_in), len(w_in))
    ax4.plot_surface(w1_vals, w2_vals, hess_vals1, alpha=0.1, color='w', zorder=1,
                     rstride=15, cstride=15, linewidth=0.5, edgecolor='k')
    ax4.set_title(
        r'$\frac{\partial}{\partial w_1}\frac{\partial}{\partial w_1}g(w_1,w_2)$',
        fontsize=10)

    # cleanup axis
    cleanup(hess_vals1, view, ax4)

    # plot mixed hessian entry
    hess_vals1 = np.asarray([s[1, 0] for s in hess_vals])
    hess_vals1.shape = (len(w_in), len(w_in))
    ax5.plot_surface(w1_vals, w2_vals, hess_vals1, alpha=0.1, color='w', zorder=1,
                     rstride=15, cstride=15, linewidth=0.5, edgecolor='k')
    ax5.set_title(
        r'$\frac{\partial}{\partial w_1}\frac{\partial}{\partial w_2}g(w_1,w_2)=\frac{\partial}{\partial w_2}\frac{\partial}{\partial w_1}g(w_1,w_2)$',
        fontsize=10)

    # cleanup axis
    cleanup(hess_vals1, view, ax5)

    # plot last hessian entry
    hess_vals1 = np.asarray([s[1, 1] for s in hess_vals])
    hess_vals1.shape = (len(w_in), len(w_in))
    ax6.plot_surface(w1_vals, w2_vals, hess_vals1, alpha=0.1, color='w', zorder=1,
                     rstride=15, cstride=15, linewidth=0.5, edgecolor='k')
    ax6.set_title(
        r'$\frac{\partial}{\partial w_2}\frac{\partial}{\partial w_2}g(w_1,w_2)$',
        fontsize=10)

    # cleanup axis
    cleanup(hess_vals1, view, ax6)

    plt.show()
def test_hessian_tensor_product():
    fun = lambda a: np.sum(np.sin(a))
    a = npr.randn(5, 4, 3)
    V = npr.randn(5, 4, 3)
    H = hessian(fun)(a)
    check_equivalent(np.tensordot(H, V, axes=np.ndim(V)),
                     hessian_vector_product(fun)(a, V))
def test_hessian_vector_product():
    fun = lambda a: np.sum(np.sin(a))
    a = npr.randn(5)
    v = npr.randn(5)
    H = hessian(fun)(a)
    check_equivalent(np.dot(H, v), hessian_vector_product(fun)(a, v))
def test_hessian_matrix_product():
    fun = lambda a: np.sum(np.sin(a))
    a = npr.randn(5, 4)
    V = npr.randn(5, 4)
    H = hessian(fun)(a)
    check_equivalent(np.tensordot(H, V), hessian_tensor_product(fun)(a, V))
def optimize(W0, compute_hessian=False):
    def compute_fprime_(Eta, Xi, s02):
        return fprime_m(Eta, compute_var(Xi, s02)) * Xi

    def compute_f_(Eta, Xi, s02):
        return pop_rate_fn(Eta, compute_var(Xi, s02))

    def compute_us(W, fval, fprimeval):
        W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
        u0 = u_fn(XX, fval, W0x, W0y, k0, kappa, T0)
        u1 = u_fn(XX, fval, W1x, W1y, k0, kappa, T0) \
            + u_fn(XX, fval, W0x, W0y, k1, kappa, T0) \
            + u_fn(XX, fval, W0x, W0y, k0, kappa, T1)
        u2 = u_fn(XXp, fprimeval, W2x, W2y, k0, kappa, T0) \
            + u_fn(XXp, fprimeval, W0x, W0y, k2, kappa, T0) \
            + u_fn(XXp, fprimeval, W0x, W0y, k0, kappa, T2)
        u3 = u_fn(XXp, fprimeval, W3x, W3y, k0, kappa, T0) \
            + u_fn(XXp, fprimeval, W0x, W0y, k3, kappa, T0) \
            + u_fn(XXp, fprimeval, W0x, W0y, k0, kappa, T3)
        return u0, u1, u2, u3

    def compute_f_fprime_t_(W, perturbation, max_dist=1):  # max_dist added 10/14/20
        W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
        fval = compute_f_(Eta, Xi, s02)
        fprimeval = compute_fprime_(Eta, Xi, s02)
        u0, u1, u2, u3 = compute_us(W, fval, fprimeval)
        resEta = Eta - u0 - u2
        resXi = Xi - u1 - u3
        YY = fval + perturbation
        YYp = fprimeval + 0

        def dYYdt(YY, Eta1, Xi1):
            return -YY + compute_f_(Eta1, Xi1, s02)

        def dYYpdt(YYp, Eta1, Xi1):
            return -YYp + compute_fprime_(Eta1, Xi1, s02)

        for t in range(niter):
            if np.mean(np.abs(YY - fval)) < max_dist:
                u0, u1, u2, u3 = compute_us(W, YY, YYp)
                Eta1 = resEta + u0 + u2
                Xi1 = resXi + u1 + u3
                YY = YY + dt * dYYdt(YY, Eta1, Xi1)
                YYp = YYp + dt * dYYpdt(YYp, Eta1, Xi1)
            elif np.remainder(t, 500) == 0:
                print('unstable fixed point?')

        #YYprime = compute_fprime_(Eta1,Xi1,s02)
        return YY, YYp

    def compute_f_fprime_t_avg_(W, perturbation, burn_in=0.5, max_dist=1):
        W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
        fval = compute_f_(Eta, Xi, s02)
        fprimeval = compute_fprime_(Eta, Xi, s02)
        u0, u1, u2, u3 = compute_us(W, fval, fprimeval)
        resEta = Eta - u0 - u2
        resXi = Xi - u1 - u3
        YY = fval + perturbation
        YYp = fprimeval + 0
        YYmean = np.zeros_like(Eta)
        YYprimemean = np.zeros_like(Eta)

        def dYYdt(YY, Eta1, Xi1):
            return -YY + compute_f_(Eta1, Xi1, s02)

        def dYYpdt(YYp, Eta1, Xi1):
            return -YYp + compute_fprime_(Eta1, Xi1, s02)

        for t in range(niter):
            if np.mean(np.abs(YY - fval)) < max_dist:
                u0, u1, u2, u3 = compute_us(W, YY, YYp)
                Eta1 = resEta + u0 + u2
                Xi1 = resXi + u1 + u3
                YY = YY + dt * dYYdt(YY, Eta1, Xi1)
                YYp = YYp + dt * dYYpdt(YYp, Eta1, Xi1)
            elif np.remainder(t, 500) == 0:
                print('unstable fixed point?')
            if t > niter * burn_in:
                YYmean = YYmean + 1 / niter / burn_in * YY
                YYprimemean = YYprimemean + 1 / niter / burn_in * YYp

        return YYmean, YYprimemean

    def u_fn(XX, YY, Wx, Wy, K, kappa, T):
        WWx, WWy = [gen_Weight(W, K, kappa, T) for W in [Wx, Wy]]
        return XX @ WWx + YY @ WWy

    def minusLW(W):
        def compute_sq_error(a, b, wt):
            return np.sum(wt * (a - b)**2)

        def compute_kl_error(mu_data, pc_list, mu_model, fprimeval, wt):
            # how to model variability in X?
            kl = compute_kl_divergence(fprimeval, noise, mu_data, mu_model, pc_list)
            return kl  #wt*kl # principled way would be to use 1/wt for noise term. Should add later.

        def compute_opto_error(W, wt=None):
            if wt is None:
                wt = np.ones((nN, nQ * nS * nT))
            W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
            WWy = gen_Weight(W0y, k, kappa, T)
            Phi = fprime_m(Eta, compute_var(Xi, s02))
            dHH = np.zeros((nN, nQ * nS * nT))
            dHH[:, np.arange(2, nQ * nS * nT, nQ)] = 1
            dHH = dHH * h
            print('dYY: ' + str(dYY.shape))
            print('Phi: ' + str(Phi.shape))
            print('dHH: ' + str(dHH.shape))
            cost = np.sum(wt * (dYY - (dYY @ WWy) * Phi - dHH * Phi)**2)
            return cost

        def compute_opto_error_with_inv(W, wt):
            if wt is None:
                wt = np.ones((nN, nQ * nS * nT))
            W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
            WWy = gen_Weight(W0y, k0, kappa, T0)
            Phi = fprime_m(Eta, compute_var(Xi, s02))
            Phi1 = np.array([np.diag(phi) for phi in Phi])
            invmat = np.array([
                np.linalg.inv(np.eye(nQ * nS * nT) - WWy @ phi1) for phi1 in Phi1
            ])
            dHH = np.zeros((nN, nQ * nS * nT))
            dHH[:, np.arange(2, nQ * nS * nT, nQ)] = 1
            dHH = dHH * h
            print('dYY: ' + str(dYY.shape))
            print('Phi: ' + str(Phi.shape))
            print('dHH: ' + str(dHH.shape))
            invprod = np.einsum('ij,ijk->ik', dHH, Phi1)
            invprod = np.einsum('ij,ijk->ik', invprod, invmat)
            cost = np.sum(wt * (dYY - invprod)**2)
            return cost

        def compute_isn_error(W):
            W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
            Phi = fprime_m(Eta, compute_var(Xi, s02))
            log_arg = Phi[:, 0] * W0y[0, 0] - 1
            cost = utils.minus_sum_log_ceil(log_arg, big_val / nN)
            return cost

        def compute_tv_error(W):
            # sq l2 norm for tv error
            W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
            topo_var_list = [arr.reshape(topo_shape + (-1,)) for arr in [XX, XXp, Eta, Xi]]
            sqdiffy = [np.sum(np.abs(np.diff(top, axis=0))**2) for top in topo_var_list]
            sqdiffx = [np.sum(np.abs(np.diff(top, axis=1))**2) for top in topo_var_list]
            cost = np.sum(sqdiffy + sqdiffx)
            return cost

        W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
        perturbation = perturbation_size * np.random.randn(*Eta.shape)
        fval, fprimeval = compute_f_fprime_t_avg_(W, perturbation)  # Eta the mean input per cell, Xi the stdev. input per cell, s02 the baseline variability in input
        Xterm = compute_kl_error(XXhat, Xpc_list, XX, XXp, wtStim * wtInp)  # XX the modeled input layer (L4)
        Yterm = compute_kl_error(YYhat, Ypc_list, fval, fprimeval, wtStim * wtCell)  # fval the modeled output layer (L2/3)
        u0, u1, u2, u3 = compute_us(W, fval, fprimeval)
        #u0 = u_fn(XX,fval,W0x,W0y,k,kappa,T)
        #u1 = u_fn(XX,fval,W1x,W1y,k,kappa,T)
        #u2 = u_fn(XXp,fprimeval,W2x,W2y,k,kappa,T)
        #u3 = u_fn(XXp,fprimeval,W3x,W3y,k,kappa,T)
        Etaterm = compute_sq_error(Eta, u0 + u2, wtStim * wtCell)  # magnitude of fudge factor in mean input
        Xiterm = compute_sq_error(Xi, u1 + u3, wtStim * wtCell)  # magnitude of fudge factor in input variability
        # returns value float
        Optoterm = compute_opto_error_with_inv(W, wtStimOpto * wtCellOpto)  #testing out 8/20/20
        cost = wtX * Xterm + wtY * Yterm + wtEta * Etaterm + wtXi * Xiterm + wtOpto * Optoterm
        if constrain_isn:
            ISNterm = compute_isn_error(W)
            cost = cost + wtISN * ISNterm
        if tv:
            TVterm = compute_tv_error(W)
            cost = cost + wtTV * TVterm

        if isinstance(Xterm, float):
            print('X:%f' % (wtX * Xterm))
            print('Y:%f' % (wtY * Yterm))
            print('Eta:%f' % (wtEta * Etaterm))
            print('Xi:%f' % (wtXi * Xiterm))
            print('Opto:%f' % (wtOpto * Optoterm))
            if constrain_isn:
                print('ISN:%f' % (wtISN * ISNterm))
            if tv:
                print('TV:%f' % (wtTV * TVterm))

        lbls = ['cost']
        vars = [cost]
        for lbl, var in zip(lbls, vars):
            print_labeled(lbl, var)

        return cost

    def minusdLdW(W):
        # returns value (R,)
        # sum in first dimension: (N,1) times (N,1) times (N,P)
        # return jacobian(minusLW)(W)
        return grad(minusLW)(W)

    def fix_violations(w, bounds):
        lb = np.array([b[0] for b in bounds])
        ub = np.array([b[1] for b in bounds])
        #print('w shape: '+str(w.shape))
        #print('bd shape: '+str(lb.shape))
        lb_violation = w < lb
        ub_violation = w > ub
        w[lb_violation] = lb[lb_violation]
        w[ub_violation] = ub[ub_violation]
        return w, lb_violation, ub_violation

    def sorted_r_eigs(w):
        drW, prW = np.linalg.eig(w)
        srtinds = np.argsort(drW)
        return drW[srtinds], prW[:, srtinds]

    def compute_eig_penalty_(Wmy, K0, kappa, T0):
        # still need to finish! Hopefully won't need
        # need to fix this to reflect addition of kappa argument
        Wsquig = gen_Weight(Wmy, K0, kappa, T0)
        drW, prW = sorted_r_eigs(Wsquig - np.eye(nQ * nS * nT))
        plW = np.linalg.inv(prW)
        eig_outer_all = [np.real(np.outer(plW[:, k], prW[k, :])) for k in range(nS * nQ * nT)]
        eig_penalty_size_all = [barrier_wt / np.abs(np.real(drW[k])) for k in range(nS * nQ * nT)]
        eig_penalty_dir_w = [
            eig_penalty_size * ((eig_outer[:nQ, :nQ] + eig_outer[nQ:, nQ:])
                                + K0[np.newaxis, :] * (eig_outer[:nQ, nQ:] + kappa * eig_outer[nQ:, :nQ]))
            for eig_outer, eig_penalty_size in zip(eig_outer_all, eig_penalty_size_all)
        ]
        eig_penalty_dir_k = [
            eig_penalty_size * ((eig_outer[:nQ, nQ:] + eig_outer[nQ:, :nQ] * kappa) * W0my).sum(0)
            for eig_outer, eig_penalty_size in zip(eig_outer_all, eig_penalty_size_all)
        ]
        eig_penalty_dir_kappa = [
            eig_penalty_size * (eig_outer[nQ:, :nQ] * k0[np.newaxis, :] * W0my).sum().reshape((1,))
            for eig_outer, eig_penalty_size in zip(eig_outer_all, eig_penalty_size_all)
        ]
        eig_penalty_dir_w = np.array(eig_penalty_dir_w).sum(0)
        eig_penalty_dir_k = np.array(eig_penalty_dir_k).sum(0)
        eig_penalty_dir_kappa = np.array(eig_penalty_dir_kappa).sum(0)
        return eig_penalty_dir_w, eig_penalty_dir_k, eig_penalty_dir_kappa

    def compute_eig_penalty(W):
        # still need to finish! Hopefully won't need
        W0x, W0y, W1x, W1y, W2x, W2y, W3x, W3y, s02, k0, k1, k2, k3, kappa, T0, T1, T2, T3, XX, XXp, Eta, Xi, h = parse_W(W)
        eig_penalty_dir_w, eig_penalty_dir_k, eig_penalty_dir_kappa = compute_eig_penalty_(W0my, k0, kappa0)
        eig_penalty_W = unparse_W(np.zeros_like(W0mx), eig_penalty_dir_w, np.zeros_like(W0sx),
                                  np.zeros_like(W0sy), np.zeros_like(s020), eig_penalty_dir_k,
                                  eig_penalty_dir_kappa, np.zeros_like(XX0), np.zeros_like(XXp0),
                                  np.zeros_like(Eta0), np.zeros_like(Xi0))
        # assert(True==False)
        return eig_penalty_W

    allhot = np.zeros(W0.shape)
    #allhot[:nP*nQ+nQ**2] = 1
    allhot[:4 * (nP * nQ + nQ**2)] = 1  # penalizing all Wn equally
    W_l2_reg = lambda W: np.sum((W * allhot)**2)
    f = lambda W: minusLW(W) + l2_penalty * W_l2_reg(W)
    fprime = lambda W: minusdLdW(W) + 2 * l2_penalty * W * allhot

    fix_violations(W0, bounds)

    W1, loss, result = sop.fmin_l_bfgs_b(f, W0, fprime=fprime, bounds=bounds,
                                         factr=1e4, maxiter=int(1e3))

    if compute_hessian:
        gr = grad(minusLW)(W1)
        hess = hessian(minusLW)(W1)
    else:
        gr = None
        hess = None

    # W0mx,W0my,W0sx,W0sy,s020,k0,kappa0,XX0,XXp0,Eta0,Xi0 = parse_W(W1)
    #W0x,W0y,W1x,W1y,W2x,W2y,W3x,W3y,s02,k,kappa,T,XX,XXp,Eta,Xi,h = parse_W(W)
    return W1, loss, gr, hess, result
        # Reset trace
        if done == 0:
            z = np.zeros((task.nactions, task.nstates))

        # Update trace
        z = np.outer(u, x) + w[2] * w[3] * z

        # Compute RPE
        rpe = r + w[2] * np.einsum('i,ij,j->', u_, Q, x_) \
            - np.einsum('i,ij,j->', u, Q, x)

        # Update value function
        Q += w[0] * rpe * z
    return L

# Compare with autograd and the SARSASoftmax object.
print(' AUTOGRAD \n\n ')
agH = hessian(f)(w)
print(agH)
print('\n\n FITR (RAW) \n\n ')
print(hess_)
print('\n\n FITR (OBJECT) \n\n')
print(agent_inv.hess_)

agh = hessian(fQ)(w)
print(' AUTOGRAD \n\n ')
print(' Learning rate \n')
print(agh[:, :, 0, 0])
print('\n\n Discount \n')
print(agh[:, :, 2, 2])
    w2 = w + eps * v
    negL1, _, _ = nll_GLM_GanmorCalciumAR1(w1, Xmat, Yobs, hyperparams, nlfun)
    negL2, _, _ = nll_GLM_GanmorCalciumAR1(w2, Xmat, Yobs, hyperparams, nlfun)
    gradient_finite_diff[i] = (negL2 - negL1) / (2.0 * eps)

    # if want finite difference computation of Hessian, uncomment this code
    # note this is redundant because it computes both upper and lower triangular elements
    # for j, v2 in enumerate(np.eye(D)):
    #     wp = w + eps * v + eps * v2
    #     wm1 = w + eps * v
    #     wm2 = w + eps * v2
    #     negLp, _, _ = nll_GLM_GanmorCalciumAR1(wp, Xmat, Yobs, hyperparams, nlfun)
    #     negLm1, _, _ = nll_GLM_GanmorCalciumAR1(wm1, Xmat, Yobs, hyperparams, nlfun)
    #     negLm2, _, _ = nll_GLM_GanmorCalciumAR1(wm2, Xmat, Yobs, hyperparams, nlfun)
    #     hess_finite_diff[i,j] = (negLp - negLm1 - negLm2 + negL) / (eps**2)
print("Done.")

# autograd
grad_w = grad(lambda w: nll_GLM_GanmorCalciumAR1(w, Xmat, Yobs, hyperparams, nlfun)[0])
gradient_autograd = grad_w(w)
hess_w = hessian(lambda w: nll_GLM_GanmorCalciumAR1(w, Xmat, Yobs, hyperparams, nlfun)[0])
H_autograd = hess_w(w)

# diffs
print("Gradient vs. Autograd    : ", np.linalg.norm(gradient_autograd - gradient))
print("Gradient vs. finite diffs: ", np.linalg.norm(gradient_finite_diff - gradient))
print("Hessian vs. Autograd     : ", np.linalg.norm(H - H_autograd))
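# For a quick sanity check of the finite-difference idea in the commented-out block above,
# here is a minimal, self-contained version using a toy objective
# (nll_GLM_GanmorCalciumAR1 is not defined in this fragment, so the loss below is a
# stand-in chosen purely for illustration):
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import hessian

def toy_loss(w):
    # any smooth scalar function of w works as a stand-in here
    return 0.5 * np.dot(w, w) + np.sum(np.sin(w))

def finite_diff_hessian(f, w, eps=1e-4):
    # forward-difference estimate of each second partial derivative, mirroring the
    # (f(w+ei+ej) - f(w+ei) - f(w+ej) + f(w)) / eps**2 pattern sketched above
    D = w.size
    I = np.eye(D)
    H = np.zeros((D, D))
    for i in range(D):
        for j in range(D):
            H[i, j] = (f(w + eps * I[i] + eps * I[j]) - f(w + eps * I[i])
                       - f(w + eps * I[j]) + f(w)) / eps**2
    return H

w = npr.randn(4)
print(np.linalg.norm(finite_diff_hessian(toy_loss, w) - hessian(toy_loss)(w)))
# the difference should be on the order of eps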
def hessian_log_likelihood(self, params: np.ndarray, x: np.ndarray,
                           idx_params: np.ndarray):
    return hessian(self.log_likelihood, argnum=0)(params, x, idx_params)
def part2(target, link_length, min_roll, max_roll, min_pitch, max_pitch,
          min_yaw, max_yaw, obstacles):
    """Function that uses optimization to do inverse kinematics for a snake robot

    Args:
        target:  [x, y, z, q0, q1, q2, q3]' position and orientation of the end effector
        link_length:  Nx1 vectors of the lengths of the links
        min_xxx, max_xxx are the vectors of the limits on the roll, pitch, yaw of each link.
        obstacles: A Mx4 matrix where each row is [ x y z radius ] of a sphere obstacle. M obstacles.

    Returns:
        r: N vector of roll
        p: N vector of pitch
        y: N vector of yaw
    """
    N = len(link_length)

    def func(x0):
        pos = np.array([0, 0, 0])
        qM = np.eye(3)
        rs = x0[:N]
        ps = x0[N:2*N]
        ys = x0[2*N:]
        ll = link_length
        for r, p, y, l in zip(rs, ps, ys, ll):
            pos, qM = fwd(pos, l, r, p, y, qM)
        t = np.array(target)
        C = 1  # meters and radians. Close enough
        extra = 0.0
        for ob in obstacles:
            p0 = np.array([0, 0, 0])
            q = np.eye(3)
            for r, p, y, l in zip(rs, ps, ys, ll):
                p1, q = fwd(p0, l, r, p, y, q)
                i1, i2 = sphere_line_intersection(p0, p1, ob[:3], ob[3])
                p0 = p1
                if i1 is not None:
                    i1 = np.array(i1)
                    extra += (((ob[:3] - i1)**2 - ob[3])**2).sum()
                if i2 is not None:
                    i2 = np.array(i2)
                    extra += (((ob[:3] - i2)**2 - ob[3])**2).sum()
        quat = transforms3d.quaternions.mat2quat(qM)
        rot_error = 1.0 - ((quat * np.array([t[3], -t[4], -t[5], -t[6]]))**2).sum()
        return ((pos[:3] - t[:3])**2).sum() + C * rot_error + extra

    bounds = [(x, y) for x, y in zip(min_roll, max_roll)]
    bounds = bounds + [(x, y) for x, y in zip(min_pitch, max_pitch)]
    bounds = bounds + [(x, y) for x, y in zip(min_yaw, max_yaw)]

    midpoint = lambda mn, mx: mn + 0.5 * (mx - mn)
    x0 = [midpoint(min_roll[i], max_roll[i]) for i in range(N)] \
        + [midpoint(min_pitch[i], max_pitch[i]) for i in range(N)] \
        + [midpoint(min_yaw[i], max_yaw[i]) for i in range(N)]
    x0 = np.array(x0) + 1e-6

    jac = grad(func)
    hess = hessian(func)

    def jac_reg(x):
        j = jac(x)
        if np.isfinite(j).all():
            return j
        else:
            return opt.approx_fprime(x0, func, 1e-6)

    print(jac(x0))

    if False:
        # quat should be norm 1 ?
        eps = 1e-3
        constraints = [{'type:': 'eq',
                        'fun': lambda x: (x[3]**2 + x[4]**2 + x[5]**2 + x[6]**2) > 1.0 - eps}]
        constraints = constraints + [{'type:': 'eq',
                                      'fun': lambda x: (x[3]**2 + x[4]**2 + x[5]**2 + x[6]**2) < 1.0 + eps}]
    else:
        constraints = []

    for ob in obstacles:
        pass  # soft for now?

    # I think only method='SLSQP' is good?
    # L-BFGS-B, TNC and SLSQP
    # Powell, SLSQP, COBYLA
    if False:
        import cma
        es = cma.CMAEvolutionStrategy(x0, pi / 2.0, {'bounds': list(zip(*bounds))})
        es.optimize(func)
        print(es.result_pretty())
        resx = es.result.xbest
    else:
        res = opt.minimize(func, x0=x0, bounds=bounds, constraints=constraints,
                           method='Powell', jac=jac_reg)
        print(res)
        resx = res.x

    return resx[:N], resx[N:2*N], resx[2*N:]
        raise ValueError('Upper bound must be greater than lower bound')
    if ub == float("inf"):
        if lb == -float("inf"):
            # TODO: I'm not sure this copy works with autodiff.
            return copy.copy(free_vec)
        else:
            return np.exp(free_vec) + lb
    else:  # the upper bound is finite
        if lb == -float("inf"):
            return ub - np.exp(-1 * free_vec)
        else:
            exp_vec = np.exp(free_vec)
            return (ub - lb) * exp_vec / (1 + exp_vec) + lb

constrain_scalar_jac = autograd.jacobian(constrain)
constrain_scalar_hess = autograd.hessian(constrain)


def get_inbounds_value(lb, ub):
    assert lb < ub
    if lb > -float('inf') and ub < float('inf'):
        return 0.5 * (ub - lb)
    else:
        if lb > -float('inf'):
            # The upper bound is infinite.
            return lb + 1.0
        elif ub < float('inf'):
            # The lower bound is infinite.
            return ub - 1.0
        else:
            # Both are infinite.
            return 0.0
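# The finite-box branch above is a scaled logistic transform. As a minimal sketch of how
# it composes with autograd's jacobian/hessian wrappers (the standalone constrain_scalar
# below is a reconstruction for illustration only; the fragment above omits the actual
# function signature):
import autograd
import autograd.numpy as np

def constrain_scalar(free_val, lb, ub):
    # finite lb < ub branch only: map the real line into (lb, ub)
    exp_val = np.exp(free_val)
    return (ub - lb) * exp_val / (1 + exp_val) + lb

constrain_grad = autograd.grad(constrain_scalar)     # derivative w.r.t. free_val
constrain_hess = autograd.hessian(constrain_scalar)  # second derivative w.r.t. free_val

print(constrain_scalar(0.0, -1.0, 1.0))  # midpoint of the box: 0.0
print(constrain_grad(0.0, -1.0, 1.0))    # slope (ub - lb) / 4 = 0.5 at free_val = 0
print(constrain_hess(0.0, -1.0, 1.0))    # ~0 at the sigmoid's inflection point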
#scale value function
def square_loss(pred, y):
    return 0.5 * (y - pred)**2

def logistic_loss(pred, y):
    return -(y * anp.log(pred) + (1 - y) * anp.log(1 - pred))

# square_loss_grad(pred,y)
# square_loss_hess(pred,y)
square_loss_grad = grad(square_loss)     # square_loss with respect to pred, grad = pred - y
square_loss_hess = hessian(square_loss)  # square_loss_grad with respect to pred, hess = 1
print square_loss_grad(0.0, 0.5)  # -0.5
print square_loss_hess(0.0, 0.5)  # 1

# logistic_loss_grad(pred,y)
# logistic_loss_hess(pred,y)
logistic_loss_grad = grad(logistic_loss)     # logistic_loss with respect to pred, grad = (1-y)/(1-pred) - y/pred
logistic_loss_hess = hessian(logistic_loss)  # logistic_loss_grad with respect to pred, hess = y/pred**2 + (1-y)/(1-pred)**2
print logistic_loss_grad(0.2, 0)  # 1.25
print logistic_loss_hess(0.2, 0)  # 1.5625
def newtons_method(g, max_its, w, **kwargs):
    # flatten input function, in case it takes in matrices of weights
    flat_g, unflatten, w = flatten_func(g, w)

    # compute the gradient / hessian functions of our input function -
    # note these are themselves functions. In particular the gradient -
    # - when evaluated - returns both the gradient and function evaluations (remember
    # as discussed in Chapter 3 we always get the function evaluation 'for free' when we use
    # an Automatic Differentiator to evaluate the gradient)
    gradient = value_and_grad(flat_g)
    hess = hessian(flat_g)

    # set numerical stability parameter / regularization parameter
    epsilon = 10**(-7)
    if 'epsilon' in kwargs:
        epsilon = kwargs['epsilon']

    # run the newtons method loop
    weight_history = []  # container for weight history
    cost_history = []    # container for corresponding cost function history
    for k in range(max_its):
        # evaluate the gradient, store current weights and cost function value
        cost_eval, grad_eval = gradient(w)
        weight_history.append(unflatten(w))
        cost_history.append(cost_eval)

        # evaluate the hessian
        hess_eval = hess(w)

        # reshape for numpy linalg functionality
        hess_eval.shape = (int((np.size(hess_eval))**(0.5)),
                           int((np.size(hess_eval))**(0.5)))

        # solve second order system for weight update
        w = w - np.dot(np.linalg.pinv(hess_eval + epsilon * np.eye(np.size(w))),
                       grad_eval)

    # collect final weights
    weight_history.append(unflatten(w))
    # compute final cost function value via g itself (since we aren't computing
    # the gradient at the final step we don't get the final cost function value
    # via the Automatic Differentiator)
    cost_history.append(flat_g(w))
    return weight_history, cost_history


# gradient descent function - inputs: g (input function), alpha (steplength parameter),
# max_its (maximum number of iterations), w (initialization)
def gradient_descent(g, alpha_choice, max_its, w):
    # compute the gradient function of our input function - note this is a function too
    # that - when evaluated - returns both the gradient and function evaluations (remember
    # as discussed in Chapter 3 we always get the function evaluation 'for free' when we use
    # an Automatic Differentiator to evaluate the gradient)
    gradient = value_and_grad(g)

    # run the gradient descent loop
    weight_history = []  # container for weight history
    cost_history = []    # container for corresponding cost function history
    alpha = 0
    for k in range(1, max_its + 1):
        # check if diminishing steplength rule used
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k)
        else:
            alpha = alpha_choice

        # evaluate the gradient, store current weights and cost function value
        cost_eval, grad_eval = gradient(w)
        weight_history.append(w)
        cost_history.append(cost_eval)

        # take gradient descent step
        w = w - alpha * grad_eval

    # collect final weights
    weight_history.append(w)
    # compute final cost function value via g itself (since we aren't computing
    # the gradient at the final step we don't get the final cost function value
    # via the Automatic Differentiator)
    cost_history.append(g(w))
    return weight_history, cost_history
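# A minimal usage sketch for newtons_method above, assuming the function and its autograd
# dependencies (flatten_func, value_and_grad, hessian) are already in scope; the quadratic
# cost g is purely illustrative:
import autograd.numpy as np
import autograd.numpy.random as npr

g = lambda w: np.sum(w**2) + 2.0  # strictly convex, minimum value 2.0 at w = 0
w0 = npr.randn(3)

weight_history, cost_history = newtons_method(g, max_its=5, w=w0)
print(cost_history[-1])  # should be very close to 2.0 after a single Newton step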
def run(self, theta, niter=10, tol=.0001, verbose=False, path=""):
    """ runs NPV for ... iterations

    mimics npv_run.m from Sam Gershman's original matlab code

    USAGE: [F mu s2] = npv_run(nlogpdf,theta,[nIter])

    INPUTS:
      theta - [N x D+1] initial parameter settings, where
              N is the number of components,
              D is the number of latent variables in the model,
              and the last column contains the log bandwidths (variances)
      nIter (optional) - maximum number of iterations (default: 10)
      tol (optional) - change in the evidence lower bound (ELBO) for
              convergence (default: 0.0001)

    OUTPUTS:
      F  - [nIter x 1] approximate ELBO value at each iteration
      mu - [N x D] component means
      s2 - [N x 1] component bandwidths
    """
    N, Dpp = theta.shape
    D = Dpp - 1

    # set LBFGS optim arguments
    disp = 10 if verbose else None
    opts = {'disp': disp, 'maxiter': 5000, 'gtol': 1e-7, 'ftol': 1e-7}  #, 'factr':1e2}

    elbo_vals = np.zeros(niter)
    timestamps = []
    timestamps.append(time())
    for ii in xrange(niter):
        elbo_vals[ii] = self.mc_elbo(theta)
        print "iteration %d (elbo = %2.4f)" % (ii, elbo_vals[ii])

        # first-order approximation (L1): optimize mu, one component at a time
        print " ... optimizing mus "
        for n in xrange(N):
            print " ... %d / %d " % (n, N)
            fun, gfun = self.make_elbo1_funs(theta, n)
            res = minimize(fun, x0=theta[n, :D], jac=gfun,
                           method='L-BFGS-B', options=opts)
            theta[n, :D] = res.x
            #print theta[:,:D]
            #print " ... elbo: ", self.mc_elbo(theta)

        # second-order approximation (L2): optimize s2
        print " ... optimizing sigmas"
        mu = theta[:, :D]
        h = np.zeros(N)
        for n in xrange(N):
            # compute Hessian trace using finite differencing or autograd
            h[n] = np.sum(np.diag(hessian(self.lnpdf)(mu[n])))
        fun, gfun = self.make_elbo2_funs(theta, h)
        res = minimize(fun, x0=theta[:, -1], jac=gfun,
                       method='L-BFGS-B', options=opts)
        theta = np.column_stack([mu, res.x])

        # mmd_samples = mogsamples(2000, theta)
        if (ii % 5 == 0):
            timestamps.append(time())
            np.savez(path + '/iter' + str(ii) + "of" + str(niter) + ".npz",
                     timestamps=timestamps,
                     mu=mu,
                     sigma=np.exp(theta[:, -1]) + self.s2min,
                     n_feval=self.lnpdf.counter)

        # calculate the approximate ELBO (L2)
        #if (ii > 1) and (np.abs(elbo_vals[ii] - elbo_vals[ii-1] < tol))
        # TODO check for convergence
        #if (ii > 1) and (np.abs(F[ii]-F[ii-1]) < tol)
        #    break
        # end % check for convergence

    # unpack params and return
    mu = theta[:, :D]
    s2 = np.exp(theta[:, -1]) + self.s2min
    return mu, s2, elbo_vals, theta
def test_objective(self):
    model = Model(dim=3)
    objective = obj_lib.Objective(par=model.x, fun=model.f)

    model.set_inits()
    x_free = model.x.get_free()
    x_vec = model.x.get_vector()

    model.set_opt()
    self.assertTrue(objective.fun_free(x_free) > 0.0)
    np_test.assert_array_almost_equal(objective.fun_free(x_free),
                                      objective.fun_vector(x_vec))

    grad = objective.fun_free_grad(x_free)
    hess = objective.fun_free_hessian(x_free)
    np_test.assert_array_almost_equal(np.matmul(hess, grad),
                                      objective.fun_free_hvp(x_free, grad))

    self.assertTrue(objective.fun_vector(x_vec) > 0.0)
    grad = objective.fun_vector_grad(x_vec)
    hess = objective.fun_vector_hessian(x_vec)
    np_test.assert_array_almost_equal(np.matmul(hess, grad),
                                      objective.fun_vector_hvp(x_free, grad))

    # Test Jacobians.
    vec_objective = obj_lib.Objective(par=model.x, fun=model.get_x_vec)
    vec_jac = vec_objective.fun_vector_jacobian(x_vec)
    np_test.assert_array_almost_equal(model.b_mat, vec_jac)

    free_jac = vec_objective.fun_free_jacobian(x_free)
    x_free_to_vec_jac = model.x.free_to_vector_jac(x_free).todense()
    np_test.assert_array_almost_equal(
        np.matmul(model.b_mat, np.transpose(x_free_to_vec_jac)), free_jac)

    # Test the preconditioning
    preconditioner = 2.0 * np.eye(model.dim)
    preconditioner[model.dim - 1, 0] = 0.1  # Add asymmetry for testing!
    objective.preconditioner = preconditioner

    np_test.assert_array_almost_equal(
        objective.fun_free_cond(x_free),
        objective.fun_free(np.matmul(preconditioner, x_free)),
        err_msg='Conditioned function values')

    fun_free_cond_grad = autograd.grad(objective.fun_free_cond)
    grad_cond = objective.fun_free_grad_cond(x_free)
    np_test.assert_array_almost_equal(
        fun_free_cond_grad(x_free), grad_cond,
        err_msg='Conditioned gradient values')

    fun_free_cond_hessian = autograd.hessian(objective.fun_free_cond)
    hess_cond = objective.fun_free_hessian_cond(x_free)
    np_test.assert_array_almost_equal(
        fun_free_cond_hessian(x_free), hess_cond,
        err_msg='Conditioned Hessian values')

    fun_free_cond_hvp = autograd.hessian_vector_product(objective.fun_free_cond)
    np_test.assert_array_almost_equal(
        fun_free_cond_hvp(x_free, grad_cond),
        objective.fun_free_hvp_cond(x_free, grad_cond),
        err_msg='Conditioned Hessian vector product values')
    elp += np.sum(Ez * log_likes)
    # assert np.all(np.isfinite(elp))
    return -1 * elp / scale

def hessian_neg_expected_log_joint(x, Ez, Ezzp1, scale=1):
    T, D = np.shape(x)
    x_mask = np.ones((T, D), dtype=bool)
    hessian_diag, hessian_lower_diag = latent_ddm.dynamics.hessian_expected_log_dynamics_prob(Ez, x, input, x_mask, tag)
    hessian_diag[:-1] += latent_ddm.transitions.hessian_expected_log_trans_prob(x, input, x_mask, tag, Ezzp1)
    hessian_diag += latent_ddm.emissions.hessian_log_emissions_prob(data, input, mask, tag, x)

    # The Hessian of the log probability should be *negative* definite since we are *maximizing* it.
    # hessian_diag -= 1e-8 * np.eye(D)

    # Return the scaled negative hessian, which is positive definite
    return -1 * hessian_diag / scale, -1 * hessian_lower_diag / scale

from autograd import hessian
from ssm.primitives import blocks_to_full

hess = hessian(neg_expected_log_joint)
H_autograd = hess(x, Ez, Ezzp1).reshape((T, T))

H_diag, H_lower_diag = hessian_neg_expected_log_joint(x, Ez, Ezzp1)
H = blocks_to_full(H_diag, H_lower_diag)

assert np.allclose(H, H_autograd)
print("All close: ", np.allclose(H, H_autograd, rtol=1e-8, atol=1e-8))
print("Norm difference: ", np.linalg.norm(H - H_autograd))
def optimize(W0, compute_hessian=False):
    def compute_fprime_(Eta, Xi, s02):
        # Wmx,Wmy,Wsx,Wsy,s02,k,kappa,XX,YY,Eta,Xi = parse_W(W)
        # WWx,WWy = [gen_Weight(W,k,kappa) for W in [Wx,Wy]]
        return fprime_m(Eta, Xi**2 + np.concatenate([s02 for ipixel in range(nS*nT)])) * Xi

    def compute_f_(Eta, Xi, s02):
        return pop_rate_fn(Eta, Xi**2 + np.concatenate([s02 for ipixel in range(nS*nT)], axis=0))

    def compute_f_fprime_t_(W, perturbation):
        Wmx, Wmy, Wsx, Wsy, s02, k, kappa, T, XX, XXp, Eta, Xi, h1, h2 = parse_W(W)
        fval = compute_f_(Eta, Xi, s02)
        resEta = Eta - u_fn(XX, fval, Wmx, Wmy, k, kappa, T)
        resXi = Xi - u_fn(XX, fval, Wsx, Wsy, k, kappa)
        YY = fval + perturbation

        def dYYdt(YY, Eta1, Xi1):
            return -YY + compute_f_(Eta1, Xi1, s02)

        for t in range(niter):
            Eta1 = resEta + u_fn(XX, YY, Wmx, Wmy, k, kappa)
            Xi1 = resXi + u_fn(XX, YY, Wsx, Wsy, k, kappa)
            YY = YY + dt * dYYdt(YY, Eta1, Xi1)

        YYprime = compute_fprime_(Eta1, Xi1, s02)
        return YY, YYprime

    def compute_f_fprime_t_avg_(W, perturbation, burn_in=0.5):
        Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, XX, XXp, Eta, Xi, h1, h2 = parse_W(W)
        fval = compute_f_(Eta, Xi, s02)
        resEta = Eta - u_fn(XX, fval, Wmx, Wmy, K, kappa, T)
        resXi = Xi - u_fn(XX, fval, Wsx, Wsy, K, kappa, T)
        YY = fval + perturbation
        YYmean = np.zeros_like(Eta)
        YYprimemean = np.zeros_like(Eta)

        def dYYdt(YY, Eta1, Xi1):
            return -YY + compute_f_(Eta1, Xi1, s02)

        for t in range(niter):
            Eta1 = resEta + u_fn(XX, YY, Wmx, Wmy, K, kappa, T)
            Xi1 = resXi + u_fn(XX, YY, Wsx, Wsy, K, kappa, T)
            YY = YY + dt * dYYdt(YY, Eta1, Xi1)
            if t > niter * burn_in:
                YYprime = compute_fprime_(Eta1, Xi1, s02)
                YYmean = YYmean + 1/niter/burn_in * YY
                YYprimemean = YYprimemean + 1/niter/burn_in * YYprime

        return YYmean, YYprimemean

    def u_fn(XX, YY, Wx, Wy, K, kappa, T):
        WWx, WWy = [gen_Weight(W, K, kappa, T) for W in [Wx, Wy]]
        #print(WWx.shape)
        #print(WWy.shape)
        #print_labeled('WWx',WWx)
        #print_labeled('WWy',WWy)
        #plt.figure(1)
        #plt.imshow(WWy)
        #plt.savefig('WWy.jpg',dpi=300)
        return XX @ WWx + YY @ WWy

    def minusLW(W):
        def compute_sq_error(a, b, wt):
            return np.sum(wt * (a - b)**2)

        def compute_kl_error(mu_data, pc_list, mu_model, fprimeval, wt):
            # how to model variability in X?
            kl = compute_kl_divergence(fprimeval, noise, mu_data, mu_model, pc_list)
            return kl  #wt*kl # principled way would be to use 1/wt for noise term. Should add later.

        def compute_opto_error(W):
            Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, XX, XXp, Eta, Xi, h1, h2 = parse_W(W)
            WWy = gen_Weight(Wmy, K, kappa, T)
            Phi = fprime_m(Eta, Xi**2 + np.concatenate([s02 for ipixel in range(nS*nT)]))
            dHH = np.zeros((nN, nQ*nS*nT))
            dHH[:, np.arange(2, nQ*nS*nT, nQ)] = 1
            dHH = dHH * h
            print('dYY: ' + str(dYY.shape))
            print('Phi: ' + str(Phi.shape))
            print('dHH: ' + str(dHH.shape))
            cost = np.sum((dYY - (dYY @ WWy) * Phi - dHH * Phi)**2)
            return cost

        def compute_opto_error_with_inv(W):
            Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, XX, XXp, Eta, Xi, h1, h2 = parse_W(W)
            WWy = gen_Weight(Wmy, K, kappa, T)
            Phi = fprime_m(Eta, Xi**2 + np.concatenate([s02 for ipixel in range(nS*nT)]))
            Phi = np.concatenate((Phi, Phi), axis=0)
            Phi1 = np.array([np.diag(phi) for phi in Phi])
            invmat = np.array([np.linalg.inv(np.eye(nQ*nS*nT) - WWy @ phi1) for phi1 in Phi1])
            dHH = np.zeros((nN, nQ*nS*nT))
            dHH[:, np.arange(2, nQ*nS*nT, nQ)] = 1
            dHH = np.concatenate((dHH*h1, dHH*h2), axis=0)
            print('dYY: ' + str(dYY.shape))
            print('Phi: ' + str(Phi.shape))
            print('dHH: ' + str(dHH.shape))
            invprod = np.einsum('ij,ijk->ik', dHH, Phi1)
            invprod = np.einsum('ij,ijk->ik', invprod, invmat)
            #invprod = np.array([dhh @ phi1 @ lil_invmat for dhh,phi1,this_invmat in zip(dHH,Phi1,invmat)])
            cost = np.sum((dYY[opto_mask] - invprod[opto_mask])**2)
            return cost

        def compute_isn_error(W):
            Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, XX, XXp, Eta, Xi, h1, h2 = parse_W(W)
            Phi = fprime_m(Eta, Xi**2 + np.concatenate([s02 for ipixel in range(nS*nT)]))
            #print('min Eta: %f'%np.min(Eta[:,0]))
            #print('WEE: %f'%Wmy[0,0])
            #print('min phiE*WEE: %f'%np.min(Phi[:,0]*Wmy[0,0]))
            cost = -np.sum(np.log(Phi[:, 0] * Wmy[0, 0] - 1))
            #print('ISN cost: %f'%cost)
            return cost

        Wmx, Wmy, Wsx, Wsy, s02, K, kappa, T, XX, XXp, Eta, Xi, h1, h2 = parse_W(W)
        #print_labeled('T',T)
        #print_labeled('K',K)
        #print_labeled('Wmy',Wmy)
        perturbation = perturbation_size * np.random.randn(*Eta.shape)
        # fval,fprimeval = compute_f_fprime_t_(W,perturbation)  # Eta the mean input per cell, Xi the stdev. input per cell, s02 the baseline variability in input
        fval, fprimeval = compute_f_fprime_t_avg_(W, perturbation)  # Eta the mean input per cell, Xi the stdev. input per cell, s02 the baseline variability in input
        #print_labeled('fval',fval)

        Xterm = compute_kl_error(XXhat, Xpc_list, XX, XXp, wtStim*wtInp)  # XX the modeled input layer (L4)
        Yterm = compute_kl_error(YYhat, Ypc_list, fval, fprimeval, wtStim*wtCell)  # fval the modeled output layer (L2/3)
        Etaterm = compute_sq_error(Eta, u_fn(XX, fval, Wmx, Wmy, K, kappa, T), wtStim*wtCell)  # magnitude of fudge factor in mean input
        Xiterm = compute_sq_error(Xi, u_fn(XX, fval, Wsx, Wsy, K, kappa, T), wtStim*wtCell)  # magnitude of fudge factor in input variability
        # returns value float
        #Optoterm = compute_opto_error(W)
        Optoterm = compute_opto_error_with_inv(W)  #testing out 8/20/20
        cost = wtX*Xterm + wtY*Yterm + wtEta*Etaterm + wtXi*Xiterm + wtOpto*Optoterm
        if constrain_isn:
            ISNterm = compute_isn_error(W)
            cost = cost + wtISN*ISNterm

        if isinstance(Xterm, float):
            print('X:%f' % (wtX*Xterm))
            print('Y:%f' % (wtY*Yterm))
            print('Eta:%f' % (wtEta*Etaterm))
            print('Xi:%f' % (wtXi*Xiterm))
            print('Opto:%f' % (wtOpto*Optoterm))
            if constrain_isn:
                print('ISN:%f' % (wtISN*ISNterm))

        #lbls = ['Yterm']
        #vars = [Yterm]
        lbls = ['cost']
        vars = [cost]
        for lbl, var in zip(lbls, vars):
            print_labeled(lbl, var)

        return cost

    def minusdLdW(W):
        # returns value (R,)
        # sum in first dimension: (N,1) times (N,1) times (N,P)
        # return jacobian(minusLW)(W)
        return grad(minusLW)(W)

    def fix_violations(w, bounds):
        lb = np.array([b[0] for b in bounds])
        ub = np.array([b[1] for b in bounds])
        lb_violation = w < lb
        ub_violation = w > ub
        w[lb_violation] = lb[lb_violation]
        w[ub_violation] = ub[ub_violation]
        return w, lb_violation, ub_violation

    def sorted_r_eigs(w):
        drW, prW = np.linalg.eig(w)
        srtinds = np.argsort(drW)
        return drW[srtinds], prW[:, srtinds]

    def compute_eig_penalty_(Wmy, K0, kappa, T0):
        # still need to finish! Hopefully won't need
        # need to fix this to reflect addition of kappa argument
        Wsquig = gen_Weight(Wmy, K0, kappa, T0)
        drW, prW = sorted_r_eigs(Wsquig - np.eye(nQ*nS*nT))
        plW = np.linalg.inv(prW)
        eig_outer_all = [np.real(np.outer(plW[:, k], prW[k, :])) for k in range(nS*nQ*nT)]
        eig_penalty_size_all = [barrier_wt / np.abs(np.real(drW[k])) for k in range(nS*nQ*nT)]
        eig_penalty_dir_w = [
            eig_penalty_size * ((eig_outer[:nQ, :nQ] + eig_outer[nQ:, nQ:])
                                + K0[np.newaxis, :] * (eig_outer[:nQ, nQ:] + kappa*eig_outer[nQ:, :nQ]))
            for eig_outer, eig_penalty_size in zip(eig_outer_all, eig_penalty_size_all)
        ]
        eig_penalty_dir_k = [
            eig_penalty_size * ((eig_outer[:nQ, nQ:] + eig_outer[nQ:, :nQ]*kappa) * W0my).sum(0)
            for eig_outer, eig_penalty_size in zip(eig_outer_all, eig_penalty_size_all)
        ]
        eig_penalty_dir_kappa = [
            eig_penalty_size * (eig_outer[nQ:, :nQ] * k0[np.newaxis, :] * W0my).sum().reshape((1,))
            for eig_outer, eig_penalty_size in zip(eig_outer_all, eig_penalty_size_all)
        ]
        eig_penalty_dir_w = np.array(eig_penalty_dir_w).sum(0)
        eig_penalty_dir_k = np.array(eig_penalty_dir_k).sum(0)
        eig_penalty_dir_kappa = np.array(eig_penalty_dir_kappa).sum(0)
        return eig_penalty_dir_w, eig_penalty_dir_k, eig_penalty_dir_kappa

    def compute_eig_penalty(W):
        # still need to finish! Hopefully won't need
        W0mx, W0my, W0sx, W0sy, s020, K0, kappa0, T0, XX0, XXp0, Eta0, Xi0, h0 = parse_W(W)
        eig_penalty_dir_w, eig_penalty_dir_k, eig_penalty_dir_kappa = compute_eig_penalty_(W0my, k0, kappa0)
        eig_penalty_W = unparse_W(np.zeros_like(W0mx), eig_penalty_dir_w, np.zeros_like(W0sx),
                                  np.zeros_like(W0sy), np.zeros_like(s020), eig_penalty_dir_k,
                                  eig_penalty_dir_kappa, np.zeros_like(XX0), np.zeros_like(XXp0),
                                  np.zeros_like(Eta0), np.zeros_like(Xi0))
        # assert(True==False)
        return eig_penalty_W

    allhot = np.zeros(W0.shape)
    allhot[:nP*nQ+nQ**2] = 1
    W_l2_reg = lambda W: np.sum((W*allhot)**2)
    f = lambda W: minusLW(W) + l2_penalty*W_l2_reg(W)
    fprime = lambda W: minusdLdW(W) + 2*l2_penalty*W*allhot

    fix_violations(W0, bounds)

    W1, loss, result = sop.fmin_l_bfgs_b(f, W0, fprime=fprime, bounds=bounds,
                                         factr=1e4, maxiter=int(1e3))

    if compute_hessian:
        gr = grad(minusLW)(W1)
        hess = hessian(minusLW)(W1)
    else:
        gr = None
        hess = None

    # W0mx,W0my,W0sx,W0sy,s020,k0,kappa0,XX0,XXp0,Eta0,Xi0 = parse_W(W1)
    return W1, loss, gr, hess, result
def test_laplace_em_hessian(N=5, K=3, D=2, T=20):
    for transitions in ["standard", "recurrent", "recurrent_only"]:
        for emissions in ["gaussian_orthog", "gaussian"]:
            print("Checking analytical hessian for transitions={}, "
                  "and emissions={}".format(transitions, emissions))
            slds = ssm.SLDS(N, K, D, transitions=transitions,
                            dynamics="gaussian", emissions=emissions)
            z, x, y = slds.sample(T)
            new_slds = ssm.SLDS(N, K, D, transitions="standard",
                                dynamics="gaussian", emissions=emissions)

            inputs = [np.zeros((T, 0))]
            masks = [np.ones_like(y)]
            tags = [None]
            method = "laplace_em"
            datas = [y]
            num_samples = 1

            def neg_expected_log_joint_wrapper(x_vec, T, D):
                x = x_vec.reshape(T, D)
                return new_slds._laplace_neg_expected_log_joint(
                    datas[0], inputs[0], masks[0], tags[0], x, Ez, Ezzp1)

            variational_posterior = new_slds._make_variational_posterior(
                "structured_meanfield", datas, inputs, masks, tags, method)
            new_slds._fit_laplace_em_discrete_state_update(
                variational_posterior, datas, inputs, masks, tags, num_samples)
            Ez, Ezzp1, _ = variational_posterior.discrete_expectations[0]

            x = variational_posterior.mean_continuous_states[0]
            scale = x.size
            J_diag, J_lower_diag = new_slds._laplace_hessian_neg_expected_log_joint(
                datas[0], inputs[0], masks[0], tags[0], x, Ez, Ezzp1)

            dense_hessian = scipy.linalg.block_diag(*[x for x in J_diag])
            dense_hessian[D:, :-D] += scipy.linalg.block_diag(*[x for x in J_lower_diag])
            dense_hessian[:-D, D:] += scipy.linalg.block_diag(*[x.T for x in J_lower_diag])

            true_hess = hessian(neg_expected_log_joint_wrapper)(x.reshape(-1), T, D)
            assert np.allclose(true_hess, dense_hessian)
            print("Hessian passed.")

            # Also check that computation of H works.
            h_dense = dense_hessian @ x.reshape(-1)
            h_dense = h_dense.reshape(T, D)

            J_ini, J_dyn_11, J_dyn_21, J_dyn_22, J_obs = new_slds._laplace_neg_hessian_params(
                datas[0], inputs[0], masks[0], tags[0], x, Ez, Ezzp1)
            h_ini, h_dyn_1, h_dyn_2, h_obs = new_slds._laplace_neg_hessian_params_to_hs(
                x, J_ini, J_dyn_11, J_dyn_21, J_dyn_22, J_obs)

            h = h_obs.copy()
            h[0] += h_ini
            h[:-1] += h_dyn_1
            h[1:] += h_dyn_2

            assert np.allclose(h, h_dense)
        return -np.inf
    return lp + log_likelihood(th)

def nll(th):
    return -log_likelihood(th)

def nlpost(th):
    return -log_probability(th)

dnll = jacobian(nll)
dnlpost = jacobian(nlpost)
ddnlpost = hessian(nlpost)

time_bnds = [1e-5, pd['nt']]
ang_bnds = [0, 2 * np.pi]
bounds_dict = {
    'tau': [10, 500],
    'diff': [1e-5, 1e-1],
    'xpos': [-0.5, 0.5],
    'ypos': [-0.5, 0.5],
    'a0': [-10, 10],
    'a3': ang_bnds,
    'b0': [-50, 50],
    'b1': time_bnds,
    'b2': time_bnds,
    'b3': ang_bnds,
    'c0': [-100, 100],
def ggnvp_maker(x):
    J = jacobian(f)(x)
    H = hessian(g)(f(x))
    def ggnvp(v):
        return np.dot(J.T, np.dot(H, np.dot(J, v)))
    return ggnvp
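# A small self-contained check of the dense construction above: for a linear inner map
# f(x) = A @ x and g(y) = 0.5 * sum(y**2), the Gauss-Newton-vector product J^T H J v
# reduces to A^T A v. The matrices and test point here are illustrative only.
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import jacobian, hessian

A = npr.randn(5, 4)
f = lambda x: np.dot(A, x)
g = lambda y: 0.5 * np.sum(y**2)

def dense_ggnvp_maker(x):
    J = jacobian(f)(x)    # Jacobian of the inner map at x
    H = hessian(g)(f(x))  # Hessian of the outer loss at f(x)
    def ggnvp(v):
        return np.dot(J.T, np.dot(H, np.dot(J, v)))
    return ggnvp

x, v = npr.randn(4), npr.randn(4)
print(np.allclose(dense_ggnvp_maker(x)(v), np.dot(A.T, np.dot(A, v))))  # expect True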
def compute_dParams_dWeights(self, some_example_weights,
                             solver_method='cholesky', non_fixed_dims=None,
                             rank=-1, **kwargs):
    '''
    sets self.jacobian = dParams_dxn for each datapoint x_n

    rank = -1 uses a full-rank matrix solve (i.e. np.linalg.solve on the full
      Hessian). A positive integer uses a low rank approximation in
      inverse_hessian_vector_product
    '''
    if non_fixed_dims is None:
        non_fixed_dims = np.arange(self.params.get_free().shape[0])
    if len(non_fixed_dims) == 0:
        self.dParams_dWeights = np.zeros((0, some_example_weights.shape[0]))
        return

    dObj_dParams = autograd.jacobian(self.weighted_model_objective, argnum=1)
    d2Obj_dParams2 = autograd.jacobian(dObj_dParams, argnum=1)
    d2Obj_dParamsdWeights = autograd.jacobian(dObj_dParams, argnum=0)

    # Have to re-copy this into self.params after every autograd call, as
    # autograd turns self.params.get_free() into an ArrayBox (whereas we want
    # it to be a numpy array)
    #array_box_go_away = self.params.get_free().copy()
    #cur_weights = self.example_weights.copy()

    start = time.time()
    grads = self.compute_gradients(some_example_weights)
    X = self.training_data.X

    if solver_method == 'cholesky':
        eval_reg_hess = autograd.hessian(self.regularization)
        tmp = self.params.get_free().copy()
        reg_hess = eval_reg_hess(self.params.get_free())
        reg_hess[-1, :] = 0.0
        reg_hess[:, -1] = 0.0
        self.params.set_free(tmp)
        self.dParams_dWeights = -solvers.ihvp_cholesky(
            grads, X, self.D2, regularizer_hessian=reg_hess)
    elif solver_method == 'agarwal':
        eval_reg_hess = autograd.hessian(self.regularization)
        tmp = self.params.get_free().copy()
        reg_hess = eval_reg_hess(self.params.get_free())
        reg_hess[-1, :] = 0.0
        reg_hess[:, -1] = 0.0
        self.params.set_free(tmp)
        self.dParams_dWeights = -solvers.ihvp_agarwal(
            grads, X, self.D2, regularizer_hessian=reg_hess, **kwargs)
    elif solver_method == 'lanczos':
        print('NOTE lanczos currently assumes l2 regularization')
        self.dParams_dWeights = -solvers.ihvp_exactEvecs(
            grads, X, self.D2, rank=rank, L2Lambda=self.L2Lambda)
    elif solver_method == 'tropp':
        print('NOTE tropp currently assumes l2 regularization')
        self.dParams_dWeights = -solvers.ihvp_tropp(
            grads, X, self.D2, L2Lambda=self.L2Lambda, rank=rank)

    #self.params.set_free(array_box_go_away)
    #self.example_weights = cur_weights
    self.non_fixed_dims = non_fixed_dims
def test_hessian_matrix_product():
    fun = lambda a: np.sum(np.sin(a))
    a = npr.randn(5, 4)
    V = npr.randn(5, 4)
    H = hessian(fun)(a)
    check_equivalent(np.tensordot(H, V), hessian_vector_product(fun)(a, V))
    # We have double counted the diagonal.  For some reason the autograd
    # diagonal functions require axis1=-1 and axis2=-2
    mat_val = mat_val - \
        np.make_diagonal(np.diagonal(ld_mat, axis1=-1, axis2=-2),
                         axis1=-1, axis2=-2)
    return mat_val

def pos_def_matrix_free_to_vector(free_val, diag_lb=0.0):
    mat_val = unpack_posdef_matrix(free_val, diag_lb=diag_lb)
    return vectorize_ld_matrix(mat_val)

pos_def_matrix_free_to_vector_jac = \
    autograd.jacobian(pos_def_matrix_free_to_vector)
pos_def_matrix_free_to_vector_hess = \
    autograd.hessian(pos_def_matrix_free_to_vector)


class PosDefMatrixParam(object):
    def __init__(self, name='', size=2, diag_lb=0.0, val=None):
        self.name = name
        self.__size = int(size)
        self.__vec_size = int(size * (size + 1) / 2)
        self.__diag_lb = diag_lb
        assert diag_lb >= 0
        if val is None:
            self.__val = np.diag(np.full(self.__size, diag_lb + 1.0))
        else:
            self.set(val)

        # These will be dense, so just use autograd directly.
        self.free_to_vector_jac_dense = autograd.jacobian(self.free_to_vector)
    def hess(x, g):
        return np.tensordot(ad.hessian(objective)(x), g, axes=x.ndim)

    return hess
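# The closure above contracts the full Hessian with g. The same product can be computed
# without materializing the dense Hessian via autograd's hessian_vector_product; the
# objective below is a stand-in chosen only to illustrate the equivalence:
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import hessian, hessian_vector_product

objective = lambda x: np.sum(np.sin(x)) + 0.5 * np.sum(x**2)

x = npr.randn(3, 4)
g = npr.randn(3, 4)

full = np.tensordot(hessian(objective)(x), g, axes=x.ndim)  # dense Hessian, then contract
hvp = hessian_vector_product(objective)(x, g)               # same product, no dense Hessian
print(np.allclose(full, hvp))                               # expect True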
def survival_fit_weights(censored_inputs, noncensored_inputs, C, maxiter):
    """Minimize the negative log of MLE to get gamma, the covariate weight vector.
    See Eq 19 in paper.

    Parameters:
    ----------------------------
    censored_inputs: An array consisting of censored inputs of the form
        [time, prob, covariates]. E.g. [[1, 0.5, [1.4, 2.3, 5.2]], ...].
    noncensored_inputs: Same as above except these represent the noncensored rows.
        The number of covariates is the same as above, but we could have a
        different number of samples.
    maxiter: Maximum number of iterations for the numerical solver.
    C: Positive float giving the strength of the L^2 regularization parameter.

    Returns:
    ---------------------------------
    Weights: [scaling, shape, gamma], which is a flat array where gamma is the
        covariate vector weights.
    """
    n_cens = len(censored_inputs)
    n_noncens = len(noncensored_inputs)
    n_rows = n_cens + n_noncens

    def training_loss(flatparam):
        arr = flatparam[2:]  # gamma
        param = [flatparam[0], flatparam[1], arr]  # [scaling, shape, gamma]

        # Training loss is the negative log-likelihood.
        known_loss = np.log(
            np.array(mod_prob_density(noncensored_inputs, param)))  # noncensored loss term
        unknown_loss = np.log(
            np.array(mod_overall_survival(censored_inputs, param)))  # censored loss term
        reg = np.dot(np.array(arr), np.array(arr))
        return C * reg - 1 / n_rows * (np.sum(known_loss) + np.sum(unknown_loss))

    training_gradient = grad(training_loss)
    hess = hessian(training_loss)

    length = len((censored_inputs[0])[2]) + 2
    b = (0.001, None)  # Make sure that both the shape and scaling parameters are positive.
    bnds = (b, b) + tuple(
        (None, None) for x in range(length - 2)
    )  # The covariate vector components do not need to be positive.
    guess = np.random.uniform(low=0.1, high=0.9, size=length)

    res = minimize(
        training_loss,
        guess,
        method="SLSQP",
        jac=training_gradient,
        bounds=bnds,
        options={"maxiter": maxiter},
    )
    model_weights = res.x
    log_likelihood = (-n_rows) * training_loss(model_weights)
    observed_information_matrix = n_rows * hess(model_weights)
    stand_errors = np.sqrt(inv(observed_information_matrix).diagonal())

    return model_weights, stand_errors, log_likelihood