def hessian_vector_product(fun, argnum=0):
    """Builds a function that returns the exact Hessian-vector product.
    The returned function has arguments (*args, vector, **kwargs), and takes
    roughly 4x as long to evaluate as the original function."""
    fun_grad = grad(fun, argnum)
    def vector_dot_grad(*args, **kwargs):
        args, vector = args[:-1], args[-1]
        return np.dot(vector, fun_grad(*args, **kwargs))
    return grad(vector_dot_grad, argnum)  # Grad wrt original input.

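# Usage sketch for the helper above: for f(x) = 0.5 * x^T A x the Hessian is A,
# so the product should match np.dot(A, v). Assumes the same autograd imports
# the snippets in this file rely on (`import autograd.numpy as np`,
# `from autograd import grad`).
import autograd.numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 3.0]])

def f(x):
    return 0.5 * np.dot(x, np.dot(A, x))

hvp = hessian_vector_product(f)
x = np.array([1.0, -1.0])
v = np.array([0.5, 2.0])
print(np.allclose(hvp(x, v), np.dot(A, v)))  # True
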
def quick_grad_check(fun, arg0, extra_args=(), kwargs={}, verbose=True,
                     eps=EPS, rtol=RTOL, atol=ATOL, rs=None):
    """Checks the gradient of a function (w.r.t. its first arg) in a random direction."""
    if verbose:
        print("Checking gradient of {0} at {1}".format(fun, arg0))
    if rs is None:
        rs = np.random.RandomState()
    random_dir = rs.standard_normal(np.shape(arg0))
    random_dir = random_dir / np.sqrt(np.sum(random_dir * random_dir))
    unary_fun = lambda x: fun(arg0 + x * random_dir, *extra_args, **kwargs)
    numeric_grad = unary_nd(unary_fun, 0.0, eps=eps)
    analytic_grad = np.sum(grad(fun)(arg0, *extra_args, **kwargs) * random_dir)
    assert np.allclose(numeric_grad, analytic_grad, rtol=rtol, atol=atol), \
        "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    if verbose:
        print("Gradient projection OK (numeric grad: {0}, analytic grad: {1})"
              .format(numeric_grad, analytic_grad))

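# Usage sketch: EPS, RTOL, ATOL and unary_nd above are assumed to be defined in
# the surrounding utility module (older autograd releases ship this helper as
# autograd.util.quick_grad_check), so only the call site is shown here.
import autograd.numpy as np

def loss(w):
    return np.sum(np.tanh(w) ** 2)

quick_grad_check(loss, np.array([0.1, -0.3, 0.7]))
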
def check_grads(fun, *args):
    if not args:
        raise Exception("No args given")
    exact = tuple([grad(fun, i)(*args) for i in range(len(args))])
    numeric = nd(fun, *args)
    check_equivalent(exact, numeric)

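# Usage sketch: `nd` (numerical derivatives) and `check_equivalent` referenced
# above are assumed to come from the same test-utility module (older autograd
# keeps them in autograd.util), so only the call is shown.
import autograd.numpy as np

def f(x, y):
    return np.sum(np.sin(x) * y)

check_grads(f, np.array([0.1, 0.2]), np.array([1.0, -2.0]))
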
def grad_and_aux_fun(*args, **kwargs):
    saved = lambda: None
    def return_val_save_aux(*args, **kwargs):
        val, saved.aux = fun(*args, **kwargs)
        return val
    gradval = grad(return_val_save_aux, argnum)(*args, **kwargs)
    return gradval, saved.aux

def elementwise_grad(fun, argnum=0):
    """Like `jacobian`, but produces a function which computes just the diagonal
    of the Jacobian, and does the computation in one pass rather than in a loop.
    Note: this is only valid if the Jacobian is diagonal. Only arrays are
    currently supported. Can be used for broadcasting."""
    def sum_output(*args, **kwargs):
        return np.sum(fun(*args, **kwargs))
    return grad(sum_output, argnum=argnum)

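# Usage sketch: the elementwise derivative of tanh is 1 - tanh(x)**2.
import autograd.numpy as np

x = np.linspace(-2.0, 2.0, 5)
d_tanh = elementwise_grad(np.tanh)
print(np.allclose(d_tanh(x), 1.0 - np.tanh(x) ** 2))  # True
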
def grad_and_aux_fun(*args, **kwargs):
    saved_aux = []
    def return_val_save_aux(*args, **kwargs):
        val, aux = fun(*args, **kwargs)
        saved_aux.append(aux)
        return val
    gradval = grad(return_val_save_aux, argnum)(*args, **kwargs)
    return gradval, saved_aux[0]

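# The two grad_and_aux_fun closures above reference `fun` and `argnum` from an
# enclosing scope; a hedged sketch of what that enclosing wrapper presumably
# looks like (the names `grad_and_aux` and `loss_with_aux` are illustrative,
# not from the source):
import autograd.numpy as np
from autograd import grad

def grad_and_aux(fun, argnum=0):
    """Gradient of a function that returns (value, auxiliary_data)."""
    def grad_and_aux_fun(*args, **kwargs):
        saved_aux = []
        def return_val_save_aux(*args, **kwargs):
            val, aux = fun(*args, **kwargs)
            saved_aux.append(aux)
            return val
        gradval = grad(return_val_save_aux, argnum)(*args, **kwargs)
        return gradval, saved_aux[0]
    return grad_and_aux_fun

def loss_with_aux(w):
    return np.sum(w ** 2), "auxiliary info"

g, aux = grad_and_aux(loss_with_aux)(np.array([1.0, 2.0]))  # g == 2 * w
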
def _hessian_vector_product(fun, argnum=0):
    """Builds a function that returns the exact Hessian-vector product.
    The returned function has arguments (*args, vector, **kwargs). Note, this
    function will be incorporated into autograd, with name
    hessian_vector_product. Once it has been, this function can be deleted."""
    fun_grad = grad(fun, argnum)
    def vector_dot_grad(*args, **kwargs):
        args, vector = args[:-1], args[-1]
        try:
            return np.tensordot(fun_grad(*args, **kwargs), vector,
                                axes=vector.ndim)
        except AttributeError:
            # Assume we are on the product manifold.
            return np.sum([np.tensordot(fun_grad(*args, **kwargs)[k], vector[k],
                                        axes=vector[k].ndim)
                           for k in range(len(vector))])
    return grad(vector_dot_grad, argnum)  # Grad wrt original input.

def jac_fun(*args, **kwargs):
    arg_in = args[argnum]
    output = fun(*args, **kwargs)
    assert isinstance(getval(arg_in), np.ndarray), "Must have array input"
    assert isinstance(getval(output), np.ndarray), "Must have array output"
    jac = np.zeros(output.shape + arg_in.shape)
    input_slice = (slice(None),) * len(arg_in.shape)
    for idxs in it.product(*map(range, output.shape)):
        scalar_fun = lambda *args, **kwargs: fun(*args, **kwargs)[idxs]
        jac[idxs + input_slice] = grad(scalar_fun, argnum=argnum)(*args, **kwargs)
    return jac

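# This jac_fun closure references `fun`, `argnum`, `getval`, and `it`
# (itertools) from an enclosing scope; a hedged sketch of the presumed dense
# Jacobian wrapper (array-type asserts dropped for brevity) with a small usage:
import itertools as it
import autograd.numpy as np
from autograd import grad

def jacobian(fun, argnum=0):
    """Dense Jacobian, built one output element at a time."""
    def jac_fun(*args, **kwargs):
        arg_in = args[argnum]
        output = fun(*args, **kwargs)
        jac = np.zeros(output.shape + arg_in.shape)
        input_slice = (slice(None),) * len(arg_in.shape)
        for idxs in it.product(*map(range, output.shape)):
            scalar_fun = lambda *a, **kw: fun(*a, **kw)[idxs]
            jac[idxs + input_slice] = grad(scalar_fun, argnum=argnum)(*args, **kwargs)
        return jac
    return jac_fun

def f(x):
    return np.sin(x) * np.sum(x)

print(jacobian(f)(np.array([1.0, 2.0, 3.0])))  # shape (3, 3)
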
def multigrad(fun, argnums=[0]):
    """Takes gradients wrt multiple arguments simultaneously."""
    def combined_arg_fun(multi_arg, *args, **kwargs):
        extra_args_list = list(args)
        for argnum_ix, arg_ix in enumerate(argnums):
            extra_args_list[arg_ix] = multi_arg[argnum_ix]
        return fun(*extra_args_list, **kwargs)
    gradfun = grad(combined_arg_fun, argnum=0)
    def gradfun_rearranged(*args, **kwargs):
        multi_arg = tuple([args[i] for i in argnums])
        return gradfun(multi_arg, *args, **kwargs)
    return gradfun_rearranged

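# Usage sketch: gradients with respect to both positional arguments at once
# (relies on autograd being able to differentiate through a tuple argument).
import autograd.numpy as np

def f(x, y):
    return np.sum(x * y + x ** 2)

grad_xy = multigrad(f, argnums=[0, 1])
gx, gy = grad_xy(np.array([1.0, 2.0]), np.array([3.0, -1.0]))
# gx == y + 2 * x, gy == x
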
def gradfun(*args, **kwargs):
    bindings = sig.bind(*args, **kwargs)
    args   = lambda dct: tuple(dct[var_pos[0]]) if var_pos else ()
    kwargs = lambda dct: todict(dct[var_kwd[0]]) if var_kwd else {}
    others = lambda dct: tuple(dct[argname] for argname in argnames
                               if argname not in var_kwd + var_pos)
    newfun = lambda dct: fun(*(others(dct) + args(dct)), **kwargs(dct))
    argdict = apply_defaults(bindings.arguments)
    grad_dict = grad(newfun)(dict(argdict))
    return OrderedDict((argname, grad_dict[argname]) for argname in argdict)

def compute_gradient(self, objective, argument):
    """
    Compute the gradient of 'objective' with respect to the first
    argument and return as a function.
    """
    g = grad(objective)

    # Sometimes x will be some custom type, e.g. with the FixedRankEmbedded
    # manifold. Therefore cast it to a numpy.array.
    def gradient(x):
        if type(x) in (list, tuple):
            return g([np.array(xi) for xi in x])
        else:
            return g(np.array(x))
    return gradient

def compute_gradient(self, objective, argument):
    """
    Compute the gradient of 'objective' with respect to the first
    argument and return as a function.
    """
    return grad(objective)

def make_functions(X, inv_var, lam0, lam0_delta, K, K_chol, sig2_omega, sig2_mu):
    """ Make basis fitting functions
        INPUTS:
            - X          : N_spec x len(lam0) matrix of spectra
                           (missing stuff can be 0'd out)
            - inv_var    : N_spec x len(lam0) matrix of spectra inverse variances
                           (0 = infinite variance = no observation)
            - lam0       : wavelength observation locations
            - lam0_delta : wavelength observation jumps (could be inferred...)
            - K          : number of bases
            - K_chol     : cholesky decomposition of MVN covariance prior
                           for a single basis
            - sig2_omega : variance for omega (logit loadings)
            - sig2_mu    : variance for log magnitudes
        OUTPUTS:
            - parser, loss_fun, loss_grad, prior_loss, prior_loss_grad
    """
    parser = ParamParser()
    V = len(lam0)
    N = X.shape[0]
    parser.add_weights('mus', (N, 1))
    parser.add_weights('betas', (K, V))
    parser.add_weights('omegas', (N, K))

    ## weighted loss function - observations have gaussian noise
    #def loss_fun(th_vec, X, inv_var, lam0_delta, K):
    def loss_fun(th_vec, idx=None):
        """ Negative log likelihood function. The likelihood model encoded here is

                beta_k  ~ GP(0, K)
                omega_k ~ Normal(0, 1)
                mu_k    ~ Normal(0, 10)

            Normalize basis and weights so they both sum to 1:

                B_k = exp(beta_k) / sum(exp(beta_k) DeltaLam)
                w_k = exp(w_k) / sum(exp(w_i))
                m   = exp(mu_k)
                f   = m * sum_k w_k B_k

            Observations are normal about the latent spectra, with known variance:

                X_lam ~ Normal(f, var_lam)
        """
        # unpack params
        N = X.shape[0]
        mus    = parser.get(th_vec, 'mus')
        betas  = parser.get(th_vec, 'betas')
        omegas = parser.get(th_vec, 'omegas')

        # subselect for SGD
        if idx is not None:
            mus         = mus[idx]
            omegas      = omegas[idx, :]
            X_idx       = X[idx, :]
            inv_var_idx = inv_var[idx, :]
        else:
            X_idx       = X
            inv_var_idx = inv_var

        # exponentiate and normalize params
        W = np.exp(omegas)
        W = W / np.sum(W, axis=1, keepdims=True)
        B = np.exp(np.dot(K_chol, betas.T).T)
        B = B / np.sum(B * lam0_delta, axis=1, keepdims=True)
        M = np.exp(mus)
        Xtilde = np.dot(W * M, B)
        loss_mat = inv_var_idx * np.square(X_idx - Xtilde)
        return np.sum(loss_mat[~np.isnan(loss_mat)])
    loss_grad = grad(loss_fun)

    ## joint prior over parameters
    def prior_loss(th, idx=None):
        """ WHITENED SPACE PRIOR
                - th_mat     : K x (N + V) matrix holding all weights and basis params
                - N          : number of examples in training set
                - sig2_omega : prior variance on log weights
        """
        mus    = parser.get(th, 'mus')
        betas  = parser.get(th, 'betas')
        omegas = parser.get(th, 'omegas')
        if idx is not None:
            mus    = mus[idx]
            omegas = omegas[idx, :]
        loss_mus    = .5 / (sig2_mu) * np.sum(np.square(mus))
        loss_omegas = .5 / (sig2_omega) * np.sum(np.square(omegas))
        loss_betas  = .5 * np.sum(np.square(betas))
        return loss_omegas + loss_mus + loss_betas
    prior_loss_grad = grad(prior_loss)

    return parser, loss_fun, loss_grad, prior_loss, prior_loss_grad

def rgrad(cost, proj):
    """ Generates the Riemannian gradient of cost. Cost must be defined using
    autograd.numpy. """
    return lambda x: proj(x, grad(cost)(x))

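# Usage sketch on the unit sphere: with the tangent-space projection
# proj(x, g) = g - (x . g) x at a unit-norm point x, rgrad composes the
# Euclidean gradient with that projection.
import autograd.numpy as np

A = np.diag(np.array([1.0, 2.0, 3.0]))

def cost(x):
    return np.dot(x, np.dot(A, x))

proj = lambda x, g: g - np.dot(x, g) * x  # project onto the sphere's tangent space

x = np.ones(3) / np.sqrt(3.0)
print(rgrad(cost, proj)(x))  # Euclidean gradient 2*A@x projected at x
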
# The reason for the closure is so that the gradient can depend
# on both the input to the original function (x), and the output of the
# original function (ans).
def make_grad_logsumexp(ans, x):
    # If you want to be able to take higher-order derivatives, then all the
    # code inside this function must be itself differentiable by autograd.
    def gradient_product(g):
        # This closure multiplies g with the Jacobian of logsumexp (d_ans/d_x).
        # Because autograd uses reverse-mode differentiation, g contains
        # the gradient of the objective w.r.t. ans, the output of logsumexp.
        return np.full(x.shape, g) * np.exp(x - np.full(x.shape, ans))
    return gradient_product

# Now we tell autograd that logsumexp has a gradient-making function.
logsumexp.defgrad(make_grad_logsumexp)

if __name__ == '__main__':
    # Now we can use logsumexp() inside a larger function that we want
    # to differentiate.
    def example_func(y):
        z = y**2
        lse = logsumexp(z)
        return np.sum(lse)

    grad_of_example = grad(example_func)
    print("Gradient: ", grad_of_example(npr.randn(10)))

    # Check the gradients numerically, just to be safe.
    quick_grad_check(example_func, npr.randn(10))

def grad_named(fun, argname):
    '''Takes gradients with respect to a named argument.
       Doesn't work on *args or **kwargs.'''
    arg_index = getargspec(fun).args.index(argname)
    return grad(fun, arg_index)

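# Usage sketch (grad_named relies on getargspec from Python's inspect module
# being imported in its defining module):
import autograd.numpy as np

def f(scale, offset):
    return np.sum(scale * np.arange(3.0) + offset)

d_offset = grad_named(f, 'offset')
print(d_offset(2.0, 1.0))  # 3.0: each of the three terms contributes 1 to d/d_offset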