def _central_difference(_, *args):
    x = args[argnum]
    args = args[:argnum] + args[argnum + 1:]
    # Why do we scale the step as x * MACHINE_EPISLON_POWER?
    # Consider a massive x, like 2**100. Then even for a simple function
    # like the identity function, (2**100 + h) - 2**100 == 0 in floating
    # point for a fixed small h, so the estimated derivative is 0 (the
    # correct answer is 1.0).
    # Another thing to consider (and later to add): x is machine
    # representable, but x + h rarely is; it gets rounded to the nearest
    # representable value. Thus (x + h) - x != h.
    delta = np.maximum(x * MACHINE_EPISLON_POWER, 1e-7)
    return unbroadcast_f(
        x,
        lambda g: g * (-new_f(x + 2 * delta, *args)
                       + 8 * new_f(x + delta, *args)
                       - 8 * new_f(x - delta, *args)
                       + new_f(x - 2 * delta, *args)) / (12 * delta),
    )
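# A minimal sketch of the scaling problem described above, assuming the
# module's MACHINE_EPISLON_POWER constant is something like eps**0.25 (its
# definition is not shown in this fragment). A fixed step of 1e-7 is
# absorbed entirely at x = 2**100, while a step proportional to x survives.
import numpy

x = 2.0**100
h_fixed = 1e-7
h_scaled = x * numpy.finfo(float).eps**0.25  # hypothetical choice of power

identity = lambda t: t
print((identity(x + h_fixed) - identity(x)) / h_fixed)    # 0.0 -- the step vanished
print((identity(x + h_scaled) - identity(x)) / h_scaled)  # ~1.0, as expected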
from __future__ import absolute_import, division

import autograd.numpy as np
import scipy.stats
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import gamma

cdf = primitive(scipy.stats.chi2.cdf)
logpdf = primitive(scipy.stats.chi2.logpdf)
pdf = primitive(scipy.stats.chi2.pdf)


def grad_chi2_logpdf(x, df):
    return np.where(df % 1 == 0, (df - x - 2) / (2 * x), 0)


defvjp(cdf,
       lambda ans, x, df: unbroadcast_f(
           x, lambda g: g * np.power(2., -df / 2) * np.exp(-x / 2)
           * np.power(x, df / 2 - 1) / gamma(df / 2)),
       argnums=[0])
defvjp(logpdf,
       lambda ans, x, df: unbroadcast_f(
           x, lambda g: g * grad_chi2_logpdf(x, df)),
       argnums=[0])
defvjp(pdf,
       lambda ans, x, df: unbroadcast_f(
           x, lambda g: g * ans * grad_chi2_logpdf(x, df)),
       argnums=[0])
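# Quick sanity check (a sketch, calling scipy directly rather than the
# primitives above): the closed-form x-gradient registered for the chi2 CDF
# is just the chi2 density itself.
import numpy
import scipy.special
import scipy.stats

x, df = 3.0, 4
closed_form = (numpy.power(2.0, -df / 2) * numpy.exp(-x / 2)
               * numpy.power(x, df / 2 - 1) / scipy.special.gamma(df / 2))
assert numpy.isclose(closed_form, scipy.stats.chi2.pdf(x, df))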
    return _central_difference


defvjp(
    gammainc,
    central_difference_of_(gammainc),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: g * np.exp(-x + np.log(x) * (a - 1) - gammaln(a))),
)
defvjp(
    gammaincc,
    central_difference_of_(gammaincc),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: -g * np.exp(-x + np.log(x) * (a - 1) - gammaln(a))),
)
defvjp(
    gammainccln,
    central_difference_of_(gammainccln),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: -g * np.exp(-x + np.log(x) * (a - 1) - gammaln(a) - ans)),
)
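# A numerical check (sketch) of the gammainccln gradient completed above:
# since ans = log Q(a, x), the log-derivative rule gives
# d/dx log Q(a, x) = -exp(-x + (a - 1) * log(x) - gammaln(a) - ans).
import numpy
from scipy.special import gammaincc as _gammaincc, gammaln as _gammaln

a, x, h = 2.5, 1.7, 1e-6
ans = numpy.log(_gammaincc(a, x))
analytic = -numpy.exp(-x + numpy.log(x) * (a - 1) - _gammaln(a) - ans)
numeric = (numpy.log(_gammaincc(a, x + h))
           - numpy.log(_gammaincc(a, x - h))) / (2 * h)
assert numpy.isclose(analytic, numeric)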
"differentiable w.r.t. a singular covariance matix") J = np.linalg.inv(cov) solved = np.matmul(J, np.expand_dims(x - mean, -1)) return 1. / 2 * (generalized_outer_product(solved) - J) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve defvjp(logpdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f( x, lambda g: -np.expand_dims(g, 1) * solve(allow_singular) (cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f( mean, lambda g: np.expand_dims(g, 1) * solve(allow_singular) (cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f( cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad( x, mean, cov, allow_singular))) # Same as log pdf, but multiplied by the pdf (ans). defvjp(pdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f( x, lambda g: -np.expand_dims(ans * g, 1) * solve(allow_singular) (cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
from __future__ import absolute_import

import scipy.special
import autograd.numpy as np
from autograd.extend import primitive, defvjp, defjvp
from autograd.numpy.numpy_vjps import unbroadcast_f, repeat_to_match_shape

### Beta function ###
beta = primitive(scipy.special.beta)
betainc = primitive(scipy.special.betainc)
betaln = primitive(scipy.special.betaln)

defvjp(beta,
       lambda ans, a, b: unbroadcast_f(a, lambda g: g * ans * (psi(a) - psi(a + b))),
       lambda ans, a, b: unbroadcast_f(b, lambda g: g * ans * (psi(b) - psi(a + b))))
defvjp(betainc,
       lambda ans, a, b, x: unbroadcast_f(
           x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)),
       argnums=[2])
defvjp(betaln,
       lambda ans, a, b: unbroadcast_f(a, lambda g: g * (psi(a) - psi(a + b))),
       lambda ans, a, b: unbroadcast_f(b, lambda g: g * (psi(b) - psi(a + b))))

### Gamma functions ###
polygamma = primitive(scipy.special.polygamma)
psi = primitive(scipy.special.psi)          # psi(x) is just polygamma(0, x)
digamma = primitive(scipy.special.digamma)  # digamma is another name for psi.
gamma = primitive(scipy.special.gamma)
gammaln = primitive(scipy.special.gammaln)
gammainc = primitive(scipy.special.gammainc)
gammaincc = primitive(scipy.special.gammaincc)
gammasgn = primitive(scipy.special.gammasgn)
rgamma = primitive(scipy.special.rgamma)
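# Derivative check (a sketch, calling scipy directly so as not to shadow the
# primitives above): d/dx betainc(a, b, x) is the Beta(a, b) density, which
# is exactly the VJP registered for argnum 2.
import numpy
import scipy.special

a, b, x, h = 2.0, 3.0, 0.4, 1e-6
analytic = x**(a - 1) * (1 - x)**(b - 1) / scipy.special.beta(a, b)
numeric = (scipy.special.betainc(a, b, x + h)
           - scipy.special.betainc(a, b, x - h)) / (2 * h)
assert numpy.isclose(analytic, numeric)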
from __future__ import absolute_import

import autograd.numpy as np
import scipy.stats
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f

cdf = primitive(scipy.stats.poisson.cdf)
logpmf = primitive(scipy.stats.poisson.logpmf)
pmf = primitive(scipy.stats.poisson.pmf)


def grad_poisson_logpmf(k, mu):
    return np.where(k % 1 == 0, k / mu - 1, 0)


defvjp(cdf,
       lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * -pmf(np.floor(k), mu)),
       argnums=[1])
defvjp(logpmf,
       lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * grad_poisson_logpmf(k, mu)),
       argnums=[1])
defvjp(pmf,
       lambda ans, k, mu: unbroadcast_f(mu, lambda g: g * ans * grad_poisson_logpmf(k, mu)),
       argnums=[1])
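# Sanity check (sketch): the mu-gradient of the Poisson log-pmf used above,
# k / mu - 1, against a central difference of scipy's implementation.
import numpy
import scipy.stats

k, mu, h = 3, 2.0, 1e-6
analytic = k / mu - 1
numeric = (scipy.stats.poisson.logpmf(k, mu + h)
           - scipy.stats.poisson.logpmf(k, mu - h)) / (2 * h)
assert numpy.isclose(analytic, numeric)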
    I wasted a few hours on this but sadly it turns out to be extremely
    slow.

    Side note 2: TensorFlow actually has a `similar bug
    <https://github.com/tensorflow/tensorflow/issues/17995>`_
    '''
    return _scipy_gammainc(k, x)


delta = 1e-6

defvjp(
    gammainc,
    lambda ans, a, x: unbroadcast_f(
        a,
        lambda g: g * (-gammainc(a + 2 * delta, x)
                       + 8 * gammainc(a + delta, x)
                       - 8 * gammainc(a - delta, x)
                       + gammainc(a - 2 * delta, x)) / (12 * delta),
    ),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: g * np.exp(-x) * np.power(x, a - 1) / gamma(a)),
)

gammaincc = primitive(_scipy_gammaincc)
defvjp(
    gammaincc,
    lambda ans, a, x: unbroadcast_f(
        a,
        lambda g: g * (-gammaincc(a + 2 * delta, x)
                       + 8 * gammaincc(a + delta, x)
                       - 8 * gammaincc(a - delta, x)
                       + gammaincc(a - 2 * delta, x)) / (12 * delta),
    ),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: -g * np.exp(-x) * np.power(x, a - 1) / gamma(a)),
)
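# The expression above is the standard five-point, fourth-order central
# difference. A quick sketch of its accuracy on a function whose derivative
# is known exactly:
import numpy

def five_point(f, x, d):
    return (-f(x + 2 * d) + 8 * f(x + d) - 8 * f(x - d) + f(x - 2 * d)) / (12 * d)

print(abs(five_point(numpy.sin, 1.0, 1e-3) - numpy.cos(1.0)))  # ~1e-13, O(d**4)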
            return mpmath.meijerg(a_s=([], [0, 0]), b_s=([s - 1, -1, -1], []), z=x)

    I wasted a few hours on this but sadly it turns out to be extremely
    slow.

    Side note 2: TensorFlow actually has a `similar bug
    <https://github.com/tensorflow/tensorflow/issues/17995>`_
    """
    return gammainc_orig(k, x)


@primitive
def gammainc2(k, x):
    return gammainc_orig(k, x)


G_EPS = 1e-8

defvjp(
    gammainc2,
    lambda ans, k, x: unbroadcast_f(
        k, lambda g: g * (gammainc_orig(k + G_EPS, x) - 2 * ans
                          + gammainc_orig(k - G_EPS, x)) / G_EPS ** 2
    ),
    # This one is the x-derivative, so we unbroadcast against x (the
    # original passed k here, which looks like a copy-paste slip).
    lambda ans, k, x: unbroadcast_f(
        x, lambda g: g * (gammainc_orig(k, x + G_EPS) - 2 * ans
                          + gammainc_orig(k, x - G_EPS)) / G_EPS ** 2
    ),
)
defvjp(
    gammainc,
    lambda ans, k, x: unbroadcast_f(
        k, lambda g: g * (gammainc2(k + G_EPS, x) - ans) / G_EPS),
    lambda ans, k, x: unbroadcast_f(
        x, lambda g: g * (gammainc2(k, x + G_EPS) - ans) / G_EPS),
)
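# Design note, as a sketch: the outer defvjp for gammainc uses forward
# differences, and gammainc2 appears to exist solely so those forward
# differences are themselves differentiable -- its VJPs return central
# second differences, so grad-of-grad yields an approximate second
# derivative instead of failing. Step size matters a lot here: with
# h = 1e-8 a second difference is dominated by rounding error, so a larger
# step is used below for illustration.
import scipy.special

k, x, h = 2.5, 1.7, 1e-4
second = (scipy.special.gammainc(k + h, x)
          - 2 * scipy.special.gammainc(k, x)
          + scipy.special.gammainc(k - h, x)) / h**2
print(second)  # approximate d^2/dk^2 gammainc(k, x)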
def grad_beta_logpdf_arg0(x, a, b):
    return (1 + a * (x - 1) + x * (b - 2)) / (x * (x - 1))


def grad_beta_logpdf_arg1(x, a, b):
    return np.log(x) - psi(a) + psi(a + b)


def grad_beta_logpdf_arg2(x, a, b):
    return np.log1p(-x) - psi(b) + psi(a + b)


defvjp(cdf,
       lambda ans, x, a, b: unbroadcast_f(
           x, lambda g: g * np.power(x, a - 1) * np.power(1 - x, b - 1) / beta(a, b)),
       argnums=[0])
defvjp(
    logpdf,
    lambda ans, x, a, b: unbroadcast_f(
        x, lambda g: g * grad_beta_logpdf_arg0(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(
        a, lambda g: g * grad_beta_logpdf_arg1(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(
        b, lambda g: g * grad_beta_logpdf_arg2(x, a, b)))
defvjp(
    pdf,
    lambda ans, x, a, b: unbroadcast_f(
        x, lambda g: g * ans * grad_beta_logpdf_arg0(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(
        a, lambda g: g * ans * grad_beta_logpdf_arg1(x, a, b)),
    lambda ans, x, a, b: unbroadcast_f(
        b, lambda g: g * ans * grad_beta_logpdf_arg2(x, a, b)))
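# Spot check (sketch): grad_beta_logpdf_arg0 is just
# (a - 1)/x - (b - 1)/(1 - x) put over the common denominator x*(x - 1);
# compare it against a central difference of scipy's logpdf.
import numpy
import scipy.stats

x, a, b, h = 0.3, 2.0, 3.0, 1e-6
analytic = (1 + a * (x - 1) + x * (b - 2)) / (x * (x - 1))
numeric = (scipy.stats.beta.logpdf(x + h, a, b)
           - scipy.stats.beta.logpdf(x - h, a, b)) / (2 * h)
assert numpy.isclose(analytic, numeric)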
def grad_tlogpdf_diff(diff, df):
    return -diff * (1.0 + df) / (diff**2 + df)


def grad_tlogpdf_x(x, df, loc, scale):
    return grad_tlogpdf_diff((x - loc) / scale, df) / scale


def grad_tlogpdf_loc(x, df, loc, scale):
    return -grad_tlogpdf_diff((x - loc) / scale, df) / scale


def grad_tlogpdf_scale(x, df, loc, scale):
    diff = x - loc
    return -(df * (scale**2 - diff**2)) / (scale * (df * scale**2 + diff**2))


def grad_tlogpdf_df(x, df, loc, scale):
    y = (x - loc) / scale
    return 0.5 * ((y**2 * (df + 1)) / (df * (y**2 + df))
                  - np.log(y**2 / df + 1) - 1.0 / df
                  - psi(df / 2.0) + psi((df + 1) / 2.0))


defvjp(pdf,
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           x, lambda g: g * ans * grad_tlogpdf_x(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           df, lambda g: g * ans * grad_tlogpdf_df(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           loc, lambda g: g * ans * grad_tlogpdf_loc(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           scale, lambda g: g * ans * grad_tlogpdf_scale(x, df, loc, scale)))

defvjp(cdf,
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           x, lambda g: g * pdf(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           loc, lambda g: -g * pdf(x, df, loc, scale)),
       argnums=(0, 2))

defvjp(logpdf,
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           x, lambda g: g * grad_tlogpdf_x(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           df, lambda g: g * grad_tlogpdf_df(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           loc, lambda g: g * grad_tlogpdf_loc(x, df, loc, scale)),
       lambda ans, x, df, loc=0.0, scale=1.0: unbroadcast_f(
           scale, lambda g: g * grad_tlogpdf_scale(x, df, loc, scale)))
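# Spot check (sketch) of grad_tlogpdf_x against a central difference of
# scipy's Student-t logpdf.
import numpy
import scipy.stats

x, df, loc, scale, h = 1.3, 5.0, 0.2, 1.5, 1e-6
y = (x - loc) / scale
analytic = (-y * (1.0 + df) / (y**2 + df)) / scale
numeric = (scipy.stats.t.logpdf(x + h, df, loc, scale)
           - scipy.stats.t.logpdf(x - h, df, loc, scale)) / (2 * h)
assert numpy.isclose(analytic, numeric)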
        # `temp` is computed in the lines elided above (the usual trick is
        # temp = x + h, so that delta below is exactly representable).
        delta = temp - x
        return unbroadcast_f(
            x,
            lambda g: g * (-new_f(x + 2 * delta, *args)
                           + 8 * new_f(x + delta, *args)
                           - 8 * new_f(x - delta, *args)
                           + new_f(x - 2 * delta, *args))
            / (12 * delta) / new_f(x, *args),
        )

    return _central_difference


defvjp(
    gammainc,
    central_difference_of_(gammainc),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: g * np.exp(-x + np.log(x) * (a - 1) - gammaln(a))),
)
defvjp(
    gammaincc,
    central_difference_of_(gammaincc),
    lambda ans, a, x: unbroadcast_f(
        x, lambda g: -g * np.exp(-x + np.log(x) * (a - 1) - gammaln(a))),
)
defvjp(
    gammaincinv,
    central_difference_of_(gammaincinv),
    lambda ans, a, y: unbroadcast_f(
        y, lambda g: g * np.exp(
            gammaincinv(a, y) - np.log(gammaincinv(a, y)) * (a - 1) + gammaln(a))),
)
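# Sketch of the representable-step trick behind `delta = temp - x`: after
# rounding, temp = x + h is a machine number, so temp - x is the step that
# was actually taken, and dividing by it removes the (x + h) - x != h error
# entirely.
import numpy

x, h = 0.1, 1e-7
temp = x + h
delta = temp - x
print(delta == h)          # usually False: h was rounded away from 1e-7
print((temp - x) / delta)  # exactly 1.0 by construction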
    if allow_singular:
        raise NotImplementedError("The multivariate normal pdf is not "
                                  "differentiable w.r.t. a singular covariance matrix")
    J = np.linalg.inv(cov)
    solved = np.matmul(J, np.expand_dims(x - mean, -1))
    return 1. / 2 * (generalized_outer_product(solved) - J)


def solve(allow_singular):
    if allow_singular:
        return lambda A, x: np.dot(np.linalg.pinv(A), x)
    else:
        return np.linalg.solve


defvjp(logpdf,
       lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
           x, lambda g: -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T),
       lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
           mean, lambda g: np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T),
       lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
           cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)))

# Same as logpdf, but multiplied by the pdf (ans).
defvjp(pdf,
       lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
           x, lambda g: -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T),
       lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
           mean, lambda g: np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T),
       lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(
           cov, lambda g: -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)))

defvjp(entropy, None,
       lambda ans, mean, cov: unbroadcast_f(
           cov, lambda g: 0.5 * g * np.linalg.inv(cov).T))
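# Sanity check (sketch): the x-gradient of the multivariate normal logpdf is
# -inv(cov) @ (x - mean), matching the solve(...) expression above.
import numpy
import scipy.stats

mean = numpy.zeros(2)
cov = numpy.array([[2.0, 0.3], [0.3, 1.0]])
x = numpy.array([0.5, -0.2])
h = 1e-6

analytic = -numpy.linalg.solve(cov, x - mean)
numeric = numpy.array([
    (scipy.stats.multivariate_normal.logpdf(x + h * e, mean, cov)
     - scipy.stats.multivariate_normal.logpdf(x - h * e, mean, cov)) / (2 * h)
    for e in numpy.eye(2)])
assert numpy.allclose(analytic, numeric)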
# -*- coding: utf-8 -*-
from __future__ import division

from scipy.stats import norm as _scipy_norm
import autograd.numpy as np
from autograd.scipy.stats import norm
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f

# TODO: next release of autograd will have this built in.
logsf = primitive(_scipy_norm.logsf)

defvjp(
    logsf,
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        x, lambda g: -g * np.exp(norm.logpdf(x, loc, scale) - logsf(x, loc, scale))),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        loc, lambda g: g * np.exp(norm.logpdf(x, loc, scale) - logsf(x, loc, scale))),
    lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
        scale, lambda g: g * np.exp(
            norm.logpdf(x, loc, scale) - logsf(x, loc, scale)) * (x - loc) / scale),
)
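# Identity behind the VJPs above (sketch): d/dx log S(x) = -pdf/sf, computed
# in log space for numerical stability; checked here against a central
# difference of scipy's logsf.
import numpy
from scipy.stats import norm as _norm

x, h = 1.2, 1e-6
analytic = -numpy.exp(_norm.logpdf(x) - _norm.logsf(x))
numeric = (_norm.logsf(x + h) - _norm.logsf(x - h)) / (2 * h)
assert numpy.isclose(analytic, numeric)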
    As a really stupid workaround, because we don't need the numbers to be
    100% exact, we just approximate the gradient.

    Side note 1: if you truly want to compute the correct derivative, see the
    `Wikipedia article about the incomplete gamma function
    <https://en.wikipedia.org/wiki/Incomplete_gamma_function#Derivatives>`_
    where the T(3, s, x) function can be implemented as

    .. code-block:: python

        def T3(s, x):
            return mpmath.meijerg(a_s=([], [0, 0]), b_s=([s - 1, -1, -1], []), z=x)

    I wasted a few hours on this but sadly it turns out to be extremely
    slow.

    Side note 2: TensorFlow actually has a `similar bug
    <https://github.com/tensorflow/tensorflow/issues/17995>`_
    '''
    return gammainc_orig(k, x)


G_EPS = 1e-6

defvjp(
    gammainc,
    lambda ans, k, x: unbroadcast_f(
        k, lambda g: g * (gammainc_orig(k + G_EPS, x) - ans) / G_EPS),
    lambda ans, k, x: unbroadcast_f(
        x, lambda g: g * (gammainc_orig(k, x + G_EPS) - ans) / G_EPS),
)
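# A sketch of what the forward difference above costs in accuracy: it is
# only first-order, so its bias is roughly G_EPS * |f''| / 2, which becomes
# visible when compared against a central difference.
import scipy.special

k, x, h = 2.5, 1.7, 1e-6
forward = (scipy.special.gammainc(k + h, x) - scipy.special.gammainc(k, x)) / h
central = (scipy.special.gammainc(k + h, x)
           - scipy.special.gammainc(k - h, x)) / (2 * h)
print(abs(forward - central))  # an estimate of the forward-difference bias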
def gammainc_vjp_arg1(ans, a, x):
    # `sign` comes from the enclosing scope (not shown in this fragment);
    # presumably +1 for the lower incomplete gamma and -1 for the upper,
    # since their x-derivatives differ only in sign.
    coeffs = sign * np.exp(-x) * np.power(x, a - 1) / gamma(a)
    return unbroadcast_f(x, lambda g: g * coeffs)
"""Gradients of the normal distribution.""" from __future__ import absolute_import import scipy.stats import autograd.numpy as anp from autograd.extend import primitive, defvjp from autograd.numpy.numpy_vjps import unbroadcast_f pdf = primitive(scipy.stats.norm.pdf) cdf = primitive(scipy.stats.norm.cdf) sf = primitive(scipy.stats.norm.sf) logpdf = primitive(scipy.stats.norm.logpdf) logcdf = primitive(scipy.stats.norm.logcdf) logsf = primitive(scipy.stats.norm.logsf) defvjp(pdf, lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f( x, lambda g: -g * ans * (x - loc) / scale**2), lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f( loc, lambda g: g * ans * (x - loc) / scale**2), lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f( scale, lambda g: g * ans * (((x - loc) / scale)**2 - 1.0) / scale)) defvjp(cdf, lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f( x, lambda g: g * pdf(x, loc, scale)), lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f( loc, lambda g: -g * pdf(x, loc, scale)), lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f( scale, lambda g: -g * pdf(x, loc, scale) * (x - loc) / scale)) defvjp(logpdf, lambda ans, x, loc=0.0, scale=1.0: unbroadcast_f(
from __future__ import absolute_import

import autograd.numpy as np
import scipy.stats
from autograd.extend import primitive, defvjp
from autograd.numpy.numpy_vjps import unbroadcast_f
from autograd.scipy.special import gamma, psi

cdf = primitive(scipy.stats.gamma.cdf)
logpdf = primitive(scipy.stats.gamma.logpdf)
pdf = primitive(scipy.stats.gamma.pdf)


def grad_gamma_logpdf_arg0(x, a):
    return (a - x - 1) / x


def grad_gamma_logpdf_arg1(x, a):
    return np.log(x) - psi(a)


defvjp(cdf,
       lambda ans, x, a: unbroadcast_f(
           x, lambda g: g * np.exp(-x) * np.power(x, a - 1) / gamma(a)),
       argnums=[0])
defvjp(logpdf,
       lambda ans, x, a: unbroadcast_f(x, lambda g: g * grad_gamma_logpdf_arg0(x, a)),
       lambda ans, x, a: unbroadcast_f(a, lambda g: g * grad_gamma_logpdf_arg1(x, a)))
defvjp(pdf,
       lambda ans, x, a: unbroadcast_f(x, lambda g: g * ans * grad_gamma_logpdf_arg0(x, a)),
       lambda ans, x, a: unbroadcast_f(a, lambda g: g * ans * grad_gamma_logpdf_arg1(x, a)))
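# Spot check (sketch): d/dx log Gamma-pdf = (a - 1)/x - 1 = (a - x - 1)/x,
# compared against a central difference of scipy's logpdf.
import numpy
import scipy.stats

x, a, h = 1.5, 2.5, 1e-6
analytic = (a - x - 1) / x
numeric = (scipy.stats.gamma.logpdf(x + h, a)
           - scipy.stats.gamma.logpdf(x - h, a)) / (2 * h)
assert numpy.isclose(analytic, numeric)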