def step(self, x, grad):
    """Perform a gradient step.

    Args:
        x (tensor): Current input value. This value will be updated in-place.
        grad (tensor): Current gradient.

    Returns:
        tensor: `x` after updating `x` in-place.
    """
    if self.m is None or self.v is None:
        self.m = B.zeros(x)
        self.v = B.zeros(x)

    # Update estimates of moments.
    self.m *= self.beta1
    self.m += (1 - self.beta1) * grad
    self.v *= self.beta2
    self.v += (1 - self.beta2) * grad ** 2

    # Correct for bias of initialisation.
    m_corr = self.m / (1 - self.beta1 ** (self.i + 1))
    v_corr = self.v / (1 - self.beta2 ** (self.i + 1))

    # Perform update.
    if self.local_rates:
        denom = B.sqrt(B.mean(v_corr)) + self.epsilon
    else:
        denom = B.sqrt(v_corr) + self.epsilon
    x -= self.rate * m_corr / denom

    # Increase iteration number.
    self.i += 1

    return x
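# A minimal usage sketch for `step`, driving it as a free function with a
# namespace standing in for the optimiser object. Everything below is an
# assumption for illustration: the real class, its constructor, and its
# default hyperparameters are not shown in this snippet.
from types import SimpleNamespace

import numpy as np

opt = SimpleNamespace(
    m=None, v=None, i=0, rate=1e-2, beta1=0.9, beta2=0.999,
    epsilon=1e-8, local_rates=False,
)
x = np.random.randn(5)
for _ in range(1000):
    # Gradient of `sum(x ** 2)`, which is minimised at zero.
    x = step(opt, x, grad=2 * x)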
def test_logpdf_missing_data():
    # Setup model.
    m = 3
    noise = 1e-2
    latent_noises = 2e-2 * B.ones(m)
    kernels = [0.5 * EQ().stretch(0.75) for _ in range(m)]
    x = B.linspace(0, 10, 20)

    # Concatenate two orthogonal matrices, to make the missing data
    # approximation exact.
    u1 = B.svd(B.randn(m, m))[0]
    u2 = B.svd(B.randn(m, m))[0]
    u = Dense(B.concat(u1, u2, axis=0) / B.sqrt(2))

    s_sqrt = Diagonal(B.rand(m))

    # Construct a reference model.
    oilmm_pp = ILMMPP(kernels, u @ s_sqrt, noise, latent_noises)

    # Sample to generate test data.
    y = oilmm_pp.sample(x, latent=False)

    # Throw away data, but retain orthogonality.
    y[5:10, 3:] = np.nan
    y[10:, :3] = np.nan

    # Construct OILMM to test.
    oilmm = OILMM(kernels, u, s_sqrt, noise, latent_noises)

    # Check that evidence is still exact.
    approx(oilmm_pp.logpdf(x, y), oilmm.logpdf(x, y), atol=1e-7)
def sqrt(a: AbstractMatrix):
    if structured(a):
        warn_upmodule(
            f"Taking an element-wise square root of {a}: converting to dense.",
            category=ToDenseWarning,
        )
    return Dense(B.sqrt(B.dense(a)))
def predict(self, x, latent=False, return_variances=False):
    """Predict.

    Args:
        x (matrix): Input locations to predict at.
        latent (bool, optional): Predict noiseless processes. Defaults
            to `False`.
        return_variances (bool, optional): Return means and variances
            instead. Defaults to `False`.

    Returns:
        tuple[matrix]: Tuple containing means, lower 95% central credible
            bound, and upper 95% central credible bound if
            `return_variances` is `False`, and means and variances otherwise.
    """
    mean, var = self.model.predict(x, latent=latent, return_variances=True)

    # Pull means and variances through the mixing matrix.
    mean = B.dense(B.matmul(mean, self.h, tr_b=True))
    var = B.dense(B.matmul(var, self.h ** 2, tr_b=True))

    if not latent:
        var = var + self.noise_obs

    if return_variances:
        return mean, var
    else:
        error = 1.96 * B.sqrt(var)
        return mean, mean - error, mean + error
def predict(self, x, latent=False, return_variances=False):
    """Predict.

    Args:
        x (matrix): Input locations to predict at.
        latent (bool, optional): Predict noiseless processes. Defaults
            to `False`.
        return_variances (bool, optional): Return means and variances
            instead. Defaults to `False`.

    Returns:
        tuple: Tuple containing means, lower 95% central credible bound,
            and upper 95% central credible bound if `return_variances` is
            `False`, and means and variances otherwise.
    """
    mean = B.stack(*[B.squeeze(B.dense(f.mean(x))) for f in self.fs], axis=1)
    var = B.stack(*[B.squeeze(f.kernel.elwise(x)) for f in self.fs], axis=1)

    if not latent:
        var = var + self.noises[None, :]

    if return_variances:
        return mean, var
    else:
        error = 1.96 * B.sqrt(var)
        return mean, mean - error, mean + error
def test_jit_to_numpy(check_lazy_shapes):
    @B.jit
    def f(x):
        available = B.jit_to_numpy(~B.isnan(x))
        return B.sum(x[available])

    # Taking the square root of standard normal samples produces NaNs for
    # the negative draws, which exercises the NaN filtering above.
    x = B.sqrt(B.randn(100))
    approx(f(x), f(jnp.array(x)))
def safe_sqrt(x):
    """Perform a square root that is safe to use in AD.

    Args:
        x (tensor): Tensor to take square root of.

    Returns:
        tensor: Square root of `x`.
    """
    return B.sqrt(B.maximum(x, B.cast(B.dtype(x), 1e-30)))
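# Why the clamp in `safe_sqrt` matters: d/dx sqrt(x) = 1 / (2 sqrt(x)), which
# is infinite at zero, so reverse-mode AD through a plain square root yields
# `inf`/`nan` gradients. A minimal sketch with JAX (the choice of JAX here is
# an assumption; any AD framework behaves the same way):
import jax
import jax.numpy as jnp

print(jax.grad(jnp.sqrt)(0.0))                                   # inf
print(jax.grad(lambda x: jnp.sqrt(jnp.maximum(x, 1e-30)))(0.0))  # 0.0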
def factor_to_scale(factor):
    """Convert a factor to a length scale.

    Args:
        factor (tensor): Factor to convert to a length scale.

    Returns:
        tensor: Equivalent length scale.
    """
    return 1 / B.sqrt(4 * factor / B.pi)
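# Round-trip sanity check for `factor_to_scale`. Solving
# `scale = 1 / sqrt(4 * factor / pi)` for the factor gives
# `factor = pi / (4 * scale ** 2)`; this inverse is derived here for the
# check and is not taken from the source.
import lab as B

factor = 2.5
scale = factor_to_scale(factor)
assert abs(B.pi / (4 * scale ** 2) - factor) < 1e-12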
def plot_compare(name1, name2, y_label=True, y_ticks=True, style2=None):
    """Compare predictions for the function for two models."""
    _, mean1, var1 = preds_f[name1]
    _, mean2, var2 = preds_f[name2]

    inds = t_pred >= 150
    mean1 = mean1[inds]
    mean2 = mean2[inds]
    var1 = var1[inds]
    var2 = var2[inds]
    t = t_pred[inds]

    plt.plot(t, mean1, style="pred", label=name1.upper())
    plt.fill_between(
        t,
        mean1 - 1.96 * B.sqrt(var1),
        mean1 + 1.96 * B.sqrt(var1),
        style="pred",
    )
    plt.plot(t, mean1 - 1.96 * B.sqrt(var1), style="pred", lw=0.5)
    plt.plot(t, mean1 + 1.96 * B.sqrt(var1), style="pred", lw=0.5)

    if style2 is None:
        style2 = "pred2"

    plt.plot(t, mean2, style=style2, label=name2.upper())
    plt.fill_between(
        t,
        mean2 - 1.96 * B.sqrt(var2),
        mean2 + 1.96 * B.sqrt(var2),
        style=style2,
    )
    plt.plot(t, mean2 - 1.96 * B.sqrt(var2), style=style2, lw=0.5)
    plt.plot(t, mean2 + 1.96 * B.sqrt(var2), style=style2, lw=0.5)

    inds = t_train >= 150
    plt.scatter(
        t_train[inds],
        normaliser.untransform(y_train[inds]),
        style="train",
        label="Train",
    )
    inds = t_test >= 150
    plt.scatter(t_test[inds], y_test[inds], style="test", label="Test")

    plt.xlim(t[0], t[-1])
    plt.xlabel(f"Day of {args.year}")
    if y_label:
        plt.ylabel("Crude Oil (USD)")
    if not y_ticks:
        plt.gca().set_yticklabels([])
    tweak(legend_loc="upper right")
def root(a: B.Numeric):  # pragma: no cover
    """Compute the positive square root of a positive-definite matrix.

    Args:
        a (matrix): Matrix to compute square root of.

    Returns:
        matrix: Positive square root of `a`.
    """
    _assert_square_root(a)
    u, s, _ = B.svd(a)
    return B.mm(u, B.diag(B.sqrt(s)), u, tr_c=True)
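# Quick check that the root squares back to the original matrix (a sketch
# assuming a NumPy-backed `lab`). For a symmetric positive-definite `a`,
# the SVD coincides with the eigendecomposition, so
# `root(a) @ root(a) == u diag(s) u^T == a`:
import numpy as np

x = np.random.randn(4, 4)
a = x @ x.T + 4 * np.eye(4)  # Symmetric positive definite.
r = root(a)
assert np.allclose(r @ r, a)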
def k_h(self):
    """Get the kernel function of the filter.

    Returns:
        :class:`mlkernels.Kernel`: Kernel for :math:`h`.
    """
    # Convert `self.gamma` to a regular length scale.
    gamma_scale = B.sqrt(1 / (2 * self.gamma))
    k_h = EQ().stretch(gamma_scale)  # Kernel of filter before window
    k_h *= lambda t: B.exp(-self.alpha * t ** 2)  # Window
    if self.causal:
        k_h *= lambda t: B.cast(self.dtype, t >= 0)  # Causality constraint
    return k_h
def sample(self, x, latent=False):
    """Sample from the model.

    Args:
        x (matrix): Locations to sample at.
        latent (bool, optional): Sample noiseless processes. Defaults
            to `False`.

    Returns:
        matrix: Sample.
    """
    sample = B.dense(
        B.matmul(self.model.sample(x, latent=latent), self.h, tr_b=True)
    )
    if not latent:
        sample = sample + B.sqrt(self.noise_obs) * B.randn(sample)
    return sample
def sqrt(a: Constant):
    return Constant(B.sqrt(a.const), a.rows, a.cols)
def cholesky(a: Constant):
    _assert_square_cholesky(a)
    if a.cholesky is None:
        chol_const = B.divide(B.sqrt(a.const), B.sqrt(a.cols))
        a.cholesky = Constant(chol_const, a.rows, a.cols)
    return a.cholesky
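# Why `sqrt(const) / sqrt(cols)` is the right constant: for the n x n
# constant matrix `c * ones`, the candidate factor `L = sqrt(c / n) * ones`
# satisfies `L @ L.T = (c / n) * n * ones = c * ones`. A minimal numerical
# check of the identity (plain NumPy; the structured types are not needed):
import numpy as np

c, n = 2.0, 3
L = np.full((n, n), np.sqrt(c / n))
assert np.allclose(L @ L.T, np.full((n, n), c))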
def cholesky(a: Diagonal):
    _assert_square_cholesky(a)
    if a.cholesky is None:
        a.cholesky = Diagonal(B.sqrt(a.diag))
    return a.cholesky
def sqrt(a: UpperTriangular):
    return UpperTriangular(B.sqrt(a.mat))
def sqrt(a: Diagonal):
    return Diagonal(B.sqrt(a.diag))
def sqrt(a: Kronecker):
    return Kronecker(B.sqrt(a.left), B.sqrt(a.right))
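# The element-wise square root factorises over a Kronecker product because
# every entry of `kron(a, b)` is a product `a[i, j] * b[k, l]`, and
# `sqrt(a[i, j] * b[k, l]) = sqrt(a[i, j]) * sqrt(b[k, l])` whenever both
# factors are nonnegative. A small NumPy check of the identity:
import numpy as np

a, b = np.random.rand(2, 2), np.random.rand(3, 3)
assert np.allclose(np.sqrt(np.kron(a, b)), np.kron(np.sqrt(a), np.sqrt(b)))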
def __init__(
    self,
    scheme="structured",
    noise=5e-2,
    fix_noise=False,
    alpha=None,
    alpha_t=None,
    window=None,
    fix_window=False,
    lam=None,
    gamma=None,
    gamma_t=None,
    a=None,
    b=None,
    m_max=None,
    m_max_cap=150,
    n_z=None,
    scale=None,
    fix_scale=False,
    ms=None,
    n_u=None,
    n_u_cap=300,
    t_u=None,
    extend_t_z=None,
    t=None,
):
    AbstractGPCM.__init__(self, scheme)

    # Ensure that `t` is a vector.
    if t is not None:
        t = np.array(t)

    # Store whether to fix the length scale, window length, and noise.
    self.fix_scale = fix_scale
    self.fix_window = fix_window
    self.fix_noise = fix_noise

    # First initialise optimisable model parameters.
    if alpha is None:
        alpha = 1 / window
    if alpha_t is None:
        alpha_t = B.sqrt(2 * alpha)
    if lam is None:
        lam = 1 / scale

    self.noise = noise
    self.alpha = alpha
    self.alpha_t = alpha_t
    self.lam = lam

    # For convenience, also store the extent of the filter.
    self.extent = 4 / self.alpha

    # Then initialise fixed variables.
    if t_u is None:
        # Place inducing points until the filter is `exp(-pi) = 4.32%`.
        t_u_max = B.pi / self.alpha

        # `n_u` is required to initialise `t_u`.
        if n_u is None:
            # Set it to two inducing points per wiggle, multiplied by two to
            # account for the longer range.
            n_u = int(np.ceil(2 * 2 * window / scale))
            if n_u > n_u_cap:
                warnings.warn(
                    f"Using {n_u} inducing points for the filter, which is "
                    f"too many. It is capped to {n_u_cap}.",
                    category=UserWarning,
                )
                n_u = n_u_cap

        t_u = B.linspace(0, t_u_max, n_u)

    if n_u is None:
        n_u = B.shape(t_u)[0]

    if a is None:
        a = B.min(t) - B.max(t_u)
    if b is None:
        b = B.max(t)

    # First, try to determine `m_max` from `n_z`.
    if m_max is None and n_z is not None:
        m_max = int(np.ceil(n_z / 2))

    if m_max is None:
        freq = 1 / scale
        m_max = int(np.ceil(freq * (b - a)))
        if m_max > m_max_cap:
            warnings.warn(
                f"Using {m_max} inducing features, which is too many. "
                f"It is capped to {m_max_cap}.",
                category=UserWarning,
            )
            m_max = m_max_cap

    if ms is None:
        ms = B.range(2 * m_max + 1)

    self.a = a
    self.b = b
    self.m_max = m_max
    self.ms = ms
    self.n_z = len(ms)
    self.n_u = n_u
    self.t_u = t_u

    # Initialise dependent model parameters.
    if gamma is None:
        gamma = 1 / (2 * (self.t_u[1] - self.t_u[0]))
    if gamma_t is None:
        gamma_t = B.sqrt(2 * gamma)

    # Must ensure that `gamma < alpha`.
    self.gamma = min(gamma, self.alpha / 1.5)
    self.gamma_t = gamma_t
def sqrt(a: LowerTriangular):
    return LowerTriangular(B.sqrt(a.mat))