def _make_grad_hmm_normalizer(argnum, ans, log_pi0, log_Ps, ll):
    # Unbox the inputs if necessary
    log_pi0 = getval(log_pi0)
    log_Ps = getval(log_Ps)
    ll = getval(ll)

    # Make sure everything is C contiguous
    to_c = lambda arr: np.copy(arr, 'C') if not arr.flags['C_CONTIGUOUS'] else arr
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    dlog_pi0 = np.zeros_like(log_pi0)
    dlog_Ps = np.zeros_like(log_Ps)
    dll = np.zeros_like(ll)
    T, K = ll.shape

    # Forward pass to get alphas
    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    grad_hmm_normalizer(log_Ps, alphas, dlog_pi0, dlog_Ps, dll)

    if argnum == 0:
        return lambda g: g * dlog_pi0
    if argnum == 1:
        return lambda g: g * dlog_Ps
    if argnum == 2:
        return lambda g: g * dll
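# The VJP maker above is parameterized by `argnum` so that one function can
# provide gradients wrt all three arguments of `hmm_normalizer` (defined with
# the module header further down). The registration call itself is not
# included in these snippets; a minimal sketch of how it would typically be
# wired up with autograd's `defvjp` and `functools.partial` (an assumption,
# not the original code):
defvjp(hmm_normalizer,
       partial(_make_grad_hmm_normalizer, 0),   # gradient wrt log_pi0
       partial(_make_grad_hmm_normalizer, 1),   # gradient wrt log_Ps
       partial(_make_grad_hmm_normalizer, 2))   # gradient wrt ll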
def hmm_expected_states(log_pi0, log_Ps, ll):
    T, K = ll.shape

    # Make sure everything is C contiguous
    to_c = lambda arr: np.copy(arr, 'C') if not arr.flags['C_CONTIGUOUS'] else arr
    log_pi0 = to_c(getval(log_pi0))
    log_Ps = to_c(getval(log_Ps))
    ll = to_c(getval(ll))

    alphas = np.zeros((T, K))
    forward_pass(log_pi0, log_Ps, ll, alphas)
    normalizer = logsumexp(alphas[-1])

    betas = np.zeros((T, K))
    backward_pass(log_Ps, ll, betas)

    expected_states = alphas + betas
    expected_states -= logsumexp(expected_states, axis=1, keepdims=True)
    expected_states = np.exp(expected_states)

    expected_joints = alphas[:-1, :, None] + betas[1:, None, :] + ll[1:, None, :] + log_Ps
    expected_joints -= expected_joints.max((1, 2))[:, None, None]
    expected_joints = np.exp(expected_joints)
    expected_joints /= expected_joints.sum((1, 2))[:, None, None]

    return expected_states, expected_joints, normalizer
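# Hedged usage sketch for hmm_expected_states (not part of the original code).
# The shapes are an assumption inferred from the indexing above: ll is (T, K),
# log_pi0 is (K,), and log_Ps broadcasts against (T-1, K, K) in the joint term.
import autograd.numpy as np
import autograd.numpy.random as npr

T, K = 100, 3
log_pi0 = -np.log(K) * np.ones(K)                # uniform initial state distribution
log_Ps = np.log(np.ones((T - 1, K, K)) / K)      # uniform transition matrices
ll = npr.randn(T, K)                             # per-timestep, per-state log likelihoods
Ez, Ezzp1, logZ = hmm_expected_states(log_pi0, log_Ps, ll)
assert Ez.shape == (T, K) and Ezzp1.shape == (T - 1, K, K)
assert np.allclose(Ez.sum(axis=1), 1.0)          # state marginals normalize per timestep
assert np.allclose(Ezzp1.sum(axis=(1, 2)), 1.0)  # pairwise marginals normalize as well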
def store_args(self, params, X, Y, param_encoding_pairs):
    arg_dict = {
        'features': getval(X),
        'targets': getval(Y)}
    for param in params:
        arg_dict[param.name] = getval(param.data())
    fname = self._filename()
    np.save(fname + '_args.npy', arg_dict)
    with open(fname + '_args.txt', 'w') as f:
        self._write_meta(f)
        for param, encoding in param_encoding_pairs:
            f.write(param_to_pretty_string(param, encoding) + '\n')
def forward(self, X1, X2):
    d1 = self.kernel1.dimension
    X1_shape = getval(X1.shape)
    X2_shape = getval(X2.shape)
    X1_1 = anp.take(X1, range(0, d1), axis=1)
    X1_2 = anp.take(X1, range(d1, X1_shape[1]), axis=1)
    X2_1 = anp.take(X2, range(0, d1), axis=1)
    X2_2 = anp.take(X2, range(d1, X2_shape[1]), axis=1)
    kmat1 = self.kernel1(X1_1, X2_1)
    kmat2 = self.kernel2(X1_2, X2_2)
    return kmat1 * kmat2
def sample_posterior_joint(features, mean, kernel, chol_fact, pred_mat,
                           test_features, num_samples=1):
    """
    Draws num_samples samples from the joint posterior distribution over
    inputs test_features. This is done by computing the mean and covariance
    matrix of this posterior, and using the Cholesky decomposition of the
    latter. If pred_mat is a matrix with m columns, the samples returned have
    shape (n_test, m, num_samples).

    :param features: Training inputs
    :param mean: Mean function
    :param kernel: Kernel function
    :param chol_fact: Part L of posterior state
    :param pred_mat: Part P of posterior state
    :param test_features: Test inputs
    :param num_samples: Number of samples to draw
    :return: Samples, shape (n_test, num_samples) or (n_test, m, num_samples)
    """
    k_tr_te = kernel(features, test_features)
    linv_k_tr_te = aspl.solve_triangular(chol_fact, k_tr_te, lower=True)
    posterior_mean = anp.matmul(anp.transpose(linv_k_tr_te), pred_mat) + \
        anp.reshape(mean(test_features), (-1, 1))
    posterior_cov = kernel(test_features, test_features) - anp.dot(
        anp.transpose(linv_k_tr_te), linv_k_tr_te)
    jitter_init = anp.ones((1,)) * (1e-5)
    sys_mat = AddJitterOp(
        flatten_and_concat(posterior_cov, jitter_init),
        initial_jitter_factor=NOISE_VARIANCE_LOWER_BOUND)
    lfact = cholesky_factorization(sys_mat)
    # Draw samples
    # posterior_mean.shape = (n_test, m), where m is number of cols of pred_mat
    # Reshape to (n_test, m, 1)
    n_test = getval(posterior_mean.shape)[0]
    posterior_mean = anp.expand_dims(posterior_mean, axis=-1)
    n01_vecs = [
        anp.random.normal(size=getval(posterior_mean.shape))
        for _ in range(num_samples)]
    n01_mat = anp.reshape(anp.concatenate(n01_vecs, axis=-1), (n_test, -1))
    samples = anp.reshape(anp.dot(lfact, n01_mat), (n_test, -1, num_samples))
    samples = samples + posterior_mean
    if samples.shape[1] == 1:
        samples = anp.reshape(samples, (n_test, -1))  # (n_test, num_samples)
    return samples
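# The function above draws correlated samples by applying the Cholesky factor
# of the posterior covariance to i.i.d. standard normal draws. A minimal
# plain-numpy illustration of that rule, mu + chol(Sigma) @ z with z ~ N(0, I),
# on made-up data (not the original API):
import numpy as np

rng = np.random.RandomState(0)
mu = np.array([0.0, 1.0, -1.0])
Sigma = np.array([[1.0, 0.5, 0.2],
                  [0.5, 1.0, 0.3],
                  [0.2, 0.3, 1.0]])
L = np.linalg.cholesky(Sigma)
z = rng.normal(size=(3, 100000))
samples = mu[:, None] + L @ z
# The empirical covariance of the samples approaches Sigma
assert np.allclose(np.cov(samples), Sigma, atol=0.05)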
def cost(self, x, u, u_last, a):
    c = 0.
    if self.slew_rate:
        c += (u - u_last).T @ np.diag(self.uw) @ (u - u_last)
    else:
        c += u.T @ np.diag(self.uw) @ u
    if a:
        y = np.hstack((wrap_angle(x[0]), x[1])) if self.periodic else x
        J, j = self.features_jacobian(getval(y))
        z = J(getval(y)) @ y + j
        c += a * (z - self.g).T @ np.diag(self.gw) @ (z - self.g)
    return c
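# The cost above linearizes a feature map around the current point: getval
# detaches the linearization point from autograd, and features_jacobian
# returns a Jacobian function J and offset j such that J(y0) @ y + j matches
# the features at y0 to first order. The helper is not shown in these
# snippets; a minimal sketch of what such a helper could look like
# (hypothetical code, an assumption about the interface, not the original
# implementation):
from autograd import jacobian
import autograd.numpy as np

def features_jacobian(features, x0):
    J = jacobian(features)
    j = features(x0) - J(x0) @ x0   # offset so that J(x0) @ x0 + j == features(x0)
    return J, j

# Tiny usage example with a quadratic feature map
features = lambda x: np.array([x[0], x[1], x[0] * x[1]])
x0 = np.array([1.0, 2.0])
J, j = features_jacobian(features, x0)
assert np.allclose(J(x0) @ x0 + j, features(x0))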
def sample_and_cholesky_update(features, chol_fact, pred_mat, mean, kernel,
                               noise_variance, feature):
    # Draw sample target. Also, lvec is reused below
    lvec = _compute_lvec(features, chol_fact, kernel, feature)
    pred_mean = anp.dot(lvec, pred_mat) + anp.reshape(mean(feature), (1, 1))
    # Note: We do not add noise_variance to the predictive variance
    pred_std = anp.reshape(
        anp.sqrt(
            anp.maximum(
                kernel.diagonal(feature) - anp.sum(anp.square(lvec)),
                MIN_POSTERIOR_VARIANCE)), (1, 1))
    n01mat = anp.random.normal(size=getval(pred_mean.shape))
    target = pred_mean + anp.multiply(n01mat, pred_std)
    chol_fact_new, pred_mat_new = cholesky_update(
        features, chol_fact, pred_mat, mean, kernel, noise_variance, feature,
        target, lvec=lvec)
    features_new = anp.concatenate([features, feature], axis=0)
    return chol_fact_new, pred_mat_new, features_new, target
def diagonal(self, X):
    X = self._check_input_shape(X)
    covariance_scale = self._covariance_scale()
    covariance_scale_times_ones = anp.multiply(
        anp.ones((getval(X.shape[0]), 1)), covariance_scale)
    return anp.reshape(covariance_scale_times_ones, (-1,))
def diagonal(self, X):
    d1 = self.kernel1.dimension
    X_shape = getval(X.shape)
    X1 = anp.take(X, range(0, d1), axis=1)
    X2 = anp.take(X, range(d1, X_shape[1]), axis=1)
    diag1 = self.kernel1.diagonal(X1)
    diag2 = self.kernel2.diagonal(X2)
    return diag1 * diag2
def store_value(self, value):
    fname = self._filename()
    with open(fname + '_value.txt', 'w') as f:
        f.write('value = {}\n'.format(getval(value)))
        self._write_meta(f)
    # Advance counters
    self.global_counter += 1
    self.local_counter += 1
def forward(self, X):
    """
    Actual computation of the scalar mean function.

    We compute mean_value * vector_of_ones, whose length is given by the
    number of rows of X.

    :param X: input data of size (n, d) for which we want to compute the
        mean (here, only used to extract the right dimension)
    """
    mean_value = encode_unwrap_parameter(
        self.mean_value_internal, self.encoding)
    return anp.multiply(anp.ones((getval(X.shape[0]), 1)), mean_value)
def negative_log_marginal_likelihood(chol_fact, pred_mat):
    """
    The marginal likelihood is only computed if pred_mat has a single column
    (not for the fantasy sample case).
    """
    assert pred_mat.ndim == 1 or pred_mat.shape[1] == 1, \
        "Multiple target vectors are not supported"
    sqnorm_predmat = anp.sum(anp.square(pred_mat))
    logdet_cholfact = 2.0 * anp.sum(anp.log(anp.abs(anp.diag(chol_fact))))
    n_samples = getval(pred_mat.size)
    return 0.5 * (sqnorm_predmat + n_samples * anp.log(2 * anp.pi) +
                  logdet_cholfact)
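# A small plain-numpy check of the identities this function relies on (made-up
# data, not the original API): with L L^T = K + sigma^2 I and
# pred_mat = L^{-1} (y - m), the quadratic term is ||pred_mat||^2 and the
# log-determinant is 2 * sum(log(diag(L))).
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.RandomState(0)
n = 5
y = rng.randn(n, 1)
A = rng.randn(n, n)
sys_mat = A @ A.T + np.eye(n)                  # stands in for k(X, X) + sigma^2 * I
chol_fact = np.linalg.cholesky(sys_mat)
pred_mat = solve_triangular(chol_fact, y, lower=True)  # L^{-1} (y - m), mean taken as 0

nll = 0.5 * (np.sum(pred_mat ** 2) + n * np.log(2 * np.pi)
             + 2.0 * np.sum(np.log(np.diag(chol_fact))))
direct = 0.5 * ((y.T @ np.linalg.solve(sys_mat, y)).item()
                + n * np.log(2 * np.pi) + np.linalg.slogdet(sys_mat)[1])
assert np.isclose(nll, direct)                 # matches -log N(y | 0, K + sigma^2 I)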
def cholesky_update(features, chol_fact, pred_mat, mean, kernel,
                    noise_variance, feature, target, lvec=None):
    """
    Incremental update of posterior state (Cholesky factor, prediction
    matrix), given one datapoint (feature, target).

    Note: noise_variance is the initial value, before any jitter may have
    been added to compute chol_fact. Here, we add the minimum amount of
    jitter such that the new diagonal entry of the Cholesky factor is
    >= MIN_CHOLESKY_DIAGONAL_VALUE. This means that if cholesky_update is
    used several times, we in fact add a diagonal (but not spherical) jitter
    matrix.

    :param features: Shape (n, d)
    :param chol_fact: Shape (n, n)
    :param pred_mat: Shape (n, m)
    :param mean:
    :param kernel:
    :param noise_variance:
    :param feature: Shape (1, d)
    :param target: Shape (1, m)
    :param lvec: If given, this is the new column of the Cholesky factor,
        except for the diagonal entry. If not, it is computed here
    :return: chol_fact_new (n+1, n+1), pred_mat_new (n+1, m)
    """
    if lvec is None:
        lvec = _compute_lvec(features, chol_fact, kernel, feature)
    kscal = anp.reshape(kernel.diagonal(feature), (1,))
    noise_variance = anp.reshape(noise_variance, (1,))
    lsqscal = anp.maximum(
        kscal + noise_variance - anp.sum(anp.square(lvec)),
        MIN_CHOLESKY_DIAGONAL_VALUE ** 2)
    lscal = anp.reshape(anp.sqrt(lsqscal), (1, 1))
    mscal = anp.reshape(mean(feature), (1, 1))
    pvec = target - mscal
    pvec = anp.divide(pvec - anp.matmul(lvec, pred_mat), lscal)
    pred_mat_new = anp.concatenate([pred_mat, pvec], axis=0)
    zerovec = anp.zeros((getval(lvec.size), 1))
    chol_fact_new = anp.concatenate([
        anp.concatenate([chol_fact, lvec], axis=0),
        anp.concatenate([zerovec, lscal], axis=0)], axis=1)
    return chol_fact_new, pred_mat_new
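# Small plain-numpy check of the block update used above (made-up data, not
# the original API): if L is the Cholesky factor of K + sigma^2 I on n points,
# the factor on n+1 points is obtained by appending the row
# [lvec^T, sqrt(kss - ||lvec||^2)], with lvec = L^{-1} k(X, x*) and
# kss = k(x*, x*) + sigma^2.
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.RandomState(0)
n = 4
A = rng.randn(n + 1, n + 1)
big = A @ A.T + np.eye(n + 1)                  # k(X+, X+) + sigma^2 I on n+1 points
K, kvec, kss = big[:n, :n], big[:n, n:], big[n, n]

L = np.linalg.cholesky(K)
lvec = solve_triangular(L, kvec, lower=True)   # new off-diagonal column, shape (n, 1)
lscal = np.sqrt(kss - np.sum(lvec ** 2))       # new diagonal entry
top = np.concatenate([L, np.zeros((n, 1))], axis=1)
bottom = np.concatenate([lvec.T, np.full((1, 1), lscal)], axis=1)
L_new = np.concatenate([top, bottom], axis=0)
assert np.allclose(L_new, np.linalg.cholesky(big))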
def sample_posterior_marginals(features, mean, kernel, chol_fact, pred_mat,
                               test_features, num_samples=1):
    """
    Draws num_samples samples from the product of marginals of the posterior
    over input points test_features. If pred_mat is a matrix with m columns,
    the samples returned have shape (n_test, m, num_samples).

    :param features: Training inputs
    :param mean: Mean function
    :param kernel: Kernel function
    :param chol_fact: Part L of posterior state
    :param pred_mat: Part P of posterior state
    :param test_features: Test inputs
    :param num_samples: Number of samples to draw
    :return: Samples, shape (n_test, num_samples) or (n_test, m, num_samples)
    """
    post_means, post_vars = predict_posterior_marginals(
        features, mean, kernel, chol_fact, pred_mat, test_features)
    post_means = anp.expand_dims(post_means, axis=-1)  # (n_test, m, 1)
    post_stds = anp.sqrt(anp.reshape(post_vars, (-1, 1, 1)))  # (n_test, 1, 1)
    n01_vecs = [
        anp.random.normal(size=getval(post_means.shape))
        for _ in range(num_samples)]
    n01_mat = anp.concatenate(n01_vecs, axis=-1)
    samples = anp.multiply(n01_mat, post_stds) + post_means
    if samples.shape[1] == 1:
        n_test = getval(samples.shape)[0]
        samples = anp.reshape(samples, (n_test, -1))  # (n_test, num_samples)
    return samples
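# In contrast to sample_posterior_joint above, each test point is sampled
# independently here: mu_i + std_i * z with z ~ N(0, 1), so no Cholesky factor
# of the full test covariance is needed. Minimal plain-numpy illustration on
# made-up values (not the original API):
import numpy as np

rng = np.random.RandomState(0)
post_means = np.array([0.0, 1.0, -1.0])        # (n_test,)
post_stds = np.array([0.5, 1.0, 2.0])          # (n_test,)
samples = post_means[:, None] + post_stds[:, None] * rng.normal(size=(3, 4))
assert samples.shape == (3, 4)                 # (n_test, num_samples)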
def forward(self, features, param_internal):
    """Returns constant positive vector

    If features.shape = (n, d), the shape of the vector returned is (d, 1)
    if size_cols = True, (n, 1) otherwise.

    :param features: Matrix for shape, dtype, ctx
    :param param_internal: Unwrapped parameter
    :return: Constant positive vector
    """
    # Shape, dtype, ctx is determined by extracting column or row from
    # features, then use ones_like
    axis = 0 if self.size_cols else 1
    ones_vec = anp.ones((features.size // getval(features.shape)[axis], 1))
    param = anp.reshape(self.encoding.get(param_internal), (1, 1))
    return anp.multiply(ones_vec, param)
def param_to_pretty_string(gluon_param, encoding):
    """
    Take a gluon parameter and transform it to a string amenable to plotting.
    If need be, the gluon parameter is appropriately encoded (e.g., log-exp
    transform).

    :param gluon_param: gluon parameter
    :param encoding: object in charge of encoding/decoding the gluon_param
    """
    assert isinstance(gluon_param, Parameter)
    assert encoding is not None, \
        "encoding of param {} should not be None".format(gluon_param.name)
    param_as_numpy = encoding.get(getval(gluon_param.data()))
    return "{}: {}".format(
        gluon_param.name,
        ";".join("{:.6f}".format(value) for value in param_as_numpy))
def __init__(self, features: np.ndarray, targets: Optional[np.ndarray],
             mean: MeanFunction, kernel: KernelFunction,
             noise_variance: np.ndarray, debug_log: bool = False,
             test_intermediates: Optional[dict] = None, **kwargs):
    """
    If targets has m > 1 columns, they correspond to fantasy samples.

    If targets is None, this is an internal (copy) constructor, where
    kwargs contains chol_fact, pred_mat.

    :param features: Input points X, shape (n, d)
    :param targets: Targets Y, shape (n, m)
    :param mean: Mean function m(X)
    :param kernel: Kernel function k(X, X')
    :param noise_variance: Noise variance sigsq, shape (1,)
    :param test_intermediates: See cholesky_computations
    """
    self.mean = mean
    self.kernel = kernel
    self.noise_variance = anp.array(noise_variance, copy=True)
    if targets is not None:
        targets_shape = getval(targets.shape)
        targets = anp.reshape(targets, (targets_shape[0], -1))
        chol_fact, pred_mat = cholesky_computations(
            features, targets, mean, kernel, noise_variance,
            debug_log=debug_log, test_intermediates=test_intermediates)
        self.features = anp.array(features, copy=True)
        self.chol_fact = chol_fact
        self.pred_mat = pred_mat
        self._test_intermediates = test_intermediates
    else:
        # Internal (copy) constructor
        self.features = features
        self.chol_fact = kwargs['chol_fact']
        self.pred_mat = kwargs['pred_mat']
def _compute_terms(self, X, alpha, mean_lam, gamma, delta, ret_mean=False):
    dim = self.kernel_x.dimension
    X_shape = getval(X.shape)
    cfg = anp.take(X, range(0, dim), axis=1)
    res = anp.take(X, range(dim, X_shape[1]), axis=1)
    kappa = self._compute_kappa(res, alpha, mean_lam)
    kr_pref = anp.reshape(gamma, (1, 1))
    if ret_mean or (self.encoding_delta is not None) or delta > 0.0:
        mean = self.mean_x(cfg)
    else:
        mean = None
    if self.encoding_delta is not None:
        kr_pref = anp.subtract(kr_pref, anp.multiply(delta, mean))
    elif delta > 0.0:
        kr_pref = anp.subtract(kr_pref, mean * delta)
    return cfg, res, kappa, kr_pref, mean
def _diagonal_normal_policy(state):
    return rng.normal(getval(policy(state)), scale)
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd.scipy.misc import logsumexp
from autograd.scipy.linalg import cholesky_banded, solve_banded, solveh_banded
from autograd.extend import primitive, defvjp
from autograd.tracer import getval
from functools import partial

from ssm.cstats import _blocks_to_bands_lower, _blocks_to_bands_upper, \
    _bands_to_blocks_lower, _bands_to_blocks_upper, \
    _transpose_banded, vjp_cholesky_banded_lower, \
    _vjp_solve_banded_A, _vjp_solveh_banded_A
from ssm.messages import forward_pass, backward_pass, backward_sample, grad_hmm_normalizer

to_c = lambda arr: np.copy(getval(arr), 'C') if not arr.flags['C_CONTIGUOUS'] else getval(arr)


@primitive
def hmm_normalizer(log_pi0, log_Ps, ll):
    T, K = ll.shape
    alphas = np.zeros((T, K))

    # Make sure everything is C contiguous
    log_pi0 = to_c(log_pi0)
    log_Ps = to_c(log_Ps)
    ll = to_c(ll)

    forward_pass(log_pi0, log_Ps, ll, alphas)
    return logsumexp(alphas[-1])
def cost(self, x, u, u_lst):
    _J, _j = self.features_jacobian(getval(x))
    _x = _J(getval(x)) @ x + _j
    return self.dt * (
        (_x - self._g).T @ np.diag(self._gw) @ (_x - self._g)
        + (u - u_lst).T @ np.diag(self._uw) @ (u - u_lst))
def cost(self, x, u, a):
    _J, _j = self.features_jacobian(getval(x))
    _x = _J(getval(x)) @ x + _j
    return a * (_x - self._g).T @ np.diag(self._gw) @ (_x - self._g) \
        + u.T @ np.diag(self._uw) @ u
def _check_input_shape(self, X):
    return anp.reshape(X, (getval(X.shape[0]), self._dimension))
def forward(self, X):
    return anp.zeros((getval(X.shape[0]), 1))
def _surrogate_elbo(self, variational_posterior, datas, inputs=None,
                    masks=None, tags=None, alpha=0.75, **kwargs):
    """
    Lower bound on the marginal likelihood p(y | gamma) using the
    variational posterior q(x; phi), where phi = variational_params and
    gamma = emission parameters. As part of computing this objective, we
    optimize q(z | x) and take a natural gradient step wrt theta, the
    parameters of the dynamics model.

    Note that the surrogate ELBO is a lower bound on the ELBO above:

    E_p(z | x, y)[log p(z, x, y)]
        = E_p(z | x, y)[log p(z, x, y) - log p(z | x, y) + log p(z | x, y)]
        = E_p(z | x, y)[log p(x, y) + log p(z | x, y)]
        = log p(x, y) + E_p(z | x, y)[log p(z | x, y)]
        = log p(x, y) - H[p(z | x, y)]
        <= log p(x, y),

    with equality only when p(z | x, y) is atomic.
    """
    # log p(theta)
    elbo = self.log_prior()

    # Sample x from the variational posterior
    xs = variational_posterior.sample()

    # Inner optimization: find the true posterior p(z | x, y; theta).
    # Then maximize the inner ELBO wrt theta,
    #
    #     E_p(z | x, y; theta_fixed)[log p(z, x, y; theta)].
    #
    # This can be seen as a natural gradient step in theta space.
    # Note: we do not want to compute gradients wrt x or the emission
    # parameters backward through this optimization step, so we unbox
    # them first.
    xs_unboxed = [getval(x) for x in xs]
    emission_params_boxed = self.emissions.params
    flat_emission_params_boxed, unflatten = flatten(emission_params_boxed)
    self.emissions.params = unflatten(getval(flat_emission_params_boxed))

    # E step: compute the true posterior p(z | x, y, theta_fixed) and
    # the necessary expectations under this posterior.
    expectations = [
        self.expected_states(x, data, input, mask, tag)
        for x, data, input, mask, tag in zip(
            xs_unboxed, datas, inputs, masks, tags)
    ]

    # M step: maximize expected log joint wrt parameters
    # Note: Only do a partial update toward the M step for this sample of xs
    x_masks = [np.ones_like(x, dtype=bool) for x in xs_unboxed]
    for distn in [self.init_state_distn, self.transitions, self.dynamics]:
        curr_prms = copy.deepcopy(distn.params)
        distn.m_step(expectations, xs_unboxed, inputs, x_masks, tags, **kwargs)
        distn.params = convex_combination(curr_prms, distn.params, alpha)

    # Box up the emission parameters again before computing the ELBO
    self.emissions.params = emission_params_boxed

    # Compute expected log likelihood E_q(z | x, y) [log p(z, x, y; theta)]
    for (Ez, Ezzp1, _), x, x_mask, data, mask, input, tag in \
            zip(expectations, xs, x_masks, datas, masks, inputs, tags):

        # Compute expected log likelihood (inner ELBO)
        log_pi0 = self.init_state_distn.log_initial_state_distn(
            x, input, x_mask, tag)
        log_Ps = self.transitions.log_transition_matrices(
            x, input, x_mask, tag)
        log_likes = self.dynamics.log_likelihoods(x, input, x_mask, tag)
        log_likes += self.emissions.log_likelihoods(data, input, mask, tag, x)

        elbo += np.sum(Ez[0] * log_pi0)
        elbo += np.sum(Ezzp1 * log_Ps)
        elbo += np.sum(Ez * log_likes)

    # -log q(x)
    elbo -= variational_posterior.log_density(xs)
    assert np.isfinite(elbo)

    return elbo