def _fit_laplace_em_params_update(self,
                                  variational_posterior,
                                  datas,
                                  inputs,
                                  masks,
                                  tags,
                                  emission_optimizer,
                                  emission_optimizer_maxiter,
                                  alpha):
    # Compute necessary expectations either analytically or via samples
    continuous_samples = variational_posterior.sample_continuous_states()
    discrete_expectations = variational_posterior.discrete_expectations

    # Approximate update of initial distribution and transition params.
    # Replace the expectation wrt x with a sample from q(x). The parameter
    # update is partial and depends on alpha.
    xmasks = [np.ones_like(x, dtype=bool) for x in continuous_samples]
    for distn in [self.init_state_distn, self.transitions]:
        curr_prms = copy.deepcopy(distn.params)
        if curr_prms == tuple():
            continue
        distn.m_step(discrete_expectations,
                     continuous_samples,
                     inputs,
                     xmasks,
                     tags)
        distn.params = convex_combination(curr_prms, distn.params, alpha)

    kwargs = dict(expectations=discrete_expectations,
                  datas=continuous_samples,
                  inputs=inputs,
                  masks=xmasks,
                  tags=tags)

    exact_m_step_dynamics = [
        obs.AutoRegressiveObservations,
        obs.AutoRegressiveObservationsNoInput,
        obs.AutoRegressiveDiagonalNoiseObservations,
    ]
    if type(self.dynamics) in exact_m_step_dynamics and self.dynamics.lags == 1:
        # In this case, we can do an exact M-step on the dynamics by passing
        # in the true sufficient statistics for the continuous state.
        kwargs["continuous_expectations"] = variational_posterior.continuous_expectations
        self.dynamics.m_step(**kwargs)
    else:
        # Otherwise, do an approximate M-step by sampling.
        curr_prms = copy.deepcopy(self.dynamics.params)
        self.dynamics.m_step(**kwargs)
        self.dynamics.params = convex_combination(curr_prms, self.dynamics.params, alpha)

    # Update emissions params. This is always approximate (at least for now).
    curr_prms = copy.deepcopy(self.emissions.params)
    self.emissions.m_step(discrete_expectations,
                          continuous_samples,
                          datas,
                          inputs,
                          masks,
                          tags,
                          optimizer=emission_optimizer,
                          maxiter=emission_optimizer_maxiter)
    self.emissions.params = convex_combination(curr_prms, self.emissions.params, alpha)
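# Note: `convex_combination` above is imported from elsewhere in the library
# and is not shown in this section. The sketch below illustrates the
# leaf-wise blend it is assumed to perform; the helper name and the
# convention that `alpha` weights the new (M-step) parameters are
# assumptions for illustration, not the library's actual implementation.
from autograd.misc import flatten

def _convex_combination_sketch(curr_prms, new_prms, alpha):
    # Flatten both parameter pytrees (nested tuples of arrays) into 1D
    # vectors; `unflatten` restores the original nesting.
    flat_curr, unflatten = flatten(curr_prms)
    flat_new, _ = flatten(new_prms)
    # Take a partial step from the current parameters toward the M-step
    # optimum: (1 - alpha) * curr + alpha * new.
    return unflatten((1 - alpha) * flat_curr + alpha * flat_new)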
def _surrogate_elbo(self,
                    variational_posterior,
                    datas,
                    inputs=None,
                    masks=None,
                    tags=None,
                    alpha=0.75,
                    **kwargs):
    """
    Lower bound on the marginal likelihood p(y | gamma) using the
    variational posterior q(x; phi), where phi = variational_params
    and gamma = emission parameters. As part of computing this
    objective, we optimize q(z | x) and take a natural gradient step
    wrt theta, the parameters of the dynamics model.

    Note that the surrogate ELBO is a lower bound on the ELBO above.

        E_p(z | x, y)[log p(z, x, y)]
        = E_p(z | x, y)[log p(z, x, y) - log p(z | x, y) + log p(z | x, y)]
        = E_p(z | x, y)[log p(x, y) + log p(z | x, y)]
        = log p(x, y) + E_p(z | x, y)[log p(z | x, y)]
        = log p(x, y) - H[p(z | x, y)]
        <= log p(x, y),

    with equality only when p(z | x, y) is atomic. The gap equals the
    entropy of the posterior on z.
    """
    # log p(theta)
    elbo = self.log_prior()

    # Sample x from the variational posterior
    xs = variational_posterior.sample()

    # Inner optimization: find the true posterior p(z | x, y; theta).
    # Then maximize the inner ELBO wrt theta,
    #
    #     E_p(z | x, y; theta_fixed)[log p(z, x, y; theta)].
    #
    # This can be seen as a natural gradient step in theta space.
    # Note: we do not want to compute gradients wrt x or the emission
    # parameters backward through this optimization step, so we unbox
    # them first.
    xs_unboxed = [getval(x) for x in xs]
    emission_params_boxed = self.emissions.params
    flat_emission_params_boxed, unflatten = flatten(emission_params_boxed)
    self.emissions.params = unflatten(getval(flat_emission_params_boxed))

    # E step: compute the true posterior p(z | x, y, theta_fixed) and
    # the necessary expectations under this posterior.
    expectations = [self.expected_states(x, data, input, mask, tag)
                    for x, data, input, mask, tag
                    in zip(xs_unboxed, datas, inputs, masks, tags)]

    # M step: maximize the expected log joint wrt the parameters.
    # Note: only do a partial update toward the M step for this sample of xs.
    x_masks = [np.ones_like(x, dtype=bool) for x in xs_unboxed]
    for distn in [self.init_state_distn, self.transitions, self.dynamics]:
        curr_prms = copy.deepcopy(distn.params)
        distn.m_step(expectations, xs_unboxed, inputs, x_masks, tags, **kwargs)
        distn.params = convex_combination(curr_prms, distn.params, alpha)

    # Box up the emission parameters again before computing the ELBO
    self.emissions.params = emission_params_boxed

    # Compute the expected log likelihood E_q(z | x, y)[log p(z, x, y; theta)]
    for (Ez, Ezzp1, _), x, x_mask, data, mask, input, tag in \
            zip(expectations, xs, x_masks, datas, masks, inputs, tags):

        # Compute the expected log likelihood (inner ELBO)
        log_pi0 = self.init_state_distn.log_initial_state_distn(x, input, x_mask, tag)
        log_Ps = self.transitions.log_transition_matrices(x, input, x_mask, tag)
        log_likes = self.dynamics.log_likelihoods(x, input, x_mask, tag)
        log_likes += self.emissions.log_likelihoods(data, input, mask, tag, x)

        elbo += np.sum(Ez[0] * log_pi0)
        elbo += np.sum(Ezzp1 * log_Ps)
        elbo += np.sum(Ez * log_likes)

    # -log q(x)
    elbo -= variational_posterior.log_density(xs)
    assert np.isfinite(elbo)

    return elbo
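# Usage sketch (hypothetical, not part of the library): the unbox-and-rebox
# pattern in `_surrogate_elbo` relies on autograd's `getval` to cut the
# gradient tape. The toy below shows the same trick in isolation; the
# function `_toy_loss` and the constant 0.5 are made up for illustration.
import autograd.numpy as anp
from autograd import grad
from autograd.tracer import getval

def _toy_loss(theta):
    # Unboxing makes `inner` a constant wrt the autograd tape, exactly as
    # `xs_unboxed` and the unboxed emission parameters are treated above.
    inner = getval(anp.tanh(theta))
    # Only the direct dependence on theta remains differentiable.
    return inner * theta

if __name__ == "__main__":
    # The gradient sees only the direct theta factor: tanh(0.5) ~= 0.462,
    # not tanh(0.5) + 0.5 * (1 - tanh(0.5)**2) ~= 0.855.
    print(grad(_toy_loss)(0.5))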