def optimizer_step(self, sample):
    """Run one optimizer update on joint frame/reward prediction.

    Args:
        sample: tuple of (initial context observations, actions,
            ground-truth next observations, ground-truth rewards).

    Returns:
        ((normalized_frame_loss, reward_loss), (image_probs, reward_probs))
        where normalized_frame_loss is the reconstruction loss minus the
        entropy of the target frames (zero for a perfect model).
    """
    initial_context, actions, next_frames, rewards = sample

    frame_probs, reward_probs = self.model.forward_multiple(
        initial_context, actions)

    # Reward targets are encoded as bit arrays before applying the criterion.
    reward_target = numerical_reward_to_bit_array(
        rewards, self.reward_prediction_bits, self.use_cuda)
    reward_loss = self.reward_criterion(reward_probs, reward_target)

    # Frame reconstruction loss against the ground-truth next observations.
    frame_loss = self.frame_criterion(frame_probs, next_frames)

    total_loss = frame_loss + self.reward_loss_coef * reward_loss
    self.optimizer.zero_grad()
    total_loss.backward()
    self.optimizer.step()

    # The cross entropy between distributions p and q is minimized when
    # q == p, at which point it equals the entropy of p. Subtracting the
    # target entropy therefore normalizes the loss to zero at the optimum.
    target_entropy = Bernoulli(probs=next_frames).entropy()
    normalized_frame_loss = frame_loss - target_entropy.mean()

    return (normalized_frame_loss, reward_loss), (frame_probs, reward_probs)
def bald_acq(obj_samples):
    """BALD / mutual-information acquisition value for Bernoulli samples.

    obj_samples has shape num_samples x batch_shape x d_out. Returns the
    entropy of the mean prediction minus the mean entropy of the individual
    sample predictions, with the trailing d_out dimension squeezed out.
    """
    avg_probs = obj_samples.mean(dim=0)
    total_uncertainty = Bernoulli(avg_probs).entropy().squeeze(-1)
    per_sample_entropy = Bernoulli(obj_samples).entropy()
    expected_uncertainty = per_sample_entropy.mean(dim=0).squeeze(-1)
    return total_uncertainty - expected_uncertainty
def optimizer_step(self, sample):
    """Run one optimizer update for the stochastic (latent-variable) model.

    The frame objective is the negative Evidence Lower Bound (ELBO):
        -L = reconstruction_loss + KL[q(Z|X) || p(Z)],
    i.e. reconstruction plus the KL divergence from the (approximate)
    posterior to the prior over the latent variables.

    Args:
        sample: tuple of (initial context observations, actions,
            ground-truth next observations, ground-truth rewards).

    Returns:
        ((normalized_frame_loss, reward_loss), (image_probs, reward_probs))
        where normalized_frame_loss is the entropy-normalized reconstruction
        loss plus the mean KL term.
    """
    sample_observation_initial_context, sample_action_T, sample_next_observation_T, sample_reward_T = sample

    image_probs, reward_probs, \
        (total_z_mu_prior, total_z_sigma_prior, total_z_mu_posterior, total_z_sigma_posterior) \
        = self.model.forward_multiple(sample_observation_initial_context, sample_action_T)

    # Reward loss: targets are encoded as bit arrays before the criterion.
    true_reward = numerical_reward_to_bit_array(
        sample_reward_T, self.reward_prediction_bits, self.use_cuda)
    reward_loss = self.reward_criterion(reward_probs, true_reward)

    # Image reconstruction loss against the ground-truth next observations.
    reconstruction_loss = self.frame_criterion(image_probs, sample_next_observation_T)

    prior_gaussian = Normal(loc=total_z_mu_prior, scale=total_z_sigma_prior)
    posterior_gaussian = Normal(loc=total_z_mu_posterior, scale=total_z_sigma_posterior)
    # BUG FIX: the ELBO regularizer is KL(q || p) — posterior first, prior
    # second. torch's kl_divergence(p, q) computes KL(p || q), so the
    # previous call kl_divergence(prior, posterior) computed the reverse
    # direction, inconsistent with the stated objective.
    kl_div_loss = torch.distributions.kl.kl_divergence(
        posterior_gaussian, prior_gaussian)

    # Loss is the negative ELBO: reconstruction + KL[q(Z|X) || p(Z)].
    frame_loss = reconstruction_loss + kl_div_loss.mean()

    loss = frame_loss + self.reward_loss_coef * reward_loss
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # The cross entropy between p and q is minimized when q == p, at which
    # point it equals the entropy of p. Subtracting the target entropy
    # normalizes the reconstruction term to zero at the optimum.
    true_entropy = Bernoulli(probs=sample_next_observation_T).entropy()
    entropy_normalized_loss = reconstruction_loss - true_entropy.mean()
    normalized_frame_loss = entropy_normalized_loss + kl_div_loss.mean()

    return (normalized_frame_loss, reward_loss), (image_probs, reward_probs)
def bald_acq(obj_samples: torch.Tensor) -> torch.Tensor:
    """Evaluate the Mutual Information (BALD) acquisition function.

    With latent function F and a hypothetical Bernoulli observation X at a
    new point,

        I(F; X) = H(X) - H(X | F),    H(X | F) = E_f[ H(X | F = f) ],

    so we take the entropy of the mean posterior prediction and subtract
    the mean entropy over the posterior samples. Equivalent to the BALD
    acquisition of Houlsby et al., NeurIPS 2012.

    Args:
        obj_samples (torch.Tensor): Objective samples from the GP, of shape
            num_samples x batch_shape x d_out.

    Returns:
        torch.Tensor: Value of the acquisition at the samples.
    """
    marginal = Bernoulli(obj_samples.mean(dim=0))
    entropy_of_mean = marginal.entropy().squeeze(-1)
    mean_of_entropy = Bernoulli(obj_samples).entropy().mean(dim=0).squeeze(-1)
    return entropy_of_mean - mean_of_entropy