def log_diagnostics(self, paths, **kwargs):
    list_of_rewards, terminals, obs, actions, next_obs = split_paths(paths)

    returns = []
    for rewards in list_of_rewards:
        returns.append(np.sum(rewards))
    statistics = OrderedDict()
    statistics.update(create_stats_ordered_dict(
        'Undiscounted Returns',
        returns,
    ))
    statistics.update(create_stats_ordered_dict(
        'Rewards',
        list_of_rewards,
    ))
    statistics.update(create_stats_ordered_dict(
        'Actions',
        actions,
    ))

    fraction_of_time_on_platform = [o[1] for o in obs]
    statistics['Fraction of time on platform'] = np.mean(
        fraction_of_time_on_platform)

    for key, value in statistics.items():
        logger.record_tabular(key, value)
    return returns
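# Every snippet in this section funnels its diagnostics through
# `create_stats_ordered_dict`. As a reference point only, here is a minimal
# sketch of what such a helper might look like (illustrative; the actual
# library implementation may differ, e.g. in how it handles nested lists of
# per-path arrays -- this sketch assumes flat numeric input):
from collections import OrderedDict

import numpy as np


def create_stats_ordered_dict_sketch(name, data, stat_prefix=None,
                                     always_show_all_stats=True,
                                     exclude_max_min=False):
    """Summarize `data` (a scalar, array, or flat list) into named scalars."""
    if stat_prefix is not None:
        name = "{} {}".format(stat_prefix, name)
    data = np.asarray(data, dtype=float)
    if data.size == 1 and not always_show_all_stats:
        # With a single value, just report it directly (assumed behavior).
        return OrderedDict({name: float(data)})
    stats = OrderedDict([
        (name + ' Mean', float(np.mean(data))),
        (name + ' Std', float(np.std(data))),
    ])
    if not exclude_max_min:
        stats[name + ' Max'] = float(np.max(data))
        stats[name + ' Min'] = float(np.min(data))
    return stats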
def _statistics_from_paths(self, paths, stat_prefix):
    eval_replay_buffer = UpdatableSubtrajReplayBuffer(
        len(paths) * (self.max_path_length + 1),
        self.env,
        self.subtraj_length,
        self.memory_dim,
    )
    for path in paths:
        eval_replay_buffer.add_trajectory(path)
    raw_subtraj_batch = eval_replay_buffer.get_all_valid_subtrajectories()
    assert raw_subtraj_batch is not None
    subtraj_batch = create_torch_subtraj_batch(raw_subtraj_batch)
    if self.save_memory_gradients:
        subtraj_batch['memories'].requires_grad = True
    statistics = self._statistics_from_subtraj_batch(
        subtraj_batch, stat_prefix=stat_prefix
    )
    statistics.update(eval_util.get_generic_path_information(
        paths, stat_prefix="Test",
    ))
    # Split each action vector into environment actions and memory writes.
    env_actions = np.vstack([
        path["actions"][:, :self.action_dim] for path in paths
    ])
    writes = np.vstack([
        path["actions"][:, self.action_dim:] for path in paths
    ])
    statistics.update(create_stats_ordered_dict(
        'Env Actions', env_actions, stat_prefix=stat_prefix
    ))
    statistics.update(create_stats_ordered_dict(
        'Writes', writes, stat_prefix=stat_prefix
    ))
    return statistics
def debug_statistics(self):
    r"""
    Given an image $$x$$, sample a bunch of latents from the prior
    $$z_i$$ and decode them into $$\hat x_i$$.
    Compare these to $$\hat x$$, the reconstruction of $$x$$.
    Ideally:
     - All the $$\hat x_i$$s do worse than $$\hat x$$ (makes sure the VAE
       isn't ignoring the latent).
     - Some $$\hat x_i$$ do better than other $$\hat x_i$$ (tests for
       coverage).
    """
    debug_batch_size = 64
    data = self.get_batch(train=False)
    reconstructions, _, _ = self.model(data)
    img = data[0]
    recon_mse = ((reconstructions[0] - img) ** 2).mean().view(-1)
    img_repeated = img.expand((debug_batch_size, img.shape[0]))

    samples = ptu.randn(debug_batch_size, self.representation_size)
    random_imgs, _ = self.model.decode(samples)
    random_mses = (random_imgs - img_repeated) ** 2
    mse_improvement = ptu.get_numpy(random_mses.mean(dim=1) - recon_mse)
    stats = create_stats_ordered_dict(
        'debug/MSE improvement over random',
        mse_improvement,
    )
    stats.update(create_stats_ordered_dict(
        'debug/MSE of random decoding',
        ptu.get_numpy(random_mses),
    ))
    stats['debug/MSE of reconstruction'] = ptu.get_numpy(recon_mse)[0]
    return stats
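# A minimal self-contained sketch (toy data, numpy only; all names here are
# hypothetical) of the diagnostic described in the docstring above: compare
# the reconstruction error of x against decodings of latents drawn from the
# prior.
import numpy as np


def debug_statistics_sketch(img, recon, random_imgs):
    """img: (D,), recon: (D,), random_imgs: (B, D) decoded prior samples."""
    recon_mse = np.mean((recon - img) ** 2)
    random_mses = np.mean((random_imgs - img) ** 2, axis=1)
    return {
        # Positive values mean the reconstruction beats prior samples,
        # i.e. the VAE is actually using the latent code.
        'debug/MSE improvement over random': random_mses - recon_mse,
        'debug/MSE of random decoding': random_mses,
        'debug/MSE of reconstruction': recon_mse,
    }


# Example: a well-behaved model should give mostly positive improvements and
# some spread across the prior decodings (coverage).
rng = np.random.default_rng(0)
x = rng.random(16)
toy_stats = debug_statistics_sketch(
    x, x + 0.01 * rng.standard_normal(16), rng.random((8, 16))
)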
def log_diagnostics(self, paths, logger=default_logger):
    statistics = OrderedDict()
    for name_in_env_infos, name_to_log in [
        ('distance_to_target', 'Distance to Target'),
        ('speed', 'Speed'),
        ('distance_reward', 'Distance Reward'),
        ('action_reward', 'Action Reward'),
    ]:
        stats = get_stat_in_paths(paths, 'env_infos', name_in_env_infos)
        statistics.update(create_stats_ordered_dict(
            name_to_log,
            stats,
        ))
        final_stats = [s[-1] for s in stats]
        statistics.update(create_stats_ordered_dict(
            "Final " + name_to_log,
            final_stats,
            always_show_all_stats=True,
        ))
    statistics.update(create_stats_ordered_dict(
        "Path Lengths",
        get_path_lengths(paths),
    ))
    for key, value in statistics.items():
        logger.record_tabular(key, value)
def log_diagnostics(self, paths):
    final_values = []
    final_unclipped_rewards = []
    final_rewards = []
    for path in paths:
        final_value = path["actions"][-1][0]
        final_values.append(final_value)
        score = path["observations"][0][0] * final_value
        final_unclipped_rewards.append(score)
        final_rewards.append(clip_magnitude(score, 1))

    last_statistics = OrderedDict()
    last_statistics.update(create_stats_ordered_dict(
        'Final Value',
        final_values,
    ))
    last_statistics.update(create_stats_ordered_dict(
        'Unclipped Final Rewards',
        final_unclipped_rewards,
    ))
    last_statistics.update(create_stats_ordered_dict(
        'Final Rewards',
        final_rewards,
    ))
    for key, value in last_statistics.items():
        logger.record_tabular(key, value)
    return final_unclipped_rewards
def log_diagnostics(self, paths, **kwargs):
    list_of_rewards, terminals, obs, actions, next_obs = split_paths(paths)

    returns = []
    for rewards in list_of_rewards:
        returns.append(np.sum(rewards))
    last_statistics = OrderedDict()
    last_statistics.update(create_stats_ordered_dict(
        'UndiscountedReturns',
        returns,
    ))
    last_statistics.update(create_stats_ordered_dict(
        'Rewards',
        list_of_rewards,
    ))
    last_statistics.update(create_stats_ordered_dict(
        'Actions',
        actions,
    ))
    for key, value in last_statistics.items():
        logger.record_tabular(key, value)
    return returns
def _statistics_from_batch(self, batch, stat_prefix):
    statistics = OrderedDict()

    train_dict = self.get_train_dict(batch)
    for name in [
        'Policy Loss',
    ]:
        tensor = train_dict[name]
        statistics_name = "{} {} Mean".format(stat_prefix, name)
        statistics[statistics_name] = np.mean(ptu.get_numpy(tensor))

    for name in [
        'QF Outputs',
        'Policy Actions',
    ]:
        tensor = train_dict[name]
        statistics.update(create_stats_ordered_dict(
            '{} {}'.format(stat_prefix, name),
            ptu.get_numpy(tensor),
        ))

    statistics.update(create_stats_ordered_dict(
        "{} Env Actions".format(stat_prefix),
        ptu.get_numpy(batch['actions']),
    ))

    return statistics
def _do_training(self):
    tmp_batch = self.get_batch()
    random_state = tmp_batch['observations']

    losses = []
    batch = self.get_batch()
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']

    ob_deltas_pred = self.model(obs, actions)
    next_obs_pred = obs + ob_deltas_pred
    if self.vectorized:
        distance_to_random_state_pred = (
            (next_obs_pred - random_state) ** 2
        )
        distance_to_random_state = (
            (next_obs - random_state) ** 2
        )
        squared_errors = (
            distance_to_random_state_pred - distance_to_random_state
        ) ** 2
        loss = squared_errors.mean()
    else:
        distance_to_random_state_pred = (
            (next_obs_pred - random_state) ** 2
        ).sum(1, keepdim=True)
        distance_to_random_state = (
            (next_obs - random_state) ** 2
        ).sum(1, keepdim=True)
        squared_errors = (
            distance_to_random_state_pred - distance_to_random_state
        ) ** 2
        loss = squared_errors.mean()
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    losses.append(ptu.get_numpy(loss))

    if self.eval_statistics is None:
        self.eval_statistics = OrderedDict()
        self.eval_statistics.update(create_stats_ordered_dict(
            'Model Loss',
            losses,
            always_show_all_stats=True,
            exclude_max_min=True,
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Distance To Random State',
            ptu.get_numpy(distance_to_random_state),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Distance To Random State Predicted',
            ptu.get_numpy(distance_to_random_state_pred),
        ))
def _do_training(self): batch = self.get_batch() """ Optimize Critic/Actor. """ rewards = batch['rewards'] terminals = batch['terminals'] obs = batch['observations'] actions = batch['actions'] next_obs = batch['next_observations'] _, _, v_pred = self.target_policy(next_obs, None) y_target = self.reward_scale * rewards + ( 1. - terminals) * self.discount * v_pred y_target = y_target.detach() mu, y_pred, v = self.policy(obs, actions) policy_loss = self.policy_criterion(y_pred, y_target) self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() """ Update Target Networks """ if self.use_soft_update: ptu.soft_update_from_to(self.policy, self.target_policy, self.tau) else: if self._n_train_steps_total % self.target_hard_update_period == 0: ptu.copy_model_params_from_to(self.policy, self.target_policy) if self.need_to_update_eval_statistics: self.need_to_update_eval_statistics = False self.eval_statistics['Policy Loss'] = np.mean( ptu.get_numpy(policy_loss)) self.eval_statistics.update( create_stats_ordered_dict( 'Policy v', ptu.get_numpy(v), )) self.eval_statistics.update( create_stats_ordered_dict( 'Policy mu', ptu.get_numpy(mu), )) self.eval_statistics.update( create_stats_ordered_dict( 'Y targets', ptu.get_numpy(y_target), )) self.eval_statistics.update( create_stats_ordered_dict( 'Y predictions', ptu.get_numpy(y_pred), ))
def log_diagnostics(self, paths, logger=default_logger):
    lms = get_stat_in_paths(paths, 'agent_infos', 'lagrange_multiplier')
    for key, value in create_stats_ordered_dict(
        "TDM LBFGS Lagrange Multiplier",
        lms,
    ).items():
        logger.record_tabular(key, value)
def _do_training(self): batch = self.get_batch() obs = batch['observations'] actions = batch['actions'] next_obs = batch['next_observations'] """ Policy operations. """ inputs = torch.cat((obs, self.env.convert_obs_to_goals(next_obs)), dim=1) policy_actions = self.policy(inputs) policy_loss = self.policy_criterion(policy_actions, actions) """ Update Networks """ self.policy_optimizer.zero_grad() policy_loss.backward() self.policy_optimizer.step() if self.need_to_update_eval_statistics: self.need_to_update_eval_statistics = False """ This way, these statistics are only computed for one batch. """ self.eval_statistics = OrderedDict() self.eval_statistics['Policy Loss'] = np.mean( ptu.get_numpy(policy_loss)) self.eval_statistics.update( create_stats_ordered_dict( 'Policy Action', ptu.get_numpy(policy_actions), ))
def _statistics_from_batch(self, batch, stat_prefix):
    statistics = OrderedDict()

    train_dict = self.get_train_dict(batch)
    for name in [
        'QF Loss',
        'Policy Loss',
    ]:
        tensor = train_dict[name]
        statistics_name = "{} {} Mean".format(stat_prefix, name)
        statistics[statistics_name] = np.mean(ptu.get_numpy(tensor))

    for name in [
        'Bellman Errors',
        'Target Value',
        'Target Advantage',
        'Predicted Value',
        'Predicted Advantage',
        'Policy Action Value',
        'Policy Action Advantage',
    ]:
        tensor = train_dict[name]
        statistics.update(create_stats_ordered_dict(
            '{} {}'.format(stat_prefix, name),
            ptu.get_numpy(tensor),
        ))

    return statistics
def _statistics_from_subtraj_batch(self, subtraj_batch, stat_prefix=''):
    statistics = OrderedDict()

    critic_dict = self.get_critic_output_dict(subtraj_batch)
    for name, tensor in critic_dict.items():
        statistics.update(create_stats_ordered_dict(
            '{} QF {}'.format(stat_prefix, name),
            ptu.get_numpy(tensor),
        ))

    policy_dict = self.get_policy_output_dict(subtraj_batch)
    for name, tensor in policy_dict.items():
        statistics.update(create_stats_ordered_dict(
            '{} Policy {}'.format(stat_prefix, name),
            ptu.get_numpy(tensor),
        ))
    return statistics
def save_gradient_norm(gradient):
    # Backward-hook closure: `self` and `key` are captured from the
    # enclosing scope where this hook is registered.
    if self.need_to_update_eval_statistics:
        self.extra_eval_statistics.update(create_stats_ordered_dict(
            key,
            ptu.get_numpy(gradient.data.norm(p=2, dim=1)),
            always_show_all_stats=True,
        ))
def log_diagnostics(self, paths, logger=default_logger):
    statistics = OrderedDict()

    for name_in_env_infos, name_to_log in [
        ('distance_to_target', 'Distance to Target'),
        ('reward_ctrl', 'Action Reward'),
    ]:
        stat = get_stat_in_paths(paths, 'env_infos', name_in_env_infos)
        statistics.update(create_stats_ordered_dict(
            name_to_log,
            stat,
        ))
    distances = get_stat_in_paths(paths, 'env_infos', 'distance_to_target')
    statistics.update(create_stats_ordered_dict(
        "Final Distance to Target",
        [ds[-1] for ds in distances],
    ))
    for key, value in statistics.items():
        logger.record_tabular(key, value)
def log_diagnostics(self, paths, logger=default_logger):
    statistics = OrderedDict()

    for name_in_env_infos, name_to_log in [
        ('posafter', 'Position'),
        ('height', 'Height'),
        ('angle', 'Angle'),
    ]:
        stats = get_stat_in_paths(paths, 'env_infos', name_in_env_infos)
        statistics.update(create_stats_ordered_dict(
            name_to_log,
            stats,
        ))
        statistics.update(create_stats_ordered_dict(
            "Final " + name_to_log,
            [s[-1] for s in stats],
        ))
    for key, value in statistics.items():
        logger.record_tabular(key, value)
def get_diagnostics(self, paths):
    statistics = OrderedDict()
    for stat_name_in_paths, stat_name_to_print in [
        ('arm_object_distance', 'Distance hand to object'),
        ('arm_goal_distance', 'Distance hand to goal'),
    ]:
        stats = get_stat_in_paths(paths, 'env_infos', stat_name_in_paths)
        statistics.update(create_stats_ordered_dict(
            stat_name_to_print,
            stats,
            always_show_all_stats=True,
        ))
        final_stats = [s[-1] for s in stats]
        statistics.update(create_stats_ordered_dict(
            "Final " + stat_name_to_print,
            final_stats,
            always_show_all_stats=True,
        ))
    return statistics
def get_diagnostics(self):
    path_lens = [len(path['actions']) for path in self._epoch_paths]
    stats = OrderedDict([
        ('num steps total', self._num_steps_total),
        ('num paths total', self._num_paths_total),
    ])
    stats.update(create_stats_ordered_dict(
        "path length",
        path_lens,
        always_show_all_stats=True,
    ))
    return stats
def log_diagnostics(self, paths):
    statistics = OrderedDict()

    for stat_name in [
        'arm to object distance',
        'object to goal distance',
        'arm to goal distance',
    ]:
        stat = get_stat_in_paths(paths, 'env_infos', stat_name)
        statistics.update(create_stats_ordered_dict(stat_name, stat))

    for key, value in statistics.items():
        logger.record_tabular(key, value)
def log_diagnostics(self, paths):
    target_onehots = []
    for path in paths:
        first_observation = path["observations"][0][:self.n + 1]
        target_onehots.append(first_observation)

    final_predictions = []  # each element has shape (dim,)
    nonfinal_predictions = []  # each element has shape (seq_length-1, dim)
    for path in paths:
        actions = path["actions"]
        if self._softmax_action:
            actions = softmax(actions, axis=-1)
        final_predictions.append(actions[-1])
        nonfinal_predictions.append(actions[:-1])
    nonfinal_predictions_sequence_dimension_flattened = np.vstack(
        nonfinal_predictions
    )  # shape = N x dim
    nonfinal_prob_zero = [
        probs[0]
        for probs in nonfinal_predictions_sequence_dimension_flattened
    ]
    final_probs_correct = []
    for final_prediction, target_onehot in zip(final_predictions,
                                               target_onehots):
        correct_pred_idx = np.argmax(target_onehot)
        final_probs_correct.append(final_prediction[correct_pred_idx])
    final_prob_zero = [probs[0] for probs in final_predictions]

    last_statistics = OrderedDict()
    last_statistics.update(
        create_stats_ordered_dict('Final P(correct)', final_probs_correct))
    last_statistics.update(
        create_stats_ordered_dict('Non-final P(zero)', nonfinal_prob_zero))
    last_statistics.update(
        create_stats_ordered_dict('Final P(zero)', final_prob_zero))
    for key, value in last_statistics.items():
        logger.record_tabular(key, value)
    return final_probs_correct
def get_diagnostics(self):
    if self._vae_sample_probs is None or self._vae_sample_priorities is None:
        stats = create_stats_ordered_dict(
            'VAE Sample Weights',
            np.zeros(self._size),
        )
        stats.update(create_stats_ordered_dict(
            'VAE Sample Probs',
            np.zeros(self._size),
        ))
    else:
        vae_sample_priorities = self._vae_sample_priorities[:self._size]
        vae_sample_probs = self._vae_sample_probs[:self._size]
        stats = create_stats_ordered_dict(
            'VAE Sample Weights',
            vae_sample_priorities,
        )
        stats.update(create_stats_ordered_dict(
            'VAE Sample Probs',
            vae_sample_probs,
        ))
    return stats
def log_diagnostics(self, paths, logger=default_logger):
    statistics = OrderedDict()
    for stat_name_in_paths, stat_name_to_print in [
        ('hand_to_object_distance', 'Distance hand to object'),
        ('object_to_goal_distance', 'Distance object to goal'),
        ('hand_to_hand_goal_distance', 'Distance hand to hand goal'),
        ('success', 'Success (within 0.1)'),
    ]:
        stats = get_stat_in_paths(paths, 'env_infos', stat_name_in_paths)
        statistics.update(create_stats_ordered_dict(
            stat_name_to_print,
            stats,
            always_show_all_stats=True,
        ))
        final_stats = [s[-1] for s in stats]
        statistics.update(create_stats_ordered_dict(
            "Final " + stat_name_to_print,
            final_stats,
            always_show_all_stats=True,
        ))
    for key, value in statistics.items():
        logger.record_tabular(key, value)
def _statistics_from_paths(self, paths, stat_prefix):
    rewards, terminals, obs, actions, next_obs = split_paths(paths)
    np_batch = dict(
        rewards=rewards,
        terminals=terminals,
        observations=obs,
        actions=actions,
        next_observations=next_obs,
    )
    batch = np_to_pytorch_batch(np_batch)
    statistics = self._statistics_from_batch(batch, stat_prefix)
    statistics.update(create_stats_ordered_dict(
        'Num Paths', len(paths), stat_prefix=stat_prefix
    ))
    return statistics
def _do_training(self):
    if not self.vectorized:
        return DQN._do_training(self)
    batch = self.get_batch()
    rewards = batch['rewards']
    terminals = batch['terminals']
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']
    goals = batch['goals']
    num_steps_left = batch['num_steps_left']

    """
    Compute loss
    """
    target_q_values = self.target_qf(
        next_obs,
        goals,
        num_steps_left - 1,
    ).detach().max(1, keepdim=False)[0]
    y_target = self.reward_scale * rewards + (
        1. - terminals) * self.discount * target_q_values
    y_target = y_target.detach()
    # actions is a one-hot vector
    y_pred = torch.sum(
        self.qf(obs, goals, num_steps_left) * actions.unsqueeze(2),
        dim=1,
        keepdim=False,
    )
    qf_loss = self.qf_criterion(y_pred, y_target)

    """
    Update networks
    """
    self.qf_optimizer.zero_grad()
    qf_loss.backward()
    self.qf_optimizer.step()
    self._update_target_network()

    if self.need_to_update_eval_statistics:
        self.need_to_update_eval_statistics = False
        self.eval_statistics['QF Loss'] = np.mean(ptu.get_numpy(qf_loss))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Y Predictions',
            ptu.get_numpy(y_pred),
        ))
def _statistics_from_paths(self, paths, stat_prefix):
    eval_replay_buffer = SubtrajReplayBuffer(
        len(paths) * (self.max_path_length + 1),
        self.env,
        self.subtraj_length,
    )
    for path in paths:
        eval_replay_buffer.add_trajectory(path)
    raw_subtraj_batch = eval_replay_buffer.get_all_valid_subtrajectories()
    assert raw_subtraj_batch is not None
    subtraj_batch = create_torch_subtraj_batch(raw_subtraj_batch)
    statistics = self._statistics_from_batch(
        subtraj_batch, stat_prefix=stat_prefix
    )
    statistics.update(create_stats_ordered_dict(
        'Num Paths', len(paths), stat_prefix=stat_prefix
    ))
    return statistics
def train_from_torch(self, batch):
    rewards = batch['rewards']
    terminals = batch['terminals']
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']

    """
    Compute loss
    """
    best_action_idxs = self.qf(next_obs).max(1, keepdim=True)[1]
    target_q_values = self.target_qf(next_obs).gather(
        1, best_action_idxs
    ).detach()
    y_target = rewards + (1. - terminals) * self.discount * target_q_values
    y_target = y_target.detach()
    # actions is a one-hot vector
    y_pred = torch.sum(self.qf(obs) * actions, dim=1, keepdim=True)
    qf_loss = self.qf_criterion(y_pred, y_target)

    """
    Update networks
    """
    self.qf_optimizer.zero_grad()
    qf_loss.backward()
    self.qf_optimizer.step()

    """
    Soft target network updates
    """
    if self._n_train_steps_total % self.target_update_period == 0:
        ptu.soft_update_from_to(
            self.qf, self.target_qf, self.soft_target_tau
        )

    """
    Save some statistics for eval using just one batch.
    """
    if self._need_to_update_eval_statistics:
        self._need_to_update_eval_statistics = False
        self.eval_statistics['QF Loss'] = np.mean(ptu.get_numpy(qf_loss))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Y Predictions',
            ptu.get_numpy(y_pred),
        ))
def _do_training(self): batch = self.get_batch() rewards = batch['rewards'] terminals = batch['terminals'] obs = batch['observations'] actions = batch['actions'] next_obs = batch['next_observations'] """ Compute loss """ for t in range(self.max_horizon): if t == self.max_horizon - 1: q_target = self.reward_scale * rewards else: target_q_values = self.qfs[t + 1](next_obs).detach().max( 1, keepdim=True)[0] q_target = (self.reward_scale * rewards + (1. - terminals) * self.discount * target_q_values) # actions are one-hot vectors q_pred = torch.sum(self.qfs[t](obs) * actions, dim=1, keepdim=True) qf_loss = self.qf_criterion(q_pred, q_target.detach()) """ Update networks """ self.qf_optimizers[t].zero_grad() qf_loss.backward() self.qf_optimizers[t].step() """ Save some statistics for eval """ if self.need_to_update_eval_statistics: self.eval_statistics['QF {} Loss'.format(t)] = np.mean( ptu.get_numpy(qf_loss)) self.eval_statistics.update( create_stats_ordered_dict( 'Q {} Predictions'.format(t), ptu.get_numpy(q_pred), )) if self.need_to_update_eval_statistics: self.need_to_update_eval_statistics = False
def _do_training(self):
    batch = self.get_batch()
    obs = batch['observations']
    actions = batch['actions']
    num_steps_left = batch['num_steps_left']
    next_obs = batch['next_observations']

    """
    Policy operations.
    """
    policy_actions = self.policy(
        obs,
        self.env.convert_obs_to_goals(next_obs),
        num_steps_left,
        return_preactivations=False,
    )
    policy_loss = self.policy_criterion(policy_actions, actions)

    """
    Update Networks
    """
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

    if self.eval_statistics is None:
        """
        This way, these statistics are only computed for one batch.
        """
        self.eval_statistics = OrderedDict()
        self.eval_statistics['Policy Loss'] = np.mean(
            ptu.get_numpy(policy_loss))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Policy Action',
            ptu.get_numpy(policy_actions),
        ))
def _do_training(self):
    batch = self.get_batch(training=True)
    terminals = batch['terminals']
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']
    goal_differences = batch['goal_differences']
    goals = batch['goals']

    """
    Policy operations.
    """
    policy_actions = self.policy(obs)
    # future_goals_predicted = (
    #     self.env.convert_obs_to_goals(obs) + self.gcm(obs, policy_actions)
    # )
    # policy_loss = ((future_goals_predicted - goals)**2).sum(dim=1).mean()
    policy_loss = self.gcm(obs, policy_actions).sum(dim=1).mean()

    """
    GCM operations.
    """
    next_actions = self.target_policy(next_obs)
    # Speed up computation by not backpropagating through the target actions.
    # detach() returns a new tensor, so the result must be reassigned.
    next_actions = next_actions.detach()
    target_difference = self.target_gcm(
        next_obs,
        next_actions,
    )
    gcm_target = goal_differences + (1. - terminals) * target_difference
    gcm_target = gcm_target.detach()
    gcm_pred = self.gcm(obs, actions)
    bellman_errors = (gcm_pred - gcm_target) ** 2
    gcm_loss = self.gcm_criterion(gcm_pred, gcm_target)

    """
    Update Networks
    """
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

    self.gcm_optimizer.zero_grad()
    gcm_loss.backward()
    self.gcm_optimizer.step()
    self._update_target_networks()

    if self.eval_statistics is None:
        """
        Eval should set this to None.
        This way, these statistics are only computed for one batch.
        """
        self.eval_statistics = OrderedDict()
        self.eval_statistics['GCM Loss'] = np.mean(ptu.get_numpy(gcm_loss))
        self.eval_statistics['Policy Loss'] = np.mean(
            ptu.get_numpy(policy_loss))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Bellman Errors',
            ptu.get_numpy(bellman_errors),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Policy Action',
            ptu.get_numpy(policy_actions),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'GCM Predictions',
            ptu.get_numpy(gcm_pred),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'GCM Targets',
            ptu.get_numpy(gcm_target),
        ))
def _do_training(self):
    batch = self.get_batch()
    rewards = batch['rewards']
    terminals = batch['terminals']
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']
    goals = batch['goals']
    num_steps_left = batch['num_steps_left']

    q1_pred = self.qf1(
        observations=obs,
        actions=actions,
        goals=goals,
        num_steps_left=num_steps_left,
    )
    q2_pred = self.qf2(
        observations=obs,
        actions=actions,
        goals=goals,
        num_steps_left=num_steps_left,
    )
    # Make sure the policy accounts for squashing functions like tanh correctly!
    policy_outputs = self.policy(
        obs,
        goals,
        num_steps_left,
        reparameterize=self.train_policy_with_reparameterization,
        return_log_prob=True,
    )
    new_actions, policy_mean, policy_log_std, log_pi = policy_outputs[:4]
    if not self.dense_rewards and not self.dense_log_pi:
        log_pi = log_pi * terminals

    """
    QF Loss
    """
    target_v_values = self.target_vf(
        observations=next_obs,
        goals=goals,
        num_steps_left=num_steps_left - 1,
    )
    q_target = self.reward_scale * rewards + (
        1. - terminals) * self.discount * target_v_values
    q_target = q_target.detach()
    bellman_errors_1 = (q1_pred - q_target) ** 2
    bellman_errors_2 = (q2_pred - q_target) ** 2
    qf1_loss = bellman_errors_1.mean()
    qf2_loss = bellman_errors_2.mean()

    if self.use_automatic_entropy_tuning:
        """
        Alpha Loss
        """
        alpha_loss = -(
            self.log_alpha * (log_pi + self.target_entropy).detach()
        ).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        alpha = self.log_alpha.exp()
    else:
        alpha = 1

    """
    VF Loss
    """
    q1_new_actions = self.qf1(
        observations=obs,
        actions=new_actions,
        goals=goals,
        num_steps_left=num_steps_left,
    )
    q2_new_actions = self.qf2(
        observations=obs,
        actions=new_actions,
        goals=goals,
        num_steps_left=num_steps_left,
    )
    q_new_actions = torch.min(q1_new_actions, q2_new_actions)
    v_target = q_new_actions - alpha * log_pi
    v_pred = self.vf(
        observations=obs,
        goals=goals,
        num_steps_left=num_steps_left,
    )
    v_target = v_target.detach()
    bellman_errors = (v_pred - v_target) ** 2
    vf_loss = bellman_errors.mean()

    """
    Update networks
    """
    self.qf1_optimizer.zero_grad()
    qf1_loss.backward()
    self.qf1_optimizer.step()

    self.qf2_optimizer.zero_grad()
    qf2_loss.backward()
    self.qf2_optimizer.step()

    self.vf_optimizer.zero_grad()
    vf_loss.backward()
    self.vf_optimizer.step()

    """
    Policy Loss
    """
    # The paper says to do + but apparently that's a typo. Do Q - V.
    if self.train_policy_with_reparameterization:
        policy_loss = (alpha * log_pi - q_new_actions).mean()
    else:
        log_policy_target = q_new_actions - v_pred
        policy_loss = (
            log_pi * (alpha * log_pi - log_policy_target).detach()
        ).mean()
    mean_reg_loss = self.policy_mean_reg_weight * (policy_mean ** 2).mean()
    std_reg_loss = self.policy_std_reg_weight * (policy_log_std ** 2).mean()
    pre_tanh_value = policy_outputs[-1]
    pre_activation_reg_loss = self.policy_pre_activation_weight * (
        (pre_tanh_value ** 2).sum(dim=1).mean()
    )
    policy_reg_loss = mean_reg_loss + std_reg_loss + pre_activation_reg_loss
    policy_loss = policy_loss + policy_reg_loss

    if self._n_train_steps_total % self.policy_update_period == 0:
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

    if self._n_train_steps_total % self.target_update_period == 0:
        ptu.soft_update_from_to(
            self.vf, self.target_vf, self.soft_target_tau
        )

    """
    Save some statistics for eval
    """
    if self.need_to_update_eval_statistics:
        self.need_to_update_eval_statistics = False
        """
        Eval should set this to None.
        This way, these statistics are only computed for one batch.
        """
        self.eval_statistics['QF1 Loss'] = np.mean(ptu.get_numpy(qf1_loss))
        self.eval_statistics['QF2 Loss'] = np.mean(ptu.get_numpy(qf2_loss))
        self.eval_statistics['VF Loss'] = np.mean(ptu.get_numpy(vf_loss))
        self.eval_statistics['Policy Loss'] = np.mean(ptu.get_numpy(
            policy_loss
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Q1 Predictions',
            ptu.get_numpy(q1_pred),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Q2 Predictions',
            ptu.get_numpy(q2_pred),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'V Predictions',
            ptu.get_numpy(v_pred),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Log Pis',
            ptu.get_numpy(log_pi),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Policy mu',
            ptu.get_numpy(policy_mean),
        ))
        self.eval_statistics.update(create_stats_ordered_dict(
            'Policy log std',
            ptu.get_numpy(policy_log_std),
        ))
        if self.use_automatic_entropy_tuning:
            self.eval_statistics['Alpha'] = ptu.get_numpy(alpha)[0]
            self.eval_statistics['Alpha Loss'] = ptu.get_numpy(alpha_loss)[0]