def get_value_estimates(
    self, next_obs: List[np.ndarray], agent_id: str, done: bool
) -> Dict[str, float]:
    """
    Generates value estimates for bootstrapping.
    :param next_obs: Observations from the next (final) step, used for bootstrapping.
    :param agent_id: ID of the agent whose memories and previous actions are retrieved.
    :param done: Whether or not this is the last element of the episode,
    in which case the value estimate will be 0.
    :return: The value estimate dictionary with key being the name of the reward signal
    and the value the corresponding value estimate.
    """
    feed_dict: Dict[tf.Tensor, Any] = {
        self.model.batch_size: 1,
        self.model.sequence_length: 1,
    }
    vec_vis_obs = SplitObservations.from_observations(next_obs)
    for i in range(len(vec_vis_obs.visual_observations)):
        feed_dict[self.model.visual_in[i]] = [vec_vis_obs.visual_observations[i]]

    if self.use_vec_obs:
        feed_dict[self.model.vector_in] = [vec_vis_obs.vector_observations]
    if self.use_recurrent:
        feed_dict[self.model.memory_in] = self.retrieve_memories([agent_id])
    if not self.use_continuous_act and self.use_recurrent:
        feed_dict[self.model.prev_action] = self.retrieve_previous_action([agent_id])
    value_estimates = self.sess.run(self.model.value_heads, feed_dict)

    value_estimates = {k: float(v) for k, v in value_estimates.items()}

    # If we're done, reassign all of the value estimates that need terminal states.
    if done:
        for k in value_estimates:
            if self.reward_signals[k].use_terminal_states:
                value_estimates[k] = 0.0

    return value_estimates
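
# Editor's illustrative sketch, not part of the original policy class: the
# terminal-state handling above amounts to zeroing every value head whose reward
# signal bootstraps from terminal states. The signal names and the
# use_terminal_states flags below are hypothetical stand-ins for
# self.reward_signals[k].use_terminal_states.
def _example_zero_terminal_value_estimates(done: bool) -> Dict[str, float]:
    value_estimates = {"extrinsic": 0.73, "curiosity": 0.12}
    use_terminal_states = {"extrinsic": True, "curiosity": False}
    if done:
        # Only signals that treat the episode end as a true terminal state are zeroed.
        value_estimates = {
            k: 0.0 if use_terminal_states[k] else v for k, v in value_estimates.items()
        }
    return value_estimates  # done=True -> {"extrinsic": 0.0, "curiosity": 0.12}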
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the replay buffer.
    """
    super()._process_trajectory(trajectory)
    last_step = trajectory.steps[-1]
    agent_id = trajectory.agent_id  # All the agents should have the same ID
    agent_buffer_trajectory = trajectory.to_agentbuffer()

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory["vector_obs"])

    # Evaluate all reward functions for reporting purposes
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory["environment_rewards"]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        # BaseRewardProvider is a PyTorch-based reward signal
        if isinstance(reward_signal, BaseRewardProvider):
            evaluate_result = (
                reward_signal.evaluate(agent_buffer_trajectory)
                * reward_signal.strength
            )
        else:  # reward_signal uses TensorFlow
            evaluate_result = reward_signal.evaluate_batch(
                agent_buffer_trajectory
            ).scaled_reward
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Get all value estimates for reporting purposes
    value_estimates, _ = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
    )
    for name, v in value_estimates.items():
        # BaseRewardProvider is a PyTorch-based reward signal
        if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
            self._stats_reporter.add_stat(
                f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
                np.mean(v),
            )
        else:  # TensorFlow reward signal
            self._stats_reporter.add_stat(
                self.optimizer.reward_signals[name].value_name, np.mean(v)
            )

    # Bootstrap using the last step rather than the bootstrap step if max step is reached.
    # Set last element to duplicate obs and remove dones.
    if last_step.interrupted:
        vec_vis_obs = SplitObservations.from_observations(last_step.obs)
        for i, obs in enumerate(vec_vis_obs.visual_observations):
            agent_buffer_trajectory["next_visual_obs%d" % i][-1] = obs
        if vec_vis_obs.vector_observations.size > 1:
            agent_buffer_trajectory["next_vector_in"][
                -1
            ] = vec_vis_obs.vector_observations
        agent_buffer_trajectory["done"][-1] = False

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
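
# Editor's illustrative sketch, not part of the trainer class: when the last step was
# interrupted (e.g. the max step count was reached), _process_trajectory rewrites the
# final transition so training bootstraps from the last observation instead of treating
# it as a terminal state. The plain-dict "buffer" below is a hypothetical stand-in for
# the AgentBuffer fields touched above.
def _example_interrupted_bootstrap_fixup() -> dict:
    buffer = {
        "next_vector_in": [np.zeros(3, dtype=np.float32), np.zeros(3, dtype=np.float32)],
        "done": [False, True],
    }
    last_obs = np.array([0.1, 0.2, 0.3], dtype=np.float32)
    interrupted = True
    if interrupted:
        # Duplicate the last observation into the final "next obs" slot and clear the
        # done flag, so the value estimate at that step is bootstrapped rather than zeroed.
        buffer["next_vector_in"][-1] = last_obs
        buffer["done"][-1] = False
    return buffer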
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        previous_action = (
            np.array(
                pair_infos[idx].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )
            * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )

        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        # TODO: update the demonstration files and read from the new proto format
        if behavior_spec.action_spec.continuous_size > 0:
            demo_raw_buffer["continuous_action"].append(
                current_pair_info.action_info.vector_actions_deprecated
            )
        if behavior_spec.action_spec.discrete_size > 0:
            demo_raw_buffer["discrete_action"].append(
                current_pair_info.action_info.vector_actions_deprecated
            )
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
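
# Editor's illustrative sketch, not part of the demo loader: make_demo_buffer derives
# "prev_action" by shifting the recorded actions one step back and using an all-zero
# action for the very first pair. The raw action values below are made up for the example.
def _example_previous_actions() -> List[np.ndarray]:
    actions = [
        np.array([0.5, -0.5], dtype=np.float32),
        np.array([1.0, 0.0], dtype=np.float32),
        np.array([0.0, 1.0], dtype=np.float32),
    ]
    previous_actions = []
    for idx, action in enumerate(actions):
        if idx == 0:
            previous_actions.append(action * 0)  # zero action before the first step
        else:
            previous_actions.append(actions[idx - 1])
    return previous_actions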