import numpy as np

from mlagents_envs.base_env import BehaviorSpec
from mlagents.trainers.buffer import AgentBuffer, BufferKey
from mlagents.trainers.trajectory import ObsUtil, Trajectory


def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_obs = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    next_obs = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action["continuous_action"] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action["discrete_action"] = action_buffer.discrete

    for _ in range(number):
        # Append the same random observations, action, reward, and mask
        # for each of the `number` experiences.
        for i, obs in enumerate(curr_obs):
            buffer[ObsUtil.get_name_at(i)].append(obs)
        for i, obs in enumerate(next_obs):
            buffer[ObsUtil.get_name_at_next(i)].append(obs)
        buffer["actions"].append(action)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    """Build an AgentBuffer of `number` identical experiences drawn randomly
    from `behavior_spec`, each carrying the given reward."""
    buffer = AgentBuffer()
    curr_obs = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    next_obs = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action[BufferKey.CONTINUOUS_ACTION] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action[BufferKey.DISCRETE_ACTION] = action_buffer.discrete

    for _ in range(number):
        for i, obs in enumerate(curr_obs):
            buffer[ObsUtil.get_name_at(i)].append(obs)
        for i, obs in enumerate(next_obs):
            buffer[ObsUtil.get_name_at_next(i)].append(obs)
        # TODO
        # buffer[AgentBufferKey.ACTIONS].append(action)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        # TODO was "rewards"
        buffer[BufferKey.ENVIRONMENT_REWARDS].append(
            np.ones(1, dtype=np.float32) * reward
        )
        buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))
    buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
    return buffer
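# Minimal usage sketch for create_agent_buffer above. This is a hypothetical
# example, not part of the original module: the exact ObservationSpec fields
# (here shape, dimension_property, observation_type, passed positionally) vary
# across ml-agents versions, so treat the spec construction as an assumption.
from mlagents_envs.base_env import (
    ActionSpec,
    DimensionProperty,
    ObservationSpec,
    ObservationType,
)


def _example_usage() -> None:
    # One 8-dim vector observation, two continuous actions.
    obs_spec = ObservationSpec(
        (8,), (DimensionProperty.NONE,), ObservationType.DEFAULT
    )
    behavior_spec = BehaviorSpec(
        observation_specs=[obs_spec],
        action_spec=ActionSpec.create_continuous(2),
    )
    buffer = create_agent_buffer(behavior_spec, number=10, reward=1.0)
    assert buffer.num_experiences == 10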
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the replay buffer.
    """
    super()._process_trajectory(trajectory)
    last_step = trajectory.steps[-1]
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Check if we used group rewards, warn if so.
    self._warn_if_group_reward(agent_buffer_trajectory)

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Evaluate all reward functions for reporting purposes
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Get all value estimates for reporting purposes
    (
        value_estimates,
        _,
        value_memories,
    ) = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
    )
    if value_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)

    for name, v in value_estimates.items():
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
            np.mean(v),
        )

    # Bootstrap using the last step rather than the bootstrap step if max step is reached.
    # Set last element to duplicate obs and remove dones.
    if last_step.interrupted:
        last_step_obs = last_step.obs
        for i, obs in enumerate(last_step_obs):
            agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
        agent_buffer_trajectory[BufferKey.DONE][-1] = False

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
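# The `last_step.interrupted` branch above handles episodes cut off by max_step:
# such a cut is not a true terminal, so the next-obs slot is overwritten with the
# last live observation and the done flag is cleared, letting return targets
# bootstrap from V(s_T). A self-contained toy illustration of that patch (plain
# Python lists with hypothetical names, not mlagents trainer code):
def patch_interrupted_tail(next_obs_field, done_field, last_obs):
    """Duplicate the final live observation into the next-obs slot and clear
    the done flag so value targets bootstrap instead of terminating."""
    next_obs_field[-1] = last_obs
    done_field[-1] = False


_next_obs = [[0.1], [0.2], [0.3]]
_dones = [False, False, True]  # True only because max_step interrupted the episode
patch_interrupted_tail(_next_obs, _dones, last_obs=[0.25])
assert _dones[-1] is False and _next_obs[-1] == [0.25]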