def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_obs = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    next_obs = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action["continuous_action"] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action["discrete_action"] = action_buffer.discrete

    for _ in range(number):
        for i, obs in enumerate(curr_obs):
            buffer[ObsUtil.get_name_at(i)].append(obs)
        for i, obs in enumerate(next_obs):
            buffer[ObsUtil.get_name_at_next(i)].append(obs)
        buffer["actions"].append(action)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
def create_agent_buffer(
    behavior_spec: BehaviorSpec, number: int, reward: float = 0.0
) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_obs = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    next_obs = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action[BufferKey.CONTINUOUS_ACTION] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action[BufferKey.DISCRETE_ACTION] = action_buffer.discrete

    for _ in range(number):
        for i, obs in enumerate(curr_obs):
            buffer[ObsUtil.get_name_at(i)].append(obs)
        for i, obs in enumerate(next_obs):
            buffer[ObsUtil.get_name_at_next(i)].append(obs)
        # TODO
        # buffer[AgentBufferKey.ACTIONS].append(action)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        # TODO was "rewards"
        buffer[BufferKey.ENVIRONMENT_REWARDS].append(
            np.ones(1, dtype=np.float32) * reward
        )
        buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))
    buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
    return buffer
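# A minimal usage sketch for create_agent_buffer above. It assumes the
# ML-Agents helpers named here (BehaviorSpec, ActionSpec.create_continuous,
# and the test utility create_observation_specs_with_shapes) are importable
# from these paths; treat the paths as assumptions, not a definitive
# reference.
def _example_create_agent_buffer() -> None:
    from mlagents_envs.base_env import ActionSpec, BehaviorSpec
    from mlagents.trainers.tests.dummy_config import (
        create_observation_specs_with_shapes,
    )

    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes([(8,)]),  # one 8-dim vector obs
        ActionSpec.create_continuous(2),  # two continuous actions
    )
    buffer = create_agent_buffer(behavior_spec, number=16, reward=1.0)
    # One entry per experience was appended for each field.
    assert buffer.num_experiences == 16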
def test_buffer():
    agent_1_buffer = construct_fake_buffer(1)
    agent_2_buffer = construct_fake_buffer(2)
    agent_3_buffer = construct_fake_buffer(3)

    a = agent_1_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    assert_array(np.array(a), np.array([[171, 172, 173], [181, 182, 183]]))

    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    assert_array(
        np.array(a),
        np.array(
            [
                [231, 232, 233],
                [241, 242, 243],
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
            ]
        ),
    )

    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    assert_array(
        np.array(a),
        np.array(
            [
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
            ]
        ),
    )

    agent_1_buffer.reset_agent()
    assert agent_1_buffer.num_experiences == 0

    update_buffer = AgentBuffer()
    agent_2_buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    agent_3_buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
    assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)

    c = update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == update_buffer.keys()
    assert np.array(c[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])

    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
        tensor_obs, masks=act_masks, memories=memories, seq_len=policy.sequence_length
    )
    if discrete:
        assert log_probs.all_discrete_tensor.shape == (
            64,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.continuous_tensor.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (64,)
    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
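# The stride indexing above picks out the memory stored at the *start* of each
# sequence: with sequence_length L, entries 0, L, 2L, ... are the initial
# recurrent states needed to replay each sequence. A standalone sketch of the
# same pattern (plain torch, no ML-Agents types; names are illustrative):
import torch

def initial_memories(flat_memories: torch.Tensor, sequence_length: int) -> torch.Tensor:
    """flat_memories: (num_steps, m_size) -> (1, num_sequences, m_size)."""
    starts = [flat_memories[i] for i in range(0, len(flat_memories), sequence_length)]
    return torch.stack(starts).unsqueeze(0)

# e.g. 8 steps at sequence_length=4 yields the memories at steps 0 and 4:
assert initial_memories(torch.zeros(8, 6), 4).shape == (1, 2, 6)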
def construct_fake_buffer(fake_agent_id):
    b = AgentBuffer()
    for step in range(9):
        b[ObsUtil.get_name_at(0)].append(
            np.array(
                [
                    100 * fake_agent_id + 10 * step + 1,
                    100 * fake_agent_id + 10 * step + 2,
                    100 * fake_agent_id + 10 * step + 3,
                ],
                dtype=np.float32,
            )
        )
        b[BufferKey.CONTINUOUS_ACTION].append(
            np.array(
                [
                    100 * fake_agent_id + 10 * step + 4,
                    100 * fake_agent_id + 10 * step + 5,
                ],
                dtype=np.float32,
            )
        )
        b[BufferKey.GROUP_CONTINUOUS_ACTION].append(
            [
                np.array(
                    [
                        100 * fake_agent_id + 10 * step + 4,
                        100 * fake_agent_id + 10 * step + 5,
                    ],
                    dtype=np.float32,
                )
            ]
            * 3
        )
    return b
def test_evaluate_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    agent_action = AgentAction.from_buffer(buffer)
    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        tensor_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    if discrete:
        _size = policy.behavior_spec.action_spec.discrete_size
    else:
        _size = policy.behavior_spec.action_spec.continuous_size

    assert log_probs.flatten().shape == (64, _size)
    assert entropy.shape == (64,)
    for val in values.values():
        assert val.shape == (64,)
def get_trajectory_value_estimates(
    self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

    memory = torch.zeros([1, 1, self.policy.m_size])

    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    value_estimates, next_memory = self.policy.actor_critic.critic_pass(
        current_obs, memory, sequence_length=batch.num_experiences
    )

    next_value_estimate, _ = self.policy.actor_critic.critic_pass(
        next_obs, next_memory, sequence_length=1
    )

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for k in next_value_estimate:
            if not self.reward_signals[k].ignore_done:
                next_value_estimate[k] = 0.0

    return value_estimates, next_value_estimate
def _process_trajectory(self, trajectory: Trajectory) -> None:
    """
    Takes a trajectory and processes it, putting it into the replay buffer.
    """
    super()._process_trajectory(trajectory)
    last_step = trajectory.steps[-1]
    agent_id = trajectory.agent_id  # All the agents should have the same ID

    agent_buffer_trajectory = trajectory.to_agentbuffer()
    # Check if we used group rewards, warn if so.
    self._warn_if_group_reward(agent_buffer_trajectory)

    # Update the normalization
    if self.is_training:
        self.policy.update_normalization(agent_buffer_trajectory)

    # Evaluate all reward functions for reporting purposes
    self.collected_rewards["environment"][agent_id] += np.sum(
        agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS]
    )
    for name, reward_signal in self.optimizer.reward_signals.items():
        evaluate_result = (
            reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
        )
        # Report the reward signals
        self.collected_rewards[name][agent_id] += np.sum(evaluate_result)

    # Get all value estimates for reporting purposes
    (
        value_estimates,
        _,
        value_memories,
    ) = self.optimizer.get_trajectory_value_estimates(
        agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
    )
    if value_memories is not None:
        agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories)

    for name, v in value_estimates.items():
        self._stats_reporter.add_stat(
            f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
            np.mean(v),
        )

    # Bootstrap using the last step rather than the bootstrap step if max step is reached.
    # Set last element to duplicate obs and remove dones.
    if last_step.interrupted:
        last_step_obs = last_step.obs
        for i, obs in enumerate(last_step_obs):
            agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
        agent_buffer_trajectory[BufferKey.DONE][-1] = False

    # Append to update buffer
    agent_buffer_trajectory.resequence_and_append(
        self.update_buffer, training_length=self.policy.sequence_length
    )

    if trajectory.done_reached:
        self._update_end_episode_stats(agent_id, self.optimizer)
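# Why the interrupted-episode fixup above matters: if an episode is cut off by
# max_step rather than a true terminal, the return should still bootstrap from
# the value of the last observed state. A standalone sketch of a discounted
# return with and without bootstrapping (plain numpy, illustrative only):
import numpy as np

def discounted_return(rewards: np.ndarray, gamma: float, bootstrap_value: float) -> float:
    """Return from step 0, bootstrapping from the state after the last reward."""
    ret = bootstrap_value
    for r in reversed(rewards):
        ret = r + gamma * ret
    return ret

rewards = np.array([0.1, 0.1, 0.1], dtype=np.float32)
# Interrupted (done forced to False): bootstrap from the critic's estimate.
assert discounted_return(rewards, 0.99, bootstrap_value=5.0) > discounted_return(
    rewards, 0.99, bootstrap_value=0.0  # true terminal: no value after the end
)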
def forward(self, mini_batch: AgentBuffer) -> torch.Tensor:
    n_obs = len(self._encoder.processors)
    np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
    # Convert to tensors
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    hidden, _ = self._encoder.forward(tensor_obs)
    self._encoder.update_normalization(mini_batch)
    return hidden
def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]:
    """
    Creates the observation input.
    """
    n_obs = len(self.encoder.processors)
    np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
    # Convert to tensors
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    return tensor_obs
def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Extracts the next state embedding from a mini_batch.
    """
    n_obs = len(self._state_encoder.processors)
    np_obs = ObsUtil.from_buffer_next(mini_batch, n_obs)
    # Convert to tensors
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    hidden, _ = self._state_encoder.forward(tensor_obs)
    return hidden
def construct_fake_buffer(fake_agent_id):
    b = AgentBuffer()
    for step in range(9):
        b[ObsUtil.get_name_at(0)].append(
            [
                100 * fake_agent_id + 10 * step + 1,
                100 * fake_agent_id + 10 * step + 2,
                100 * fake_agent_id + 10 * step + 3,
            ]
        )
        b[BufferKey.CONTINUOUS_ACTION].append(
            [
                100 * fake_agent_id + 10 * step + 4,
                100 * fake_agent_id + 10 * step + 5,
            ]
        )
    return b
def _update_batch(
    self, mini_batch_demo: AgentBuffer, n_sequences: int
) -> Dict[str, float]:
    """
    Helper function for update_batch.
    """
    np_obs = ObsUtil.from_buffer(
        mini_batch_demo, len(self.policy.behavior_spec.observation_specs)
    )
    # Convert to tensors
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
    act_masks = None
    expert_actions = AgentAction.from_buffer(mini_batch_demo)
    if self.policy.behavior_spec.action_spec.discrete_size > 0:
        act_masks = ModelUtils.list_to_tensor(
            np.ones(
                (
                    self.n_sequences * self.policy.sequence_length,
                    sum(self.policy.behavior_spec.action_spec.discrete_branches),
                ),
                dtype=np.float32,
            )
        )

    memories = []
    if self.policy.use_recurrent:
        memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

    selected_actions, log_probs, _, _ = self.policy.sample_actions(
        tensor_obs,
        masks=act_masks,
        memories=memories,
        seq_len=self.policy.sequence_length,
    )
    bc_loss = self._behavioral_cloning_loss(
        selected_actions, log_probs, expert_actions
    )
    self.optimizer.zero_grad()
    bc_loss.backward()
    self.optimizer.step()
    run_out = {"loss": bc_loss.item()}
    return run_out
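# _behavioral_cloning_loss above scores the policy's output against expert
# actions. For continuous actions this is commonly a mean-squared-error or
# negative log-likelihood term; a standalone MSE sketch under that assumption
# (plain torch, illustrative rather than ML-Agents' exact loss):
import torch

def bc_mse_loss(policy_actions: torch.Tensor, expert_actions: torch.Tensor) -> torch.Tensor:
    # Mean squared error pulls the policy's actions toward the demonstration.
    return torch.mean((policy_actions - expert_actions) ** 2)

loss = bc_mse_loss(torch.randn(8, 2), torch.randn(8, 2))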
def update_normalization(self, buffer: AgentBuffer) -> None:
    obs = ObsUtil.from_buffer(buffer, len(self.processors))
    for vec_input, enc in zip(obs, self.processors):
        if isinstance(enc, VectorInput):
            enc.update_normalization(torch.as_tensor(vec_input))
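# VectorInput.update_normalization above folds a new batch of vector
# observations into running statistics. Presumably this is a running
# mean/variance update; a minimal standalone sketch of one common scheme
# (batched Welford/Chan combination, not necessarily ML-Agents' exact formula):
import torch

class RunningNorm:
    def __init__(self, size: int):
        self.mean = torch.zeros(size)
        self.var = torch.ones(size)
        self.count = 1e-4  # avoids division by zero before the first update

    def update(self, batch: torch.Tensor) -> None:
        batch_mean = batch.mean(dim=0)
        batch_var = batch.var(dim=0, unbiased=False)
        n = batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + n
        # Chan et al. parallel-variance combination of the two partitions.
        self.mean = self.mean + delta * n / total
        m_a = self.var * self.count
        m_b = batch_var * n
        self.var = (m_a + m_b + delta.pow(2) * self.count * n / total) / total
        self.count = total

norm = RunningNorm(3)
norm.update(torch.randn(32, 3) * 2.0 + 5.0)
normalized = (torch.randn(4, 3) - norm.mean) / torch.sqrt(norm.var + 1e-8)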
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        previous_action = (
            np.array(
                pair_infos[idx].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )
            * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )

        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]

        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        for i, obs in enumerate(current_obs):
            demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
        if (
            len(current_pair_info.action_info.continuous_actions) == 0
            and len(current_pair_info.action_info.discrete_actions) == 0
        ):
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
            else:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
        else:
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.continuous_actions
                )
            if behavior_spec.action_spec.discrete_size > 0:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.discrete_actions
                )
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Performs update on model.
    :param batch: Batch of experiences.
    :param num_sequences: Number of sequences to process.
    :return: Results of update.
    """
    # Get decayed parameters
    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step())
    decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
    returns = {}
    old_values = {}
    for name in self.reward_signals:
        old_values[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.value_estimates_key(name)]
        )
        returns[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.returns_key(name)]
        )

    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)

    memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    # Get value memories
    value_memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]
    if len(value_memories) > 0:
        value_memories = torch.stack(value_memories).unsqueeze(0)

    log_probs, entropy = self.policy.evaluate_actions(
        current_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=self.policy.sequence_length,
    )
    values, _ = self.critic.critic_pass(
        current_obs,
        memories=value_memories,
        sequence_length=self.policy.sequence_length,
    )
    old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
    log_probs = log_probs.flatten()
    loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    value_loss = self.ppo_value_loss(
        values, old_values, returns, decay_eps, loss_masks
    )
    policy_loss = self.ppo_policy_loss(
        ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
        log_probs,
        old_log_probs,
        loss_masks,
    )
    loss = (
        policy_loss
        + 0.5 * value_loss
        - decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
    )

    # Set optimizer learning rate
    ModelUtils.update_learning_rate(self.optimizer, decay_lr)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    update_stats = {
        # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
        # TODO: After PyTorch is default, change to something more correct.
        "Losses/Policy Loss": torch.abs(policy_loss).item(),
        "Losses/Value Loss": value_loss.item(),
        "Policy/Learning Rate": decay_lr,
        "Policy/Epsilon": decay_eps,
        "Policy/Beta": decay_bet,
    }

    for reward_provider in self.reward_signals.values():
        update_stats.update(reward_provider.update(batch))

    return update_stats
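# The policy loss above comes from self.ppo_policy_loss. A standalone sketch
# of the standard PPO clipped surrogate it presumably implements (plain torch;
# the masked mean mirrors the loss_masks handling of padded sequence steps):
import torch

def clipped_ppo_policy_loss(
    advantages: torch.Tensor,
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    masks: torch.Tensor,
    epsilon: float = 0.2,
) -> torch.Tensor:
    ratio = torch.exp(log_probs - old_log_probs)
    surrogate = torch.min(
        ratio * advantages,
        torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages,
    )
    # Negate: optimizers minimize, while PPO maximizes the surrogate.
    return -(surrogate * masks).sum() / masks.sum().clamp(min=1.0)

loss = clipped_ppo_policy_loss(
    advantages=torch.randn(64),
    log_probs=torch.randn(64),
    old_log_probs=torch.randn(64),
    masks=torch.ones(64),
)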
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    :param num_sequences: Number of trajectories in batch.
    :param batch: Experience mini-batch.
    :return: Output from update process.
    """
    rewards = {}
    for name in self.reward_signals:
        rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])

    n_obs = len(self.policy.behavior_spec.sensor_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

    next_obs = ObsUtil.from_buffer_next(batch, n_obs)
    # Convert to tensors
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

    act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
    actions = AgentAction.from_dict(batch)

    memories_list = [
        ModelUtils.list_to_tensor(batch["memory"][i])
        for i in range(0, len(batch["memory"]), self.policy.sequence_length)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    offset = 1 if self.policy.sequence_length > 1 else 0
    next_memories_list = [
        ModelUtils.list_to_tensor(
            batch["memory"][i][self.policy.m_size // 2 :]
        )  # only pass value part of memory to target network
        for i in range(offset, len(batch["memory"]), self.policy.sequence_length)
    ]

    if len(memories_list) > 0:
        memories = torch.stack(memories_list).unsqueeze(0)
        next_memories = torch.stack(next_memories_list).unsqueeze(0)
    else:
        memories = None
        next_memories = None
    # Q network memories are 0'ed out, since we don't have them during inference.
    q_memories = (
        torch.zeros_like(next_memories) if next_memories is not None else None
    )

    # Copy normalizers from policy
    self.value_network.q1_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    self.value_network.q2_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    self.target_network.network_body.copy_normalization(
        self.policy.actor_critic.network_body
    )
    (
        sampled_actions,
        log_probs,
        _,
        value_estimates,
        _,
    ) = self.policy.actor_critic.get_action_stats_and_value(
        current_obs,
        masks=act_masks,
        memories=memories,
        sequence_length=self.policy.sequence_length,
    )
    cont_sampled_actions = sampled_actions.continuous_tensor
    cont_actions = actions.continuous_tensor
    q1p_out, q2p_out = self.value_network(
        current_obs,
        cont_sampled_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
        q2_grad=False,
    )
    q1_out, q2_out = self.value_network(
        current_obs,
        cont_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
    )

    if self._action_spec.discrete_size > 0:
        disc_actions = actions.discrete_tensor
        q1_stream = self._condense_q_streams(q1_out, disc_actions)
        q2_stream = self._condense_q_streams(q2_out, disc_actions)
    else:
        q1_stream, q2_stream = q1_out, q2_out

    with torch.no_grad():
        target_values, _ = self.target_network(
            next_obs,
            memories=next_memories,
            sequence_length=self.policy.sequence_length,
        )
    masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
    dones = ModelUtils.list_to_tensor(batch["done"])

    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
    entropy_loss = self.sac_entropy_loss(log_probs, masks)

    total_value_loss = q1_loss + q2_loss + value_loss

    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

    ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
    self.value_optimizer.zero_grad()
    total_value_loss.backward()
    self.value_optimizer.step()

    ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
    self.entropy_optimizer.zero_grad()
    entropy_loss.backward()
    self.entropy_optimizer.step()

    # Update target network
    ModelUtils.soft_update(
        self.policy.actor_critic.critic, self.target_network, self.tau
    )
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Discrete Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.discrete)
        ).item(),
        "Policy/Continuous Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.continuous)
        ).item(),
        "Policy/Learning Rate": decay_lr,
    }

    return update_stats
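# ModelUtils.soft_update above blends the online critic into the target
# network. A standalone sketch of the usual Polyak average it presumably
# performs, target <- tau * source + (1 - tau) * target (plain torch):
import torch
import torch.nn as nn

@torch.no_grad()
def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None:
    for s_param, t_param in zip(source.parameters(), target.parameters()):
        t_param.mul_(1.0 - tau).add_(tau * s_param)

source, target = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(source, target, tau=0.005)  # target moves 0.5% toward source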
def get_trajectory_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    done: bool,
    agent_id: str = "",
) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
    """
    Get value estimates and memories for a trajectory, in batch form.
    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for bootstrapping
        if this is not a terminal trajectory.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
        the final value estimate as a Dict of [name, float], and optionally (if using memories)
        an AgentBufferField of initial critic memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    if agent_id in self.critic_memory_dict:
        memory = self.critic_memory_dict[agent_id]
    else:
        memory = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )

    # Convert to tensors
    current_obs = [
        ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
    ]
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    # If we're using LSTM, we want to get all the intermediate memories.
    all_next_memories: Optional[AgentBufferField] = None

    # To prevent memory leak and improve performance, evaluate with no_grad.
    with torch.no_grad():
        if self.policy.use_recurrent:
            (
                value_estimates,
                all_next_memories,
                next_memory,
            ) = self._evaluate_by_sequence(current_obs, memory)
        else:
            value_estimates, next_memory = self.critic.critic_pass(
                current_obs, memory, sequence_length=batch.num_experiences
            )

    # Store the memory for the next trajectory. This should NOT have a gradient.
    self.critic_memory_dict[agent_id] = next_memory

    next_value_estimate, _ = self.critic.critic_pass(
        next_obs, next_memory, sequence_length=1
    )

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for k in next_value_estimate:
            if not self.reward_signals[k].ignore_done:
                next_value_estimate[k] = 0.0
        if agent_id in self.critic_memory_dict:
            self.critic_memory_dict.pop(agent_id)
    return value_estimates, next_value_estimate, all_next_memories
def get_trajectory_and_baseline_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    next_groupmate_obs: List[List[np.ndarray]],
    done: bool,
    agent_id: str = "",
) -> Tuple[
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
    Dict[str, float],
    Optional[AgentBufferField],
    Optional[AgentBufferField],
]:
    """
    Get value estimates, baseline estimates, and memories for a trajectory, in batch form.
    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for bootstrapping
        if this is not a terminal trajectory.
    :param next_groupmate_obs: the next observations from other members of the group.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
        the baseline estimates as a Dict, the final value estimate as a Dict of [name, float],
        and optionally (if using memories) an AgentBufferField of initial critic and baseline
        memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    current_obs = ObsUtil.from_buffer(batch, n_obs)
    groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)

    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    groupmate_obs = [
        [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs]
        for _groupmate_obs in groupmate_obs
    ]

    groupmate_actions = AgentAction.group_from_buffer(batch)

    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    next_groupmate_obs = [
        ModelUtils.list_to_tensor_list(_list_obs) for _list_obs in next_groupmate_obs
    ]
    # Expand dimensions of next critic obs
    next_groupmate_obs = [
        [_obs.unsqueeze(0) for _obs in _list_obs] for _list_obs in next_groupmate_obs
    ]

    if agent_id in self.value_memory_dict:
        # The agent_id should always be in both since they are added together
        _init_value_mem = self.value_memory_dict[agent_id]
        _init_baseline_mem = self.baseline_memory_dict[agent_id]
    else:
        _init_value_mem = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )
        _init_baseline_mem = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )

    all_obs = (
        [current_obs] + groupmate_obs if groupmate_obs is not None else [current_obs]
    )
    all_next_value_mem: Optional[AgentBufferField] = None
    all_next_baseline_mem: Optional[AgentBufferField] = None
    with torch.no_grad():
        if self.policy.use_recurrent:
            (
                value_estimates,
                baseline_estimates,
                all_next_value_mem,
                all_next_baseline_mem,
                next_value_mem,
                next_baseline_mem,
            ) = self._evaluate_by_sequence_team(
                current_obs,
                groupmate_obs,
                groupmate_actions,
                _init_value_mem,
                _init_baseline_mem,
            )
        else:
            value_estimates, next_value_mem = self.critic.critic_pass(
                all_obs, _init_value_mem, sequence_length=batch.num_experiences
            )
            groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
            baseline_estimates, next_baseline_mem = self.critic.baseline(
                current_obs,
                groupmate_obs_and_actions,
                _init_baseline_mem,
                sequence_length=batch.num_experiences,
            )
    # Store the memory for the next trajectory
    self.value_memory_dict[agent_id] = next_value_mem
    self.baseline_memory_dict[agent_id] = next_baseline_mem

    all_next_obs = (
        [next_obs] + next_groupmate_obs
        if next_groupmate_obs is not None
        else [next_obs]
    )

    next_value_estimates, _ = self.critic.critic_pass(
        all_next_obs, next_value_mem, sequence_length=1
    )

    for name, estimate in baseline_estimates.items():
        baseline_estimates[name] = ModelUtils.to_numpy(estimate)

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)

    # the baseline and V should not be on the same done flag
    for name, estimate in next_value_estimates.items():
        next_value_estimates[name] = ModelUtils.to_numpy(estimate)

    if done:
        for k in next_value_estimates:
            if not self.reward_signals[k].ignore_done:
                next_value_estimates[k][-1] = 0.0

    return (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
        all_next_value_mem,
        all_next_baseline_mem,
    )
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Updates model using buffer.
    :param num_sequences: Number of trajectories in batch.
    :param batch: Experience mini-batch.
    :return: Output from update process.
    """
    rewards = {}
    for name in self.reward_signals:
        rewards[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.rewards_key(name)]
        )

    n_obs = len(self.policy.behavior_spec.observation_specs)
    current_obs = ObsUtil.from_buffer(batch, n_obs)
    # Convert to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

    next_obs = ObsUtil.from_buffer_next(batch, n_obs)
    # Convert to tensors
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)

    memories_list = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i])
        for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    value_memories_list = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
        for i in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length
        )
    ]

    if len(memories_list) > 0:
        memories = torch.stack(memories_list).unsqueeze(0)
        value_memories = torch.stack(value_memories_list).unsqueeze(0)
    else:
        memories = None
        value_memories = None

    # Q and V network memories are 0'ed out, since we don't have them during inference.
    q_memories = (
        torch.zeros_like(value_memories) if value_memories is not None else None
    )

    # Copy normalizers from policy
    self.q_network.q1_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self.q_network.q2_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self.target_network.network_body.copy_normalization(
        self.policy.actor.network_body
    )
    self._critic.network_body.copy_normalization(self.policy.actor.network_body)
    sampled_actions, log_probs, _, _ = self.policy.actor.get_action_and_stats(
        current_obs,
        masks=act_masks,
        memories=memories,
        sequence_length=self.policy.sequence_length,
    )
    value_estimates, _ = self._critic.critic_pass(
        current_obs, value_memories, sequence_length=self.policy.sequence_length
    )

    cont_sampled_actions = sampled_actions.continuous_tensor
    cont_actions = actions.continuous_tensor
    q1p_out, q2p_out = self.q_network(
        current_obs,
        cont_sampled_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
        q2_grad=False,
    )
    q1_out, q2_out = self.q_network(
        current_obs,
        cont_actions,
        memories=q_memories,
        sequence_length=self.policy.sequence_length,
    )

    if self._action_spec.discrete_size > 0:
        disc_actions = actions.discrete_tensor
        q1_stream = self._condense_q_streams(q1_out, disc_actions)
        q2_stream = self._condense_q_streams(q2_out, disc_actions)
    else:
        q1_stream, q2_stream = q1_out, q2_out

    with torch.no_grad():
        # Since we didn't record the next value memories, evaluate one step in the critic to
        # get them.
        if value_memories is not None:
            # Get the first observation in each sequence
            just_first_obs = [
                _obs[:: self.policy.sequence_length] for _obs in current_obs
            ]
            _, next_value_memories = self._critic.critic_pass(
                just_first_obs, value_memories, sequence_length=1
            )
        else:
            next_value_memories = None
        target_values, _ = self.target_network(
            next_obs,
            memories=next_value_memories,
            sequence_length=self.policy.sequence_length,
        )
    masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)
    dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE])

    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
    entropy_loss = self.sac_entropy_loss(log_probs, masks)

    total_value_loss = q1_loss + q2_loss
    if self.policy.shared_critic:
        policy_loss += value_loss
    else:
        total_value_loss += value_loss

    decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step())
    ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
    self.policy_optimizer.zero_grad()
    policy_loss.backward()
    self.policy_optimizer.step()

    ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
    self.value_optimizer.zero_grad()
    total_value_loss.backward()
    self.value_optimizer.step()

    ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
    self.entropy_optimizer.zero_grad()
    entropy_loss.backward()
    self.entropy_optimizer.step()

    # Update target network
    ModelUtils.soft_update(self._critic, self.target_network, self.tau)
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Discrete Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.discrete)
        ).item(),
        "Policy/Continuous Entropy Coeff": torch.mean(
            torch.exp(self._log_ent_coef.continuous)
        ).item(),
        "Policy/Learning Rate": decay_lr,
    }

    return update_stats
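# sac_q_loss above regresses both Q streams toward a bootstrapped target. A
# standalone sketch of the usual SAC-style target under that assumption,
# q_target = r + gamma * (1 - done) * V_target(s') (plain torch, illustrative):
import torch
import torch.nn.functional as F

def q_loss(
    q: torch.Tensor,
    rewards: torch.Tensor,
    dones: torch.Tensor,
    target_values: torch.Tensor,
    gamma: float = 0.99,
) -> torch.Tensor:
    with torch.no_grad():
        # No gradient through the target; terminal steps drop the bootstrap.
        q_target = rewards + gamma * (1.0 - dones) * target_values
    return F.mse_loss(q, q_target)

loss = q_loss(torch.randn(16), torch.randn(16), torch.zeros(16), torch.randn(16))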
def test_buffer():
    agent_1_buffer = construct_fake_buffer(1)
    agent_2_buffer = construct_fake_buffer(2)
    agent_3_buffer = construct_fake_buffer(3)

    # Test get_batch
    a = agent_1_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    assert_array(
        np.array(a), np.array([[171, 172, 173], [181, 182, 183]], dtype=np.float32)
    )

    # Test get_batch
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    assert_array(
        np.array(a),
        np.array(
            [
                [231, 232, 233],
                [241, 242, 243],
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
            ],
            dtype=np.float32,
        ),
    )

    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    assert_array(
        np.array(a),
        np.array(
            [
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
            ]
        ),
    )

    # Test padding
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=None, training_length=4, sequential=True
    )
    assert_array(
        np.array(a),
        np.array(
            [
                [201, 202, 203],
                [211, 212, 213],
                [221, 222, 223],
                [231, 232, 233],
                [241, 242, 243],
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
                [0, 0, 0],
                [0, 0, 0],
                [0, 0, 0],
            ]
        ),
    )

    # Test group entries return Lists of Lists. Make sure to pad properly!
    a = agent_2_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].get_batch(
        batch_size=None, training_length=4, sequential=True
    )
    for _group_entry in a[:-3]:
        assert len(_group_entry) == 3
    for _group_entry in a[-3:]:
        assert len(_group_entry) == 0

    agent_1_buffer.reset_agent()
    assert agent_1_buffer.num_experiences == 0

    update_buffer = AgentBuffer()
    agent_2_buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    agent_3_buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
    assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)

    c = update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == update_buffer.keys()
    # Make sure the values of c are AgentBufferField
    for val in c.values():
        assert isinstance(val, AgentBufferField)
    assert np.array(c[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)
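# The two get_batch modes asserted above differ in how windows are drawn. A
# standalone sketch of the apparent semantics (plain Python, illustrative):
# sequential=True takes the last batch_size non-overlapping sequences of
# training_length; sequential=False takes overlapping windows ending at each
# of the last batch_size items.
def batch_windows(items, batch_size, training_length, sequential):
    if sequential:
        return items[-batch_size * training_length :]
    out = []
    for end in range(len(items) - batch_size + 1, len(items) + 1):
        out.extend(items[end - training_length : end])
    return out

steps = list(range(9))
assert batch_windows(steps, 2, 3, sequential=True) == [3, 4, 5, 6, 7, 8]
assert batch_windows(steps, 2, 3, sequential=False) == [5, 6, 7, 6, 7, 8]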