def test_sample_actions(rnn, visual, discrete):
    """
    Build a mock policy, sample actions for a simulated 64-step rollout and
    check the shapes of the returned log probs, entropies and memories.
    """
    num_steps = 64
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(
        num_steps, policy.behavior_spec, memory_size=policy.m_size
    )
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    obs_count = len(policy.behavior_spec.observation_specs)
    tensor_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(buffer, obs_count)
    ]
    # Keep one stored memory every `sequence_length` steps.
    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][start])
        for start in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)

    sampled_actions, log_probs, entropies, memories = policy.sample_actions(
        tensor_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
    )

    if discrete:
        expected_shape = (
            num_steps,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
        assert log_probs.all_discrete_tensor.shape == expected_shape
    else:
        expected_shape = (
            num_steps,
            policy.behavior_spec.action_spec.continuous_size,
        )
        assert log_probs.continuous_tensor.shape == expected_shape
    assert entropies.shape == (num_steps,)

    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
def _group_agent_action_from_buffer(
    buff: AgentBuffer, cont_action_key: BufferKey, disc_action_key: BufferKey
) -> List["AgentAction"]:
    """
    Extracts continuous and discrete groupmate actions, as specified by BufferKey, and
    returns a List of AgentActions that correspond to the groupmates' actions. The List
    will be of length equal to the maximum number of groupmates in the buffer. In any
    spots where there are fewer agents than the maximum, the actions are padded with 0's.

    :param buff: Buffer to read the groupmate action fields from.
    :param cont_action_key: Key of the continuous groupmate action field; skipped if
        it is not present in buff.
    :param disc_action_key: Key of the discrete groupmate action field; skipped if
        it is not present in buff.
    :return: One AgentAction per entry of the padded batches. When only one of the two
        keys is present, the missing part of each AgentAction is None.
    """
    continuous_tensors: List[torch.Tensor] = []
    discrete_tensors: List[torch.Tensor] = []
    if cont_action_key in buff:
        padded_batch = buff[cont_action_key].padded_to_batch()
        continuous_tensors = [ModelUtils.list_to_tensor(arr) for arr in padded_batch]
    if disc_action_key in buff:
        # np.long was deprecated in NumPy 1.20 and removed in 1.24. np.int64 is an
        # explicit, portable dtype that matches the torch.long conversion below.
        padded_batch = buff[disc_action_key].padded_to_batch(dtype=np.int64)
        discrete_tensors = [
            ModelUtils.list_to_tensor(arr, dtype=torch.long) for arr in padded_batch
        ]

    actions_list = []
    # zip_longest pads the shorter (possibly empty) list with None so that an
    # AgentAction is still produced when only one action type is present.
    for _cont, _disc in itertools.zip_longest(
        continuous_tensors, discrete_tensors, fillvalue=None
    ):
        if _disc is not None:
            # Split the last axis into a list of per-index tensors.
            _disc = [_disc[..., i] for i in range(_disc.shape[-1])]
        actions_list.append(AgentAction(_cont, _disc))
    return actions_list
def test_evaluate_actions(rnn, visual, discrete):
    """
    Build a mock policy, evaluate the buffered actions of a simulated 64-step
    rollout and check the shapes of the log probs, entropy and value outputs.
    """
    num_steps = 64
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(
        num_steps, policy.behavior_spec, memory_size=policy.m_size
    )
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    agent_action = AgentAction.from_buffer(buffer)
    obs_count = len(policy.behavior_spec.observation_specs)
    tensor_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(buffer, obs_count)
    ]
    # Keep one stored memory every `sequence_length` steps.
    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][start])
        for start in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        tensor_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )

    _size = (
        policy.behavior_spec.action_spec.discrete_size
        if discrete
        else policy.behavior_spec.action_spec.continuous_size
    )
    assert log_probs.flatten().shape == (num_steps, _size)
    assert entropy.shape == (num_steps,)
    for val in values.values():
        assert val.shape == (num_steps,)
def get_trajectory_value_estimates(
    self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    """
    Run the critic over a trajectory and over the observation that follows it.

    :param batch: Buffer holding the trajectory's observations.
    :param next_obs: Observations following the trajectory.
    :param done: If True, the next value estimate is set to 0.0 for every reward
        signal whose `ignore_done` is falsy.
    :return: Tuple of (per-signal value estimates for the trajectory, per-signal
        value estimate for next_obs), both converted with ModelUtils.to_numpy.
    """
    obs_count = len(self.policy.behavior_spec.observation_specs)
    # Convert to tensors
    current_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(batch, obs_count)
    ]
    # Convert to tensors and add a leading dimension of size 1
    next_obs = [ModelUtils.list_to_tensor(obs).unsqueeze(0) for obs in next_obs]

    memory = torch.zeros([1, 1, self.policy.m_size])

    value_estimates, next_memory = self.policy.actor_critic.critic_pass(
        current_obs, memory, sequence_length=batch.num_experiences
    )
    # The memory returned by the first pass seeds the pass over next_obs.
    next_value_estimate, _ = self.policy.actor_critic.critic_pass(
        next_obs, next_memory, sequence_length=1
    )

    for name in list(value_estimates):
        value_estimates[name] = ModelUtils.to_numpy(value_estimates[name])
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for signal_name in next_value_estimate:
            if self.reward_signals[signal_name].ignore_done:
                continue
            next_value_estimate[signal_name] = 0.0

    return value_estimates, next_value_estimate
def _update_batch(
    self, mini_batch_demo: Dict[str, np.ndarray], n_sequences: int
) -> Dict[str, float]:
    """
    Helper function for update_batch. Samples actions from the policy for a
    demonstration mini batch, computes the behavioral cloning loss against the
    expert actions and takes one optimizer step.

    :param mini_batch_demo: Demonstration mini batch keyed by field name.
    :param n_sequences: NOTE(review): not read by this body, which uses
        self.n_sequences instead - confirm this is intended.
    :return: Dict with the scalar loss under the "loss" key.
    """
    policy = self.policy
    vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]

    act_masks = None
    if policy.use_continuous_act:
        expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
    else:
        raw_expert_actions = ModelUtils.list_to_tensor(
            mini_batch_demo["actions"], dtype=torch.long
        )
        expert_actions = ModelUtils.actions_to_onehot(
            raw_expert_actions, policy.act_size
        )
        # All-ones action mask for the discrete case.
        mask_shape = (
            self.n_sequences * policy.sequence_length,
            sum(policy.behavior_spec.discrete_action_branches),
        )
        act_masks = ModelUtils.list_to_tensor(np.ones(mask_shape, dtype=np.float32))

    memories = []
    if policy.use_recurrent:
        memories = torch.zeros(1, self.n_sequences, policy.m_size)

    vis_obs = []
    if policy.use_vis_obs:
        vis_obs = [
            ModelUtils.list_to_tensor(mini_batch_demo["visual_obs%d" % idx])
            for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors)
        ]

    selected_actions, all_log_probs, _, _ = policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
        all_log_probs=True,
    )
    bc_loss = self._behavioral_cloning_loss(
        selected_actions, all_log_probs, expert_actions
    )

    self.optimizer.zero_grad()
    bc_loss.backward()
    self.optimizer.step()

    return {"loss": bc_loss.item()}
def test_sample_actions(rnn, visual, discrete):
    """
    Build a mock policy, sample actions for a simulated 64-step rollout and
    check the shapes of the log probs, clipped actions, entropies and memories.
    """
    num_steps = 64
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(
        num_steps, policy.behavior_spec, memory_size=policy.m_size
    )
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
    # One visual observation tensor per visual processor of the network body.
    vis_obs = [
        ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors)
    ]
    # Keep one stored memory every `sequence_length` steps.
    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][start])
        for start in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)

    outputs = policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
        all_log_probs=not policy.use_continuous_act,
    )
    sampled_actions, clipped_actions, log_probs, entropies, memories = outputs

    if discrete:
        assert log_probs.shape == (
            num_steps,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.shape == (
            num_steps,
            policy.behavior_spec.action_spec.continuous_size,
        )
        assert clipped_actions.shape == (
            num_steps,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (num_steps,)

    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
def forward(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Encode the vector and visual observations of a mini batch, then update the
    encoder's normalization with the batch's vector observations.

    :param mini_batch: Buffer holding "vector_obs" and "visual_obs<i>" fields.
    :return: The first output of the encoder's forward pass.
    """
    visual_count = len(self._encoder.visual_processors)
    vector_input = ModelUtils.list_to_tensor(
        mini_batch["vector_obs"], dtype=torch.float
    )
    visual_inputs = [
        ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
        for i in range(visual_count)
    ]
    hidden, _ = self._encoder.forward(
        vec_inputs=[vector_input], vis_inputs=visual_inputs
    )
    # Normalization is updated after the forward pass, from the raw buffer field.
    self._encoder.update_normalization(torch.tensor(mini_batch["vector_obs"]))
    return hidden
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.

    Both actors are moved to the default device, a single-agent decision step is
    generated from policy1's behavior spec, and the `all_discrete_tensor` log
    probs returned by each policy's sample_actions are required to be equal.
    """
    for policy in (policy1, policy2):
        policy.actor = policy.actor.to(default_device())

    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    agent_ids = list(decision_step.agent_id)
    memories = torch.as_tensor(policy1.retrieve_memories(agent_ids)).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    # The same observations, masks and memories are fed to both policies.
    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(
            tensor_obs, masks=masks, memories=memories
        )
        _, log_probs2, _, _ = policy2.sample_actions(
            tensor_obs, masks=masks, memories=memories
        )

    first = ModelUtils.to_numpy(log_probs1.all_discrete_tensor)
    second = ModelUtils.to_numpy(log_probs2.all_discrete_tensor)
    np.testing.assert_array_equal(first, second)
def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Extracts the current state embedding from a mini_batch.

    :param mini_batch: Buffer holding "vector_obs" and "visual_obs<i>" fields.
    :return: The first output of the state encoder's forward pass.
    """
    visual_count = len(self._state_encoder.visual_processors)
    vector_input = ModelUtils.list_to_tensor(
        mini_batch["vector_obs"], dtype=torch.float
    )
    visual_inputs = [
        ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
        for i in range(visual_count)
    ]
    hidden, _ = self._state_encoder.forward(
        vec_inputs=[vector_input], vis_inputs=visual_inputs
    )
    return hidden
def get_trajectory_value_estimates(
    self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
    """
    Run the critic over a trajectory and over the observation that follows it.

    :param batch: Buffer holding the trajectory's "vector_obs" and
        "visual_obs<i>" fields.
    :param next_obs: Observations following the trajectory; split into vector
        and visual parts with SplitObservations.
    :param done: If True, the next value estimate is set to 0.0 for every reward
        signal whose `ignore_done` is falsy.
    :return: Tuple of (per-signal value estimates for the trajectory, per-signal
        value estimate for next_obs), both converted with ModelUtils.to_numpy.
    """
    vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
    visual_obs = []
    if self.policy.use_vis_obs:
        visual_obs = [
            ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
            for idx, _ in enumerate(
                self.policy.actor_critic.network_body.visual_processors
            )
        ]

    memory = torch.zeros([1, 1, self.policy.m_size])

    # Split next_obs and add a leading dimension of size 1 to each tensor.
    split_next = SplitObservations.from_observations(next_obs)
    next_vec_obs = [
        ModelUtils.list_to_tensor(split_next.vector_observations).unsqueeze(0)
    ]
    next_vis_obs = [
        ModelUtils.list_to_tensor(vis).unsqueeze(0)
        for vis in split_next.visual_observations
    ]

    critic_pass = self.policy.actor_critic.critic_pass
    value_estimates, next_memory = critic_pass(
        vector_obs, visual_obs, memory, sequence_length=batch.num_experiences
    )
    # The memory returned by the first pass seeds the pass over next_obs.
    next_value_estimate, _ = critic_pass(
        next_vec_obs, next_vis_obs, next_memory, sequence_length=1
    )

    for name in list(value_estimates):
        value_estimates[name] = ModelUtils.to_numpy(value_estimates[name])
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        for signal_name in next_value_estimate:
            if self.reward_signals[signal_name].ignore_done:
                continue
            next_value_estimate[signal_name] = 0.0

    return value_estimates, next_value_estimate
def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
    """
    A static method that builds an AgentAction from the "continuous_action" and
    "discrete_action" entries of buff. An entry that is absent leaves the
    corresponding AgentAction argument as None.
    """
    continuous: torch.Tensor = None
    discrete: List[torch.Tensor] = None  # type: ignore

    if "continuous_action" in buff:
        continuous = ModelUtils.list_to_tensor(buff["continuous_action"])

    if "discrete_action" in buff:
        stacked = ModelUtils.list_to_tensor(buff["discrete_action"], dtype=torch.long)
        # Split the last axis into a list of per-index tensors.
        last_dim = stacked.shape[-1]
        discrete = [stacked[..., idx] for idx in range(last_dim)]

    return AgentAction(continuous, discrete)
def get_state_inputs(
    self, mini_batch: AgentBuffer
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    """
    Creates the observation input.

    :param mini_batch: Buffer holding "vector_obs" and "visual_obs<i>" fields.
    :return: Tuple of (vector inputs, visual inputs). The vector list is empty
        when the encoder has no vector processors.
    """
    visual_count = len(self.encoder.visual_processors)
    vector_count = len(self.encoder.vector_processors)

    vec_inputs = []
    if vector_count > 0:
        vec_inputs.append(
            ModelUtils.list_to_tensor(mini_batch["vector_obs"], dtype=torch.float)
        )

    vis_inputs = []
    for i in range(visual_count):
        vis_inputs.append(
            ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i], dtype=torch.float)
        )
    return vec_inputs, vis_inputs
def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]:
    """
    Creates the observation input.

    :param mini_batch: Buffer to read the observations from.
    :return: One tensor per encoder processor, in the order returned by
        ObsUtil.from_buffer.
    """
    processor_count = len(self.encoder.processors)
    # Convert to tensors
    return [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(mini_batch, processor_count)
    ]
def test_list_to_tensor():
    """
    ModelUtils.list_to_tensor should match torch.tensor for a nested list, a
    numpy array and a list of numpy arrays.
    """
    raw_values = [[1.0, 2], [1, 3], [1, 4]]

    # Pure nested list: should be equivalent to torch.tensor conversion
    from_list = ModelUtils.list_to_tensor(raw_values)
    assert torch.equal(from_list, torch.tensor(raw_values))

    # Pure numpy array: should be equivalent to torch.tensor conversion
    from_array = ModelUtils.list_to_tensor(np.asarray(raw_values))
    assert torch.equal(from_array, torch.tensor(raw_values))

    # List of numpy arrays: should be equivalent to a float32 torch.tensor
    from_rows = ModelUtils.list_to_tensor([np.asarray(row) for row in raw_values])
    assert torch.equal(from_rows, torch.tensor(raw_values, dtype=torch.float32))
def forward(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Encode the observations of a mini batch, then update the encoder's
    normalization from the same mini batch.

    :param mini_batch: Buffer to read the observations from.
    :return: The first output of the encoder's forward pass.
    """
    processor_count = len(self._encoder.processors)
    # Convert to tensors
    tensor_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(mini_batch, processor_count)
    ]
    hidden, _ = self._encoder.forward(tensor_obs)
    # Normalization is updated after the forward pass.
    self._encoder.update_normalization(mini_batch)
    return hidden
def from_buffer(buff: AgentBuffer) -> "AgentAction":
    """
    A static method that builds an AgentAction from the CONTINUOUS_ACTION and
    DISCRETE_ACTION fields of an AgentBuffer. A field that is absent leaves the
    corresponding AgentAction argument as None.
    """
    continuous: torch.Tensor = None
    discrete: List[torch.Tensor] = None  # type: ignore

    if BufferKey.CONTINUOUS_ACTION in buff:
        continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_ACTION])

    if BufferKey.DISCRETE_ACTION in buff:
        stacked = ModelUtils.list_to_tensor(
            buff[BufferKey.DISCRETE_ACTION], dtype=torch.long
        )
        # Split the last axis into a list of per-index tensors.
        last_dim = stacked.shape[-1]
        discrete = [stacked[..., idx] for idx in range(last_dim)]

    return AgentAction(continuous, discrete)
def _update_batch(
    self, mini_batch_demo: AgentBuffer, n_sequences: int
) -> Dict[str, float]:
    """
    Helper function for update_batch. Samples actions from the policy for a
    demonstration mini batch, computes the behavioral cloning loss against the
    expert actions and takes one optimizer step.

    :param mini_batch_demo: Demonstration mini batch.
    :param n_sequences: NOTE(review): not read by this body, which uses
        self.n_sequences instead - confirm this is intended.
    :return: Dict with the scalar loss under the "loss" key.
    """
    policy = self.policy
    obs_count = len(policy.behavior_spec.observation_specs)
    # Convert to tensors
    tensor_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(mini_batch_demo, obs_count)
    ]

    act_masks = None
    expert_actions = AgentAction.from_buffer(mini_batch_demo)
    if policy.behavior_spec.action_spec.discrete_size > 0:
        # All-ones action mask when there are discrete actions.
        mask_shape = (
            self.n_sequences * policy.sequence_length,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
        act_masks = ModelUtils.list_to_tensor(np.ones(mask_shape, dtype=np.float32))

    memories = []
    if policy.use_recurrent:
        memories = torch.zeros(1, self.n_sequences, policy.m_size)

    selected_actions, log_probs, _, _ = policy.sample_actions(
        tensor_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    bc_loss = self._behavioral_cloning_loss(
        selected_actions, log_probs, expert_actions
    )

    self.optimizer.zero_grad()
    bc_loss.backward()
    self.optimizer.step()

    return {"loss": bc_loss.item()}
def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Uses the current state embedding and the action of the mini_batch to
    predict the next state embedding.

    Continuous actions are used as float tensors; otherwise the actions are
    one-hot encoded per discrete branch and concatenated along dim 1.
    """
    if self._policy_specs.is_action_continuous():
        action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
    else:
        raw_actions = ModelUtils.list_to_tensor(
            mini_batch["actions"], dtype=torch.long
        )
        one_hot_branches = ModelUtils.actions_to_onehot(
            raw_actions, self._policy_specs.discrete_action_branches
        )
        action = torch.cat(one_hot_branches, dim=1)

    current_state = self.get_current_state(mini_batch)
    forward_model_input = torch.cat((current_state, action), dim=1)
    return self.forward_model_next_state_prediction(forward_model_input)
def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Extracts the next state embedding from a mini_batch.

    :param mini_batch: Buffer to read the next observations from.
    :return: The first output of the state encoder's forward pass.
    """
    processor_count = len(self._state_encoder.processors)
    # Convert to tensors
    tensor_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer_next(mini_batch, processor_count)
    ]
    hidden, _ = self._state_encoder.forward(tensor_obs)
    return hidden
def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Computes the loss for the next state prediction.

    The output of compute_reward is split into two partitions by the batch's
    "masks" field, and the mean of partition 1 is returned.
    """
    reward = self.compute_reward(mini_batch)
    loss_masks = ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float)
    partitions = ModelUtils.dynamic_partition(reward, loss_masks, 2)
    return torch.mean(partitions[1])
def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Computes the inverse loss for a mini_batch. Corresponds to the error on the
    action prediction (given the current and next state).

    A squared-error term is added when the action spec has continuous actions
    and a cross-entropy term when it has discrete actions.
    """

    def _partitioned_mean(per_sample: torch.Tensor) -> torch.Tensor:
        # use masks not action_masks
        loss_masks = ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float)
        return torch.mean(ModelUtils.dynamic_partition(per_sample, loss_masks, 2)[1])

    predicted_action = self.predict_action(mini_batch)
    actions = AgentAction.from_dict(mini_batch)
    _inverse_loss = 0

    if self._action_spec.continuous_size > 0:
        error = actions.continuous_tensor - predicted_action.continuous
        sq_difference = torch.sum(error ** 2, dim=1)
        _inverse_loss += _partitioned_mean(sq_difference)

    if self._action_spec.discrete_size > 0:
        one_hot_branches = ModelUtils.actions_to_onehot(
            actions.discrete_tensor, self._action_spec.discrete_branches
        )
        true_action = torch.cat(one_hot_branches, dim=1)
        # EPSILON is added before the log is taken.
        log_prediction = torch.log(predicted_action.discrete + self.EPSILON)
        cross_entropy = torch.sum(-log_prediction * true_action, dim=1)
        _inverse_loss += _partitioned_mean(cross_entropy)

    return _inverse_loss
def test_evaluate_actions(rnn, visual, discrete):
    """
    Build a mock policy, evaluate the buffered actions of a simulated 64-step
    rollout and check the shapes of the log probs, entropy and value outputs.
    """
    num_steps = 64
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(
        num_steps, policy.behavior_spec, memory_size=policy.m_size
    )
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
    agent_action = AgentAction.from_dict(buffer)
    # One visual observation tensor per visual processor of the network body.
    vis_obs = [
        ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        for idx, _ in enumerate(policy.actor_critic.network_body.visual_processors)
    ]
    # Keep one stored memory every `sequence_length` steps.
    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][start])
        for start in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )

    _size = (
        policy.behavior_spec.action_spec.discrete_size
        if discrete
        else policy.behavior_spec.action_spec.continuous_size
    )
    assert log_probs.flatten().shape == (num_steps, _size)
    assert entropy.shape == (num_steps,)
    for val in values.values():
        assert val.shape == (num_steps,)
def from_buffer(buff: AgentBuffer) -> "ActionLogProbs":
    """
    A static method that builds an ActionLogProbs from the CONTINUOUS_LOG_PROBS
    and DISCRETE_LOG_PROBS fields of an AgentBuffer. A field that is absent
    leaves the corresponding ActionLogProbs argument as None.
    """
    continuous: torch.Tensor = None
    discrete: List[torch.Tensor] = None  # type: ignore

    if BufferKey.CONTINUOUS_LOG_PROBS in buff:
        continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_LOG_PROBS])

    if BufferKey.DISCRETE_LOG_PROBS in buff:
        stacked = ModelUtils.list_to_tensor(buff[BufferKey.DISCRETE_LOG_PROBS])
        # When dimension 1 is empty the discrete list stays None, which
        # enables flatten().
        if stacked.shape[1] > 0:
            last_dim = stacked.shape[-1]
            discrete = [stacked[..., idx] for idx in range(last_dim)]

    return ActionLogProbs(continuous, discrete, None)
def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
    """
    A static method that builds an ActionLogProbs from the
    "continuous_log_probs" and "discrete_log_probs" entries of buff. An entry
    that is absent leaves the corresponding ActionLogProbs argument as None.
    """
    continuous: torch.Tensor = None
    discrete: List[torch.Tensor] = None  # type: ignore

    if "continuous_log_probs" in buff:
        continuous = ModelUtils.list_to_tensor(buff["continuous_log_probs"])

    if "discrete_log_probs" in buff:
        stacked = ModelUtils.list_to_tensor(buff["discrete_log_probs"])
        # When dimension 1 is empty the discrete list stays None, which
        # enables flatten().
        if stacked.shape[1] > 0:
            last_dim = stacked.shape[-1]
            discrete = [stacked[..., idx] for idx in range(last_dim)]

    return ActionLogProbs(continuous, discrete, None)
def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Computes the inverse loss for a mini_batch. Corresponds to the error on the
    action prediction (given the current and next state).

    Continuous actions use a summed squared error; otherwise a cross entropy
    against the one-hot encoded actions is used.
    """

    def _partitioned_mean(per_sample: torch.Tensor) -> torch.Tensor:
        # use masks not action_masks
        loss_masks = ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float)
        return torch.mean(ModelUtils.dynamic_partition(per_sample, loss_masks, 2)[1])

    predicted_action = self.predict_action(mini_batch)

    if self._policy_specs.is_action_continuous():
        taken_action = ModelUtils.list_to_tensor(
            mini_batch["actions"], dtype=torch.float
        )
        sq_difference = torch.sum((taken_action - predicted_action) ** 2, dim=1)
        return _partitioned_mean(sq_difference)

    raw_actions = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long)
    one_hot_branches = ModelUtils.actions_to_onehot(
        raw_actions, self._policy_specs.discrete_action_branches
    )
    true_action = torch.cat(one_hot_branches, dim=1)
    # EPSILON is added before the log is taken.
    log_prediction = torch.log(predicted_action + self.EPSILON)
    cross_entropy = torch.sum(-log_prediction * true_action, dim=1)
    return _partitioned_mean(cross_entropy)
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Performs update on model.
    :param batch: Batch of experiences.
    :param num_sequences: Number of sequences to process.
    :return: Results of update.
    """
    policy = self.policy

    # Get decayed parameters
    decay_lr = self.decay_learning_rate.get_value(policy.get_current_step())
    decay_eps = self.decay_epsilon.get_value(policy.get_current_step())
    decay_bet = self.decay_beta.get_value(policy.get_current_step())

    # Per reward signal: stored value estimates and returns
    old_values = {}
    returns = {}
    for name in self.reward_signals:
        old_values[name] = ModelUtils.list_to_tensor(batch[f"{name}_value_estimates"])
        returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])

    vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
    if policy.use_continuous_act:
        actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
    else:
        actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)

    # Keep one stored memory every `sequence_length` steps.
    memories = [
        ModelUtils.list_to_tensor(batch["memory"][start])
        for start in range(0, len(batch["memory"]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)

    vis_obs = []
    if policy.use_vis_obs:
        vis_obs = [
            ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
            for idx, _ in enumerate(policy.actor_critic.network_body.visual_encoders)
        ]

    log_probs, entropy, values = policy.evaluate_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=policy.sequence_length,
    )

    loss_masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
    value_loss = self.ppo_value_loss(
        values, old_values, returns, decay_eps, loss_masks
    )
    advantages = ModelUtils.list_to_tensor(batch["advantages"])
    old_action_probs = ModelUtils.list_to_tensor(batch["action_probs"])
    policy_loss = self.ppo_policy_loss(
        advantages, log_probs, old_action_probs, loss_masks
    )
    entropy_term = decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
    loss = policy_loss + 0.5 * value_loss - entropy_term

    # Set optimizer learning rate
    ModelUtils.update_learning_rate(self.optimizer, decay_lr)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    update_stats = {
        "Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()),
        "Losses/Value Loss": value_loss.detach().cpu().numpy(),
        "Policy/Learning Rate": decay_lr,
        "Policy/Epsilon": decay_eps,
        "Policy/Beta": decay_bet,
    }
    # Each reward provider is updated on the same batch and adds its own stats.
    for reward_provider in self.reward_signals.values():
        update_stats.update(reward_provider.update(batch))

    return update_stats
def get_trajectory_and_baseline_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    next_groupmate_obs: List[List[np.ndarray]],
    done: bool,
    agent_id: str = "",
) -> Tuple[
    Dict[str, np.ndarray],
    Dict[str, np.ndarray],
    Dict[str, float],
    Optional[AgentBufferField],
    Optional[AgentBufferField],
]:
    """
    Get value estimates, baseline estimates, and memories for a trajectory, in batch form.
    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for bootstrapping
        if this is not a terminal trajectory.
    :param next_groupmate_obs: the next observations from other members of the group.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
        the baseline estimates as a Dict, the final value estimate as a Dict of [name, float],
        and optionally (if using memories) an AgentBufferField of initial critic and baseline
        memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    current_obs = ObsUtil.from_buffer(batch, n_obs)
    groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)

    # Convert the agent's own and the groupmates' observations to tensors
    current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
    groupmate_obs = [[
        ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs
    ] for _groupmate_obs in groupmate_obs]

    groupmate_actions = AgentAction.group_from_buffer(batch)

    # Convert the next observations to tensors and add a leading dimension of size 1
    next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
    next_obs = [obs.unsqueeze(0) for obs in next_obs]

    next_groupmate_obs = [
        ModelUtils.list_to_tensor_list(_list_obs)
        for _list_obs in next_groupmate_obs
    ]
    # Expand dimensions of next critic obs
    next_groupmate_obs = [[_obs.unsqueeze(0) for _obs in _list_obs]
                          for _list_obs in next_groupmate_obs]

    if agent_id in self.value_memory_dict:
        # The agent_id should always be in both since they are added together
        _init_value_mem = self.value_memory_dict[agent_id]
        _init_baseline_mem = self.baseline_memory_dict[agent_id]
    else:
        # No stored memory for this agent: start from zeros when recurrent, else None
        _init_value_mem = (torch.zeros((1, 1, self.critic.memory_size))
                           if self.policy.use_recurrent else None)
        _init_baseline_mem = (torch.zeros((1, 1, self.critic.memory_size))
                              if self.policy.use_recurrent else None)

    # NOTE(review): groupmate_obs is built by a list comprehension above, so it is
    # never None at this point and the else-arm looks unreachable - confirm.
    all_obs = ([current_obs] + groupmate_obs
               if groupmate_obs is not None else [current_obs])
    all_next_value_mem: Optional[AgentBufferField] = None
    all_next_baseline_mem: Optional[AgentBufferField] = None
    with torch.no_grad():
        if self.policy.use_recurrent:
            # Recurrent case: the helper also returns the per-step memories
            # that are handed back to the caller.
            (
                value_estimates,
                baseline_estimates,
                all_next_value_mem,
                all_next_baseline_mem,
                next_value_mem,
                next_baseline_mem,
            ) = self._evaluate_by_sequence_team(
                current_obs,
                groupmate_obs,
                groupmate_actions,
                _init_value_mem,
                _init_baseline_mem,
            )
        else:
            # Non-recurrent case: one pass over the whole trajectory
            value_estimates, next_value_mem = self.critic.critic_pass(
                all_obs,
                _init_value_mem,
                sequence_length=batch.num_experiences)
            groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
            baseline_estimates, next_baseline_mem = self.critic.baseline(
                current_obs,
                groupmate_obs_and_actions,
                _init_baseline_mem,
                sequence_length=batch.num_experiences,
            )
    # Store the memory for the next trajectory
    self.value_memory_dict[agent_id] = next_value_mem
    self.baseline_memory_dict[agent_id] = next_baseline_mem

    # NOTE(review): next_groupmate_obs was rebuilt as a list above, so it is
    # never None here either - confirm.
    all_next_obs = ([next_obs] + next_groupmate_obs
                    if next_groupmate_obs is not None else [next_obs])

    # Value of the observation following the trajectory, seeded with the
    # memory produced by the trajectory pass
    next_value_estimates, _ = self.critic.critic_pass(all_next_obs,
                                                      next_value_mem,
                                                      sequence_length=1)

    for name, estimate in baseline_estimates.items():
        baseline_estimates[name] = ModelUtils.to_numpy(estimate)

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)

    # the baseline and V should not be on the same done flag
    for name, estimate in next_value_estimates.items():
        next_value_estimates[name] = ModelUtils.to_numpy(estimate)

    if done:
        for k in next_value_estimates:
            if not self.reward_signals[k].ignore_done:
                # NOTE(review): this indexes into the estimate, so the values are
                # arrays rather than the floats the annotation declares - confirm.
                next_value_estimates[k][-1] = 0.0

    return (
        value_estimates,
        baseline_estimates,
        next_value_estimates,
        all_next_value_mem,
        all_next_baseline_mem,
    )
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Performs update on model.
    :param batch: Batch of experiences.
    :param num_sequences: Number of sequences to process.
    :return: Results of update.
    """
    policy = self.policy

    # Get decayed parameters
    decay_lr = self.decay_learning_rate.get_value(policy.get_current_step())
    decay_eps = self.decay_epsilon.get_value(policy.get_current_step())
    decay_bet = self.decay_beta.get_value(policy.get_current_step())

    # Per reward signal: stored value estimates and returns
    old_values = {}
    returns = {}
    for name in self.reward_signals:
        old_values[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.value_estimates_key(name)]
        )
        returns[name] = ModelUtils.list_to_tensor(
            batch[RewardSignalUtil.returns_key(name)]
        )

    obs_count = len(policy.behavior_spec.observation_specs)
    # Convert to tensors
    current_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer(batch, obs_count)
    ]

    act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
    actions = AgentAction.from_buffer(batch)

    # Keep one stored policy memory every `sequence_length` steps.
    memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][start])
        for start in range(0, len(batch[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if memories:
        memories = torch.stack(memories).unsqueeze(0)

    # Get value memories
    value_memories = [
        ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][start])
        for start in range(
            0, len(batch[BufferKey.CRITIC_MEMORY]), policy.sequence_length
        )
    ]
    if value_memories:
        value_memories = torch.stack(value_memories).unsqueeze(0)

    log_probs, entropy = policy.evaluate_actions(
        current_obs,
        masks=act_masks,
        actions=actions,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    values, _ = self.critic.critic_pass(
        current_obs,
        memories=value_memories,
        sequence_length=policy.sequence_length,
    )

    old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
    log_probs = log_probs.flatten()
    loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool)

    value_loss = self.ppo_value_loss(
        values, old_values, returns, decay_eps, loss_masks
    )
    advantages = ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES])
    policy_loss = self.ppo_policy_loss(
        advantages, log_probs, old_log_probs, loss_masks
    )
    entropy_term = decay_bet * ModelUtils.masked_mean(entropy, loss_masks)
    loss = policy_loss + 0.5 * value_loss - entropy_term

    # Set optimizer learning rate
    ModelUtils.update_learning_rate(self.optimizer, decay_lr)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    update_stats = {
        # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
        # TODO: After PyTorch is default, change to something more correct.
        "Losses/Policy Loss": torch.abs(policy_loss).item(),
        "Losses/Value Loss": value_loss.item(),
        "Policy/Learning Rate": decay_lr,
        "Policy/Epsilon": decay_eps,
        "Policy/Beta": decay_bet,
    }
    # Each reward provider is updated on the same batch and adds its own stats.
    for reward_provider in self.reward_signals.values():
        update_stats.update(reward_provider.update(batch))

    return update_stats
def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
    """
    Run one SAC update on a mini-batch.

    Steps the policy, value and entropy optimizers in that order, then
    soft-updates the target network from the policy's critic.

    :param batch: Experience mini-batch.
    :param num_sequences: Number of trajectories in batch. Not read by this
        method.
    :return: Dict of update statistics: policy/value/Q1/Q2 losses, the mean
        discrete and continuous entropy coefficients, and the learning rate.
    """
    policy = self.policy
    seq_len = policy.sequence_length

    rewards = {
        name: ModelUtils.list_to_tensor(batch[f"{name}_rewards"])
        for name in self.reward_signals
    }

    # Convert current and next observations, masks and actions to tensors.
    n_obs = len(policy.behavior_spec.sensor_specs)
    current_obs = [
        ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
    ]
    next_obs = [
        ModelUtils.list_to_tensor(obs)
        for obs in ObsUtil.from_buffer_next(batch, n_obs)
    ]
    act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
    actions = AgentAction.from_dict(batch)

    stored_memories = batch["memory"]
    memories_list = [
        ModelUtils.list_to_tensor(stored_memories[i])
        for i in range(0, len(stored_memories), seq_len)
    ]
    # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
    offset = 1 if seq_len > 1 else 0
    next_memories_list = [
        # only pass value part of memory to target network
        ModelUtils.list_to_tensor(stored_memories[i][policy.m_size // 2:])
        for i in range(offset, len(stored_memories), seq_len)
    ]

    memories = None
    next_memories = None
    if memories_list:
        memories = torch.stack(memories_list).unsqueeze(0)
        next_memories = torch.stack(next_memories_list).unsqueeze(0)

    # Q network memories are 0'ed out, since we don't have them during inference.
    q_memories = None if next_memories is None else torch.zeros_like(next_memories)

    # Copy normalizers from policy into both Q networks and the target network.
    policy_body = policy.actor_critic.network_body
    for network_body in (
        self.value_network.q1_network.network_body,
        self.value_network.q2_network.network_body,
        self.target_network.network_body,
    ):
        network_body.copy_normalization(policy_body)

    (
        sampled_actions,
        log_probs,
        _,
        value_estimates,
        _,
    ) = policy.actor_critic.get_action_stats_and_value(
        current_obs, masks=act_masks, memories=memories, sequence_length=seq_len
    )

    # Q values for freshly sampled actions (q2_grad=False) and for the
    # actions stored in the batch.
    q1p_out, q2p_out = self.value_network(
        current_obs,
        sampled_actions.continuous_tensor,
        memories=q_memories,
        sequence_length=seq_len,
        q2_grad=False,
    )
    q1_out, q2_out = self.value_network(
        current_obs,
        actions.continuous_tensor,
        memories=q_memories,
        sequence_length=seq_len,
    )

    q1_stream, q2_stream = q1_out, q2_out
    if self._action_spec.discrete_size > 0:
        disc_actions = actions.discrete_tensor
        q1_stream = self._condense_q_streams(q1_out, disc_actions)
        q2_stream = self._condense_q_streams(q2_out, disc_actions)

    with torch.no_grad():
        target_values, _ = self.target_network(
            next_obs, memories=next_memories, sequence_length=seq_len
        )

    masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
    dones = ModelUtils.list_to_tensor(batch["done"])

    q1_loss, q2_loss = self.sac_q_loss(
        q1_stream, q2_stream, target_values, dones, rewards, masks
    )
    value_loss = self.sac_value_loss(
        log_probs, value_estimates, q1p_out, q2p_out, masks
    )
    policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
    entropy_loss = self.sac_entropy_loss(log_probs, masks)
    total_value_loss = q1_loss + q2_loss + value_loss

    decay_lr = self.decay_learning_rate.get_value(policy.get_current_step())

    # Same learning rate for all three optimizers; the order below matters
    # and mirrors policy -> value -> entropy.
    optimizer_steps = (
        (self.policy_optimizer, policy_loss),
        (self.value_optimizer, total_value_loss),
        (self.entropy_optimizer, entropy_loss),
    )
    for optimizer, loss in optimizer_steps:
        ModelUtils.update_learning_rate(optimizer, decay_lr)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Update target network
    ModelUtils.soft_update(policy.actor_critic.critic, self.target_network, self.tau)

    discrete_coef = torch.mean(torch.exp(self._log_ent_coef.discrete)).item()
    continuous_coef = torch.mean(torch.exp(self._log_ent_coef.continuous)).item()
    update_stats = {
        "Losses/Policy Loss": policy_loss.item(),
        "Losses/Value Loss": value_loss.item(),
        "Losses/Q1 Loss": q1_loss.item(),
        "Losses/Q2 Loss": q2_loss.item(),
        "Policy/Discrete Entropy Coeff": discrete_coef,
        "Policy/Continuous Entropy Coeff": continuous_coef,
        "Policy/Learning Rate": decay_lr,
    }

    return update_stats
def get_trajectory_value_estimates(
    self,
    batch: AgentBuffer,
    next_obs: List[np.ndarray],
    done: bool,
    agent_id: str = "",
) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
    """
    Get value estimates and memories for a trajectory, in batch form.

    :param batch: An AgentBuffer that consists of a trajectory.
    :param next_obs: the next observation (after the trajectory). Used for
        bootstrapping if this is not a terminal trajectory.
    :param done: Set true if this is a terminal trajectory.
    :param agent_id: Agent ID of the agent that this trajectory belongs to.
    :returns: A Tuple of the Value Estimates as a Dict of
        [name, np.ndarray(trajectory_len)], the final value estimate as a
        Dict of [name, float], and optionally (if using memories) an
        AgentBufferField of initial critic memories to be used during update.
    """
    n_obs = len(self.policy.behavior_spec.observation_specs)

    # Resume from the critic memory saved for this agent, if any; otherwise
    # start from zeros (recurrent policy) or no memory at all.
    if agent_id in self.critic_memory_dict:
        memory = self.critic_memory_dict[agent_id]
    else:
        memory = (
            torch.zeros((1, 1, self.critic.memory_size))
            if self.policy.use_recurrent
            else None
        )

    # Convert to tensors
    current_obs = [
        ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
    ]
    # The single next observation gets a leading dimension of size 1.
    next_obs = [ModelUtils.list_to_tensor(obs).unsqueeze(0) for obs in next_obs]

    # If we're using LSTM, we want to get all the intermediate memories.
    all_next_memories: Optional[AgentBufferField] = None

    # To prevent memory leak and improve performance, evaluate with no_grad.
    with torch.no_grad():
        if self.policy.use_recurrent:
            (
                value_estimates,
                all_next_memories,
                next_memory,
            ) = self._evaluate_by_sequence(current_obs, memory)
        else:
            value_estimates, next_memory = self.critic.critic_pass(
                current_obs, memory, sequence_length=batch.num_experiences
            )

    # Store the memory for the next trajectory. This should NOT have a gradient.
    self.critic_memory_dict[agent_id] = next_memory

    # The bootstrap estimate is only converted to NumPy and returned below,
    # so it does not need an autograd graph either.
    with torch.no_grad():
        next_value_estimate, _ = self.critic.critic_pass(
            next_obs, next_memory, sequence_length=1
        )

    for name, estimate in value_estimates.items():
        value_estimates[name] = ModelUtils.to_numpy(estimate)
        next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

    if done:
        # Terminal trajectory: zero the bootstrap value for every reward
        # signal that does not ignore `done`.
        for k in next_value_estimate:
            if not self.reward_signals[k].ignore_done:
                next_value_estimate[k] = 0.0
        # Drop the memory stored above so it is not carried into a later
        # trajectory for the same agent ID.
        self.critic_memory_dict.pop(agent_id, None)

    return value_estimates, next_value_estimate, all_next_memories