def test_actions_to_onehot():
    all_actions = torch.tensor([[1, 0, 2], [1, 0, 2]])
    action_size = [2, 1, 3]
    oh_actions = ModelUtils.actions_to_onehot(all_actions, action_size)
    expected_result = [
        torch.tensor([[0, 1], [0, 1]], dtype=torch.float),
        torch.tensor([[1], [1]], dtype=torch.float),
        torch.tensor([[0, 0, 1], [0, 0, 1]], dtype=torch.float),
    ]
    for res, exp in zip(oh_actions, expected_result):
        assert torch.equal(res, exp)
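# For reference, a minimal sketch of what ModelUtils.actions_to_onehot does,
# based on the behavior the test above asserts. The name
# actions_to_onehot_sketch is hypothetical; this is an illustration, not
# necessarily the exact ml-agents implementation.
import torch
from typing import List


def actions_to_onehot_sketch(
    discrete_actions: torch.Tensor, action_size: List[int]
) -> List[torch.Tensor]:
    # One (batch, branch_size) float tensor per discrete branch.
    return [
        torch.nn.functional.one_hot(branch.long(), action_size[i]).float()
        for i, branch in enumerate(discrete_actions.T)
    ]


# Reproduces the first expected branch from the test above:
acts = torch.tensor([[1, 0, 2], [1, 0, 2]])
assert torch.equal(
    actions_to_onehot_sketch(acts, [2, 1, 3])[0],
    torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
)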
def _condense_q_streams(
    self, q_output: Dict[str, torch.Tensor], discrete_actions: torch.Tensor
) -> Dict[str, torch.Tensor]:
    condensed_q_output = {}
    onehot_actions = ModelUtils.actions_to_onehot(discrete_actions, self.act_size)
    for key, item in q_output.items():
        branched_q = ModelUtils.break_into_branches(item, self.act_size)
        only_action_qs = torch.stack(
            [
                torch.sum(_act * _q, dim=1, keepdim=True)
                for _act, _q in zip(onehot_actions, branched_q)
            ]
        )
        condensed_q_output[key] = torch.mean(only_action_qs, dim=0)
    return condensed_q_output
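# ModelUtils.break_into_branches, used above, slices a concatenated Q/logit
# tensor back into per-branch tensors; multiplying each branch by its one-hot
# action then selects the Q-value of the taken action, which
# _condense_q_streams averages. A minimal sketch, assuming branches are laid
# out contiguously along dim 1 (break_into_branches_sketch is a hypothetical
# stand-in, not the ml-agents source):
import numpy as np
import torch
from typing import List


def break_into_branches_sketch(
    concatenated: torch.Tensor, action_size: List[int]
) -> List[torch.Tensor]:
    idx = [0] + list(np.cumsum(action_size))
    return [concatenated[:, idx[i] : idx[i + 1]] for i in range(len(action_size))]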
def to_flat(self, discrete_branches: List[int]) -> torch.Tensor:
    """
    Flatten this AgentAction into a single torch Tensor of dimension
    (batch, num_continuous + num_one_hot_discrete). Discrete actions are
    converted into one-hot tensors and concatenated with the continuous actions.
    :param discrete_branches: List of sizes for discrete actions.
    :return: Tensor of flattened actions.
    """
    # If there are any discrete actions, convert them to one-hot.
    if self.discrete_list is not None and len(self.discrete_list) > 0:
        discrete_oh = ModelUtils.actions_to_onehot(
            self.discrete_tensor, discrete_branches
        )
        discrete_oh = torch.cat(discrete_oh, dim=1)
    else:
        # Use an empty tensor with a matching batch dimension so the
        # concatenation below stays well-formed.
        discrete_oh = torch.empty(self.continuous_tensor.shape[0], 0)
    return torch.cat([self.continuous_tensor, discrete_oh], dim=-1)
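# A concrete, dependency-free illustration of the flattening to_flat performs:
# two continuous dims plus one discrete branch of size 3, batch of 2. All
# values here are made up for the example.
import torch

continuous = torch.tensor([[0.5, -0.5], [1.0, 0.0]])
discrete = torch.tensor([[2], [0]])  # one branch, indices into 3 choices
one_hot = torch.nn.functional.one_hot(discrete.squeeze(1), 3).float()
flat = torch.cat([continuous, one_hot], dim=-1)
assert flat.shape == (2, 2 + 3)  # (batch, num_continuous + num_one_hot_discrete)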
def test_predict_minimum_training():
    # of 5 numbers, predict index of min
    np.random.seed(1336)
    torch.manual_seed(1336)
    n_k = 5
    size = n_k + 1
    embedding_size = 64
    entity_embeddings = EntityEmbeddings(
        size, [size], embedding_size, [n_k], concat_self=False
    )
    transformer = ResidualSelfAttention(embedding_size)
    l_layer = LinearEncoder(embedding_size, 2, n_k)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        list(entity_embeddings.parameters())
        + list(transformer.parameters())
        + list(l_layer.parameters()),
        lr=0.001,
        weight_decay=1e-6,
    )
    batch_size = 200
    # torch.arange avoids the deprecated, float-valued torch.range
    onehots = ModelUtils.actions_to_onehot(torch.arange(0, n_k).unsqueeze(1), [n_k])[0]
    onehots = onehots.expand((batch_size, -1, -1))

    losses = []
    for _ in range(400):
        num = np.random.randint(0, n_k)
        inp = torch.rand((batch_size, num + 1, 1))
        with torch.no_grad():
            # create the target: the index of the minimum
            argmin = torch.argmin(inp, dim=1)
            argmin = argmin.squeeze()
            argmin = argmin.detach()
        sliced_oh = onehots[:, : num + 1]
        inp = torch.cat([inp, sliced_oh], dim=2)

        embeddings = entity_embeddings(inp, [inp])
        masks = EntityEmbeddings.get_masks([inp])
        prediction = transformer(embeddings, masks)
        prediction = l_layer(prediction)
        ce = loss(prediction, argmin)
        losses.append(ce.item())
        print(ce.item())
        optimizer.zero_grad()
        ce.backward()
        optimizer.step()
    assert np.array(losses[-20:]).mean() < 0.1
def forward(self, action: AgentAction) -> torch.Tensor:
    """
    Returns a tensor corresponding to the flattened action.
    :param action: An AgentAction object
    """
    action_list: List[torch.Tensor] = []
    if self._specs.continuous_size > 0:
        action_list.append(action.continuous_tensor)
    if self._specs.discrete_size > 0:
        flat_discrete = torch.cat(
            ModelUtils.actions_to_onehot(
                torch.as_tensor(action.discrete_tensor, dtype=torch.long),
                self._specs.discrete_branches,
            ),
            dim=1,
        )
        action_list.append(flat_discrete)
    return torch.cat(action_list, dim=1)
def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Uses the current state embedding and the action of the mini_batch to predict
    the next state embedding.
    """
    if self._policy_specs.is_action_continuous():
        action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
    else:
        action = torch.cat(
            ModelUtils.actions_to_onehot(
                ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
                self._policy_specs.discrete_action_branches,
            ),
            dim=1,
        )
    forward_model_input = torch.cat(
        (self.get_current_state(mini_batch), action), dim=1
    )
    return self.forward_model_next_state_prediction(forward_model_input)
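# A small self-contained illustration of the forward-model input assembled
# above: the current state embedding concatenated with the one-hot action
# along dim 1. The sizes are assumptions for the example.
import torch

state_embedding = torch.randn(8, 64)          # (batch, embedding_size)
discrete_actions = torch.randint(0, 3, (8,))  # one branch with 3 choices
action_onehot = torch.nn.functional.one_hot(discrete_actions, 3).float()
forward_model_input = torch.cat((state_embedding, action_onehot), dim=1)
assert forward_model_input.shape == (8, 64 + 3)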
def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Computes the inverse loss for a mini_batch. Corresponds to the error on the
    action prediction (given the current and next state).
    """
    predicted_action = self.predict_action(mini_batch)
    actions = AgentAction.from_dict(mini_batch)
    _inverse_loss = 0
    if self._action_spec.continuous_size > 0:
        sq_difference = (
            actions.continuous_tensor - predicted_action.continuous
        ) ** 2
        sq_difference = torch.sum(sq_difference, dim=1)
        _inverse_loss += torch.mean(
            ModelUtils.dynamic_partition(
                sq_difference,
                ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
                2,
            )[1]
        )
    if self._action_spec.discrete_size > 0:
        true_action = torch.cat(
            ModelUtils.actions_to_onehot(
                actions.discrete_tensor, self._action_spec.discrete_branches
            ),
            dim=1,
        )
        cross_entropy = torch.sum(
            -torch.log(predicted_action.discrete + self.EPSILON) * true_action,
            dim=1,
        )
        _inverse_loss += torch.mean(
            ModelUtils.dynamic_partition(
                cross_entropy,
                ModelUtils.list_to_tensor(
                    mini_batch["masks"], dtype=torch.float
                ),  # use masks not action_masks
                2,
            )[1]
        )
    return _inverse_loss
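# ModelUtils.dynamic_partition, used above to keep only unmasked timesteps,
# mirrors TensorFlow's tf.dynamic_partition. A minimal sketch
# (dynamic_partition_sketch is a hypothetical stand-in, not the ml-agents
# source):
import torch
from typing import List


def dynamic_partition_sketch(
    data: torch.Tensor, partitions: torch.Tensor, num_partitions: int
) -> List[torch.Tensor]:
    # Partition i collects the entries of data whose label equals i.
    return [data[partitions.long() == i] for i in range(num_partitions)]


# With 0/1 masks, partition index 1 keeps only the active (mask == 1) entries:
values = torch.tensor([10.0, 20.0, 30.0])
masks = torch.tensor([1.0, 0.0, 1.0])
assert torch.equal(
    dynamic_partition_sketch(values, masks, 2)[1], torch.tensor([10.0, 30.0])
)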
def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
    """
    Computes the inverse loss for a mini_batch. Corresponds to the error on the
    action prediction (given the current and next state).
    """
    predicted_action = self.predict_action(mini_batch)
    if self._policy_specs.is_action_continuous():
        sq_difference = (
            ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
            - predicted_action
        ) ** 2
        sq_difference = torch.sum(sq_difference, dim=1)
        return torch.mean(
            ModelUtils.dynamic_partition(
                sq_difference,
                ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
                2,
            )[1]
        )
    else:
        true_action = torch.cat(
            ModelUtils.actions_to_onehot(
                ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
                self._policy_specs.discrete_action_branches,
            ),
            dim=1,
        )
        cross_entropy = torch.sum(
            -torch.log(predicted_action + self.EPSILON) * true_action, dim=1
        )
        return torch.mean(
            ModelUtils.dynamic_partition(
                cross_entropy,
                ModelUtils.list_to_tensor(
                    mini_batch["masks"], dtype=torch.float
                ),  # use masks not action_masks
                2,
            )[1]
        )
def _update_batch(
    self, mini_batch_demo: Dict[str, np.ndarray], n_sequences: int
) -> Dict[str, float]:
    """
    Helper function for update_batch.
    """
    vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
    act_masks = None
    if self.policy.use_continuous_act:
        expert_actions = ModelUtils.list_to_tensor(mini_batch_demo["actions"])
    else:
        raw_expert_actions = ModelUtils.list_to_tensor(
            mini_batch_demo["actions"], dtype=torch.long
        )
        expert_actions = ModelUtils.actions_to_onehot(
            raw_expert_actions, self.policy.act_size
        )
        act_masks = ModelUtils.list_to_tensor(
            np.ones(
                (
                    self.n_sequences * self.policy.sequence_length,
                    sum(self.policy.behavior_spec.action_spec.discrete_branches),
                ),
                dtype=np.float32,
            )
        )

    memories = []
    if self.policy.use_recurrent:
        memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

    if self.policy.use_vis_obs:
        vis_obs = []
        for idx, _ in enumerate(
            self.policy.actor_critic.network_body.visual_processors
        ):
            vis_ob = ModelUtils.list_to_tensor(mini_batch_demo["visual_obs%d" % idx])
            vis_obs.append(vis_ob)
    else:
        vis_obs = []

    (
        selected_actions,
        clipped_actions,
        all_log_probs,
        _,
        _,
    ) = self.policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=self.policy.sequence_length,
        all_log_probs=True,
    )
    bc_loss = self._behavioral_cloning_loss(
        clipped_actions, all_log_probs, expert_actions
    )
    self.optimizer.zero_grad()
    bc_loss.backward()
    self.optimizer.step()
    run_out = {"loss": bc_loss.item()}
    return run_out
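# A dependency-free sketch of the discrete case of a behavioral-cloning loss
# like the _behavioral_cloning_loss call above: cross-entropy between the
# policy's log-probabilities and the expert's one-hot actions. Shapes and the
# exact reduction are assumptions for illustration, not the ml-agents source.
import torch

batch, branch_size = 4, 3
log_probs = torch.log_softmax(torch.randn(batch, branch_size), dim=1)
expert_onehot = torch.nn.functional.one_hot(
    torch.randint(0, branch_size, (batch,)), branch_size
).float()
bc_loss = torch.mean(torch.sum(-log_probs * expert_onehot, dim=1))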