Example #1
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])

    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (sampled_actions, log_probs, entropies, memories) = policy.sample_actions(
        tensor_obs, masks=act_masks, memories=memories, seq_len=policy.sequence_length
    )
    if discrete:
        assert log_probs.all_discrete_tensor.shape == (
            64,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.continuous_tensor.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (64,)

    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
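
The rnn, visual, and discrete arguments above are presumably supplied by pytest parametrization; the decorators are not included in this excerpt. A minimal sketch of the assumed setup (parameter values and ids are illustrative, not taken from the source):

import pytest

# Hypothetical parametrization assumed to sit above the test definitions in this file;
# the real decorators are not part of this excerpt, and the ids are illustrative.
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
def test_sample_actions(rnn, visual, discrete):
    ...  # body as shown above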
Example #2
    def _group_agent_action_from_buffer(
            buff: AgentBuffer, cont_action_key: BufferKey,
            disc_action_key: BufferKey) -> List["AgentAction"]:
        """
        Extracts continuous and discrete groupmate actions, as specified by BufferKey, and
        returns a List of AgentActions that correspond to the groupmates' actions. The list will
        be of length equal to the maximum number of groupmates in the buffer. Wherever there are
        fewer agents than that maximum, the actions are padded with 0's.
        """
        continuous_tensors: List[torch.Tensor] = []
        discrete_tensors: List[torch.Tensor] = []
        if cont_action_key in buff:
            padded_batch = buff[cont_action_key].padded_to_batch()
            continuous_tensors = [
                ModelUtils.list_to_tensor(arr) for arr in padded_batch
            ]
        if disc_action_key in buff:
            padded_batch = buff[disc_action_key].padded_to_batch(dtype=np.long)
            discrete_tensors = [
                ModelUtils.list_to_tensor(arr, dtype=torch.long)
                for arr in padded_batch
            ]

        actions_list = []
        for _cont, _disc in itertools.zip_longest(continuous_tensors,
                                                  discrete_tensors,
                                                  fillvalue=None):
            if _disc is not None:
                _disc = [_disc[..., i] for i in range(_disc.shape[-1])]
            actions_list.append(AgentAction(_cont, _disc))
        return actions_list
Example #3
def test_evaluate_actions(rnn, visual, discrete):
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    buffer = mb.simulate_rollout(64, policy.behavior_spec, memory_size=policy.m_size)
    act_masks = ModelUtils.list_to_tensor(buffer[BufferKey.ACTION_MASK])
    agent_action = AgentAction.from_buffer(buffer)
    np_obs = ObsUtil.from_buffer(buffer, len(policy.behavior_spec.observation_specs))
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    memories = [
        ModelUtils.list_to_tensor(buffer[BufferKey.MEMORY][i])
        for i in range(0, len(buffer[BufferKey.MEMORY]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        tensor_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    if discrete:
        _size = policy.behavior_spec.action_spec.discrete_size
    else:
        _size = policy.behavior_spec.action_spec.continuous_size

    assert log_probs.flatten().shape == (64, _size)
    assert entropy.shape == (64,)
    for val in values.values():
        assert val.shape == (64,)
Example #4
    def get_trajectory_value_estimates(
            self, batch: AgentBuffer, next_obs: List[np.ndarray],
            done: bool) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        n_obs = len(self.policy.behavior_spec.observation_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)

        # Convert to tensors
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

        memory = torch.zeros([1, 1, self.policy.m_size])

        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
            current_obs, memory, sequence_length=batch.num_experiences)

        next_value_estimate, _ = self.policy.actor_critic.critic_pass(
            next_obs, next_memory, sequence_length=1)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(
                next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0

        return value_estimates, next_value_estimate
Example #5
    def _update_batch(self, mini_batch_demo: Dict[str, np.ndarray],
                      n_sequences: int) -> Dict[str, float]:
        """
        Helper function for update_batch.
        """
        vec_obs = [ModelUtils.list_to_tensor(mini_batch_demo["vector_obs"])]
        act_masks = None
        if self.policy.use_continuous_act:
            expert_actions = ModelUtils.list_to_tensor(
                mini_batch_demo["actions"])
        else:
            raw_expert_actions = ModelUtils.list_to_tensor(
                mini_batch_demo["actions"], dtype=torch.long)
            expert_actions = ModelUtils.actions_to_onehot(
                raw_expert_actions, self.policy.act_size)
            act_masks = ModelUtils.list_to_tensor(
                np.ones(
                    (
                        self.n_sequences * self.policy.sequence_length,
                        sum(self.policy.behavior_spec.discrete_action_branches
                            ),
                    ),
                    dtype=np.float32,
                ))

        memories = []
        if self.policy.use_recurrent:
            memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

        if self.policy.use_vis_obs:
            vis_obs = []
            for idx, _ in enumerate(
                    self.policy.actor_critic.network_body.visual_processors):
                vis_ob = ModelUtils.list_to_tensor(
                    mini_batch_demo["visual_obs%d" % idx])
                vis_obs.append(vis_ob)
        else:
            vis_obs = []

        selected_actions, all_log_probs, _, _ = self.policy.sample_actions(
            vec_obs,
            vis_obs,
            masks=act_masks,
            memories=memories,
            seq_len=self.policy.sequence_length,
            all_log_probs=True,
        )
        bc_loss = self._behavioral_cloning_loss(selected_actions,
                                                all_log_probs, expert_actions)
        self.optimizer.zero_grad()
        bc_loss.backward()

        self.optimizer.step()
        run_out = {"loss": bc_loss.item()}
        return run_out
Example #6
def test_sample_actions(rnn, visual, discrete):
    policy = create_policy_mock(TrainerSettings(),
                                use_rnn=rnn,
                                use_discrete=discrete,
                                use_visual=visual)
    buffer = mb.simulate_rollout(64,
                                 policy.behavior_spec,
                                 memory_size=policy.m_size)
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])

    vis_obs = []
    for idx, _ in enumerate(
            policy.actor_critic.network_body.visual_processors):
        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        vis_obs.append(vis_ob)

    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][i])
        for i in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    (
        sampled_actions,
        clipped_actions,
        log_probs,
        entropies,
        memories,
    ) = policy.sample_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        memories=memories,
        seq_len=policy.sequence_length,
        all_log_probs=not policy.use_continuous_act,
    )
    if discrete:
        assert log_probs.shape == (
            64,
            sum(policy.behavior_spec.action_spec.discrete_branches),
        )
    else:
        assert log_probs.shape == (
            64, policy.behavior_spec.action_spec.continuous_size)
        assert clipped_actions.shape == (
            64,
            policy.behavior_spec.action_spec.continuous_size,
        )
    assert entropies.shape == (64, )

    if rnn:
        assert memories.shape == (1, 1, policy.m_size)
Example #7
 def forward(self, mini_batch: AgentBuffer) -> torch.Tensor:
     n_vis = len(self._encoder.visual_processors)
     hidden, _ = self._encoder.forward(
         vec_inputs=[
             ModelUtils.list_to_tensor(mini_batch["vector_obs"],
                                       dtype=torch.float)
         ],
         vis_inputs=[
             ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i],
                                       dtype=torch.float)
             for i in range(n_vis)
         ],
     )
     self._encoder.update_normalization(
         torch.tensor(mini_batch["vector_obs"]))
     return hidden
Example #8
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    policy1.actor = policy1.actor.to(default_device())
    policy2.actor = policy2.actor.to(default_device())

    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
        _, log_probs2, _, _ = policy2.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
    np.testing.assert_array_equal(
        ModelUtils.to_numpy(log_probs1.all_discrete_tensor),
        ModelUtils.to_numpy(log_probs2.all_discrete_tensor),
    )
Example #9
 def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
     """
     Extracts the current state embedding from a mini_batch.
     """
     n_vis = len(self._state_encoder.visual_processors)
     hidden, _ = self._state_encoder.forward(
         vec_inputs=[
             ModelUtils.list_to_tensor(mini_batch["vector_obs"],
                                       dtype=torch.float)
         ],
         vis_inputs=[
             ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i],
                                       dtype=torch.float)
             for i in range(n_vis)
         ],
     )
     return hidden
Example #10
    def get_trajectory_value_estimates(
            self, batch: AgentBuffer, next_obs: List[np.ndarray],
            done: bool) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
        vector_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
        if self.policy.use_vis_obs:
            visual_obs = []
            for idx, _ in enumerate(
                    self.policy.actor_critic.network_body.visual_processors):
                visual_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" %
                                                            idx])
                visual_obs.append(visual_ob)
        else:
            visual_obs = []

        memory = torch.zeros([1, 1, self.policy.m_size])

        vec_vis_obs = SplitObservations.from_observations(next_obs)
        next_vec_obs = [
            ModelUtils.list_to_tensor(
                vec_vis_obs.vector_observations).unsqueeze(0)
        ]
        next_vis_obs = [
            ModelUtils.list_to_tensor(_vis_ob).unsqueeze(0)
            for _vis_ob in vec_vis_obs.visual_observations
        ]

        value_estimates, next_memory = self.policy.actor_critic.critic_pass(
            vector_obs,
            visual_obs,
            memory,
            sequence_length=batch.num_experiences)

        next_value_estimate, _ = self.policy.actor_critic.critic_pass(
            next_vec_obs, next_vis_obs, next_memory, sequence_length=1)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(
                next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0

        return value_estimates, next_value_estimate
Example #11
 def from_dict(buff: Dict[str, np.ndarray]) -> "AgentAction":
     """
     A static method that accesses continuous and discrete action fields in an AgentBuffer
     and constructs the corresponding AgentAction from the retrieved np arrays.
     """
     continuous: torch.Tensor = None
     discrete: List[torch.Tensor] = None  # type: ignore
     if "continuous_action" in buff:
         continuous = ModelUtils.list_to_tensor(buff["continuous_action"])
     if "discrete_action" in buff:
         discrete_tensor = ModelUtils.list_to_tensor(
             buff["discrete_action"], dtype=torch.long)
         discrete = [
             discrete_tensor[..., i]
             for i in range(discrete_tensor.shape[-1])
         ]
     return AgentAction(continuous, discrete)
Example #12
 def get_state_inputs(
     self, mini_batch: AgentBuffer
 ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
     """
     Creates the observation input.
     """
     n_vis = len(self.encoder.visual_processors)
     n_vec = len(self.encoder.vector_processors)
     vec_inputs = ([
         ModelUtils.list_to_tensor(mini_batch["vector_obs"],
                                   dtype=torch.float)
     ] if n_vec > 0 else [])
     vis_inputs = [
         ModelUtils.list_to_tensor(mini_batch["visual_obs%d" % i],
                                   dtype=torch.float) for i in range(n_vis)
     ]
     return vec_inputs, vis_inputs
Example #13
 def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]:
     """
     Creates the observation input.
     """
     n_obs = len(self.encoder.processors)
     np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
     # Convert to tensors
     tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
     return tensor_obs
Example #14
def test_list_to_tensor():
    # Test converting pure list
    unconverted_list = [[1.0, 2], [1, 3], [1, 4]]
    tensor = ModelUtils.list_to_tensor(unconverted_list)
    # Should be equivalent to torch.tensor conversion
    assert torch.equal(tensor, torch.tensor(unconverted_list))

    # Test converting pure numpy array
    np_list = np.asarray(unconverted_list)
    tensor = ModelUtils.list_to_tensor(np_list)
    # Should be equivalent to torch.tensor conversion
    assert torch.equal(tensor, torch.tensor(unconverted_list))

    # Test converting list of numpy arrays
    list_of_np = [np.asarray(_el) for _el in unconverted_list]
    tensor = ModelUtils.list_to_tensor(list_of_np)
    # Should be equivalent to torch.tensor conversion
    assert torch.equal(tensor, torch.tensor(unconverted_list, dtype=torch.float32))
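
A minimal sketch of a list_to_tensor-style helper consistent with the assertions in this test; the actual ModelUtils.list_to_tensor implementation may differ:

from typing import List, Union

import numpy as np
import torch


def list_to_tensor(
    ndarray_list: Union[List[np.ndarray], np.ndarray, list],
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    # Funnel the input through a single np.asarray call so torch receives one
    # contiguous array instead of converting Python/numpy elements one by one.
    return torch.as_tensor(np.asarray(ndarray_list), dtype=dtype)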
Example #15
    def forward(self, mini_batch: AgentBuffer) -> torch.Tensor:
        n_obs = len(self._encoder.processors)
        np_obs = ObsUtil.from_buffer(mini_batch, n_obs)
        # Convert to tensors
        tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

        hidden, _ = self._encoder.forward(tensor_obs)
        self._encoder.update_normalization(mini_batch)
        return hidden
Example #16
 def from_buffer(buff: AgentBuffer) -> "AgentAction":
     """
     A static method that accesses continuous and discrete action fields in an AgentBuffer
     and constructs the corresponding AgentAction from the retrieved np arrays.
     """
     continuous: torch.Tensor = None
     discrete: List[torch.Tensor] = None  # type: ignore
     if BufferKey.CONTINUOUS_ACTION in buff:
         continuous = ModelUtils.list_to_tensor(
             buff[BufferKey.CONTINUOUS_ACTION])
     if BufferKey.DISCRETE_ACTION in buff:
         discrete_tensor = ModelUtils.list_to_tensor(
             buff[BufferKey.DISCRETE_ACTION], dtype=torch.long)
         discrete = [
             discrete_tensor[..., i]
             for i in range(discrete_tensor.shape[-1])
         ]
     return AgentAction(continuous, discrete)
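
The branch-splitting comprehension above (also used in Examples #11, #23, and #24) turns a (batch, num_branches) tensor into one 1-D tensor per discrete branch. A small self-contained illustration with made-up values:

import torch

# Illustrative (batch=3, num_branches=2) discrete actions.
discrete_tensor = torch.tensor([[0, 1], [2, 0], [1, 1]], dtype=torch.long)

# Mirrors the comprehension in from_buffer: one (batch,) tensor per branch.
discrete = [discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1])]
assert len(discrete) == 2
assert discrete[0].shape == (3,)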
Example #17
    def _update_batch(
        self, mini_batch_demo: AgentBuffer, n_sequences: int
    ) -> Dict[str, float]:
        """
        Helper function for update_batch.
        """
        np_obs = ObsUtil.from_buffer(
            mini_batch_demo, len(self.policy.behavior_spec.observation_specs)
        )
        # Convert to tensors
        tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]
        act_masks = None
        expert_actions = AgentAction.from_buffer(mini_batch_demo)
        if self.policy.behavior_spec.action_spec.discrete_size > 0:

            act_masks = ModelUtils.list_to_tensor(
                np.ones(
                    (
                        self.n_sequences * self.policy.sequence_length,
                        sum(self.policy.behavior_spec.action_spec.discrete_branches),
                    ),
                    dtype=np.float32,
                )
            )

        memories = []
        if self.policy.use_recurrent:
            memories = torch.zeros(1, self.n_sequences, self.policy.m_size)

        selected_actions, log_probs, _, _ = self.policy.sample_actions(
            tensor_obs,
            masks=act_masks,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        bc_loss = self._behavioral_cloning_loss(
            selected_actions, log_probs, expert_actions
        )
        self.optimizer.zero_grad()
        bc_loss.backward()

        self.optimizer.step()
        run_out = {"loss": bc_loss.item()}
        return run_out
Example #18
    def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Uses the current state embedding and the action of the mini_batch to predict
        the next state embedding.
        """
        if self._policy_specs.is_action_continuous():
            action = ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
        else:
            action = torch.cat(
                ModelUtils.actions_to_onehot(
                    ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
                    self._policy_specs.discrete_action_branches,
                ),
                dim=1,
            )
        forward_model_input = torch.cat(
            (self.get_current_state(mini_batch), action), dim=1
        )

        return self.forward_model_next_state_prediction(forward_model_input)
Example #19
    def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor:
        """
        Extracts the next state embedding from a mini_batch.
        """
        n_obs = len(self._state_encoder.processors)
        np_obs = ObsUtil.from_buffer_next(mini_batch, n_obs)
        # Convert to tensors
        tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

        hidden, _ = self._state_encoder.forward(tensor_obs)
        return hidden
Example #20
 def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
     """
     Computes the loss for the next state prediction
     """
     return torch.mean(
         ModelUtils.dynamic_partition(
             self.compute_reward(mini_batch),
             ModelUtils.list_to_tensor(mini_batch["masks"],
                                       dtype=torch.float),
             2,
         )[1])
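
ModelUtils.dynamic_partition, used here and in Example #21, appears to mirror tf.dynamic_partition: it splits a tensor into num_partitions groups according to an index tensor, so the [1] selection keeps only the entries whose mask is 1. A minimal self-contained sketch of the behavior this usage assumes (not the library's implementation):

from typing import List

import torch


def dynamic_partition_sketch(
    data: torch.Tensor, partitions: torch.Tensor, num_partitions: int
) -> List[torch.Tensor]:
    # Group data[i] into result[p] where partitions[i] == p.
    return [data[partitions.to(torch.long) == p] for p in range(num_partitions)]


# Usage mirroring compute_forward_loss above: average only the masked-in (mask == 1) entries.
per_step_loss = torch.tensor([0.3, 0.7, 0.1, 0.9])
masks = torch.tensor([1.0, 0.0, 1.0, 1.0])
masked_mean = torch.mean(dynamic_partition_sketch(per_step_loss, masks, 2)[1])
# masked_mean is the mean of [0.3, 0.1, 0.9]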
Example #21
 def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
     """
     Computes the inverse loss for a mini_batch. Corresponds to the error on the
     action prediction (given the current and next state).
     """
     predicted_action = self.predict_action(mini_batch)
     actions = AgentAction.from_dict(mini_batch)
     _inverse_loss = 0
     if self._action_spec.continuous_size > 0:
         sq_difference = (
             actions.continuous_tensor - predicted_action.continuous
         ) ** 2
         sq_difference = torch.sum(sq_difference, dim=1)
         _inverse_loss += torch.mean(
             ModelUtils.dynamic_partition(
                 sq_difference,
                 ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
                 2,
             )[1]
         )
     if self._action_spec.discrete_size > 0:
         true_action = torch.cat(
             ModelUtils.actions_to_onehot(
                 actions.discrete_tensor, self._action_spec.discrete_branches
             ),
             dim=1,
         )
         cross_entropy = torch.sum(
             -torch.log(predicted_action.discrete + self.EPSILON) * true_action,
             dim=1,
         )
         _inverse_loss += torch.mean(
             ModelUtils.dynamic_partition(
                 cross_entropy,
                 ModelUtils.list_to_tensor(
                     mini_batch["masks"], dtype=torch.float
                 ),  # use masks not action_masks
                 2,
             )[1]
         )
     return _inverse_loss
Example #22
def test_evaluate_actions(rnn, visual, discrete):
    policy = create_policy_mock(TrainerSettings(),
                                use_rnn=rnn,
                                use_discrete=discrete,
                                use_visual=visual)
    buffer = mb.simulate_rollout(64,
                                 policy.behavior_spec,
                                 memory_size=policy.m_size)
    vec_obs = [ModelUtils.list_to_tensor(buffer["vector_obs"])]
    act_masks = ModelUtils.list_to_tensor(buffer["action_mask"])
    agent_action = AgentAction.from_dict(buffer)
    vis_obs = []
    for idx, _ in enumerate(
            policy.actor_critic.network_body.visual_processors):
        vis_ob = ModelUtils.list_to_tensor(buffer["visual_obs%d" % idx])
        vis_obs.append(vis_ob)

    memories = [
        ModelUtils.list_to_tensor(buffer["memory"][i])
        for i in range(0, len(buffer["memory"]), policy.sequence_length)
    ]
    if len(memories) > 0:
        memories = torch.stack(memories).unsqueeze(0)

    log_probs, entropy, values = policy.evaluate_actions(
        vec_obs,
        vis_obs,
        masks=act_masks,
        actions=agent_action,
        memories=memories,
        seq_len=policy.sequence_length,
    )
    if discrete:
        _size = policy.behavior_spec.action_spec.discrete_size
    else:
        _size = policy.behavior_spec.action_spec.continuous_size

    assert log_probs.flatten().shape == (64, _size)
    assert entropy.shape == (64, )
    for val in values.values():
        assert val.shape == (64, )
Example #23
    def from_buffer(buff: AgentBuffer) -> "ActionLogProbs":
        """
        A static method that accesses continuous and discrete log probs fields in an AgentBuffer
        and constructs the corresponding ActionLogProbs from the retrieved np arrays.
        """
        continuous: torch.Tensor = None
        discrete: List[torch.Tensor] = None  # type: ignore

        if BufferKey.CONTINUOUS_LOG_PROBS in buff:
            continuous = ModelUtils.list_to_tensor(
                buff[BufferKey.CONTINUOUS_LOG_PROBS])
        if BufferKey.DISCRETE_LOG_PROBS in buff:
            discrete_tensor = ModelUtils.list_to_tensor(
                buff[BufferKey.DISCRETE_LOG_PROBS])
            # This will keep discrete_list = None which enables flatten()
            if discrete_tensor.shape[1] > 0:
                discrete = [
                    discrete_tensor[..., i]
                    for i in range(discrete_tensor.shape[-1])
                ]
        return ActionLogProbs(continuous, discrete, None)
Example #24
    def from_dict(buff: Dict[str, np.ndarray]) -> "ActionLogProbs":
        """
        A static method that accesses continuous and discrete log probs fields in an AgentBuffer
        and constructs the corresponding ActionLogProbs from the retrieved np arrays.
        """
        continuous: torch.Tensor = None
        discrete: List[torch.Tensor] = None  # type: ignore

        if "continuous_log_probs" in buff:
            continuous = ModelUtils.list_to_tensor(
                buff["continuous_log_probs"])
        if "discrete_log_probs" in buff:
            discrete_tensor = ModelUtils.list_to_tensor(
                buff["discrete_log_probs"])
            # This will keep discrete_list = None which enables flatten()
            if discrete_tensor.shape[1] > 0:
                discrete = [
                    discrete_tensor[..., i]
                    for i in range(discrete_tensor.shape[-1])
                ]
        return ActionLogProbs(continuous, discrete, None)
Example #25
 def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor:
     """
     Computes the inverse loss for a mini_batch. Corresponds to the error on the
     action prediction (given the current and next state).
     """
     predicted_action = self.predict_action(mini_batch)
     if self._policy_specs.is_action_continuous():
         sq_difference = (
             ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.float)
             - predicted_action
         ) ** 2
         sq_difference = torch.sum(sq_difference, dim=1)
         return torch.mean(
             ModelUtils.dynamic_partition(
                 sq_difference,
                 ModelUtils.list_to_tensor(mini_batch["masks"], dtype=torch.float),
                 2,
             )[1]
         )
     else:
         true_action = torch.cat(
             ModelUtils.actions_to_onehot(
                 ModelUtils.list_to_tensor(mini_batch["actions"], dtype=torch.long),
                 self._policy_specs.discrete_action_branches,
             ),
             dim=1,
         )
         cross_entropy = torch.sum(
             -torch.log(predicted_action + self.EPSILON) * true_action, dim=1
         )
         return torch.mean(
             ModelUtils.dynamic_partition(
                 cross_entropy,
                 ModelUtils.list_to_tensor(
                     mini_batch["masks"], dtype=torch.float
                 ),  # use masks not action_masks
                 2,
             )[1]
         )
Example #26
    def update(self, batch: AgentBuffer,
               num_sequences: int) -> Dict[str, float]:
        """
        Performs update on model.
        :param batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """
        # Get decayed parameters
        decay_lr = self.decay_learning_rate.get_value(
            self.policy.get_current_step())
        decay_eps = self.decay_epsilon.get_value(
            self.policy.get_current_step())
        decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
        returns = {}
        old_values = {}
        for name in self.reward_signals:
            old_values[name] = ModelUtils.list_to_tensor(
                batch[f"{name}_value_estimates"])
            returns[name] = ModelUtils.list_to_tensor(batch[f"{name}_returns"])

        vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
        act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
        if self.policy.use_continuous_act:
            actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
        else:
            actions = ModelUtils.list_to_tensor(batch["actions"],
                                                dtype=torch.long)

        memories = [
            ModelUtils.list_to_tensor(batch["memory"][i]) for i in range(
                0, len(batch["memory"]), self.policy.sequence_length)
        ]
        if len(memories) > 0:
            memories = torch.stack(memories).unsqueeze(0)

        if self.policy.use_vis_obs:
            vis_obs = []
            for idx, _ in enumerate(
                    self.policy.actor_critic.network_body.visual_encoders):
                vis_ob = ModelUtils.list_to_tensor(batch["visual_obs%d" % idx])
                vis_obs.append(vis_ob)
        else:
            vis_obs = []
        log_probs, entropy, values = self.policy.evaluate_actions(
            vec_obs,
            vis_obs,
            masks=act_masks,
            actions=actions,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        loss_masks = ModelUtils.list_to_tensor(batch["masks"],
                                               dtype=torch.bool)
        value_loss = self.ppo_value_loss(values, old_values, returns,
                                         decay_eps, loss_masks)
        policy_loss = self.ppo_policy_loss(
            ModelUtils.list_to_tensor(batch["advantages"]),
            log_probs,
            ModelUtils.list_to_tensor(batch["action_probs"]),
            loss_masks,
        )
        loss = (policy_loss + 0.5 * value_loss -
                decay_bet * ModelUtils.masked_mean(entropy, loss_masks))

        # Set optimizer learning rate
        ModelUtils.update_learning_rate(self.optimizer, decay_lr)
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        update_stats = {
            "Losses/Policy Loss": abs(policy_loss.detach().cpu().numpy()),
            "Losses/Value Loss": value_loss.detach().cpu().numpy(),
            "Policy/Learning Rate": decay_lr,
            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,
        }

        for reward_provider in self.reward_signals.values():
            update_stats.update(reward_provider.update(batch))

        return update_stats
Example #27
    def get_trajectory_and_baseline_value_estimates(
        self,
        batch: AgentBuffer,
        next_obs: List[np.ndarray],
        next_groupmate_obs: List[List[np.ndarray]],
        done: bool,
        agent_id: str = "",
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, float],
               Optional[AgentBufferField], Optional[AgentBufferField], ]:
        """
        Get value estimates, baseline estimates, and memories for a trajectory, in batch form.
        :param batch: An AgentBuffer that consists of a trajectory.
        :param next_obs: the next observation (after the trajectory). Used for bootstrapping
            if this is not a terminal trajectory.
        :param next_groupmate_obs: the next observations from other members of the group.
        :param done: Set true if this is a terminal trajectory.
        :param agent_id: Agent ID of the agent that this trajectory belongs to.
        :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
            the baseline estimates as a Dict, the final value estimate as a Dict of [name, float], and
            optionally (if using memories) an AgentBufferField of initial critic and baseline memories to be used
            during update.
        """

        n_obs = len(self.policy.behavior_spec.observation_specs)

        current_obs = ObsUtil.from_buffer(batch, n_obs)
        groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs)

        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]
        groupmate_obs = [[
            ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs
        ] for _groupmate_obs in groupmate_obs]

        groupmate_actions = AgentAction.group_from_buffer(batch)

        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]
        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        next_groupmate_obs = [
            ModelUtils.list_to_tensor_list(_list_obs)
            for _list_obs in next_groupmate_obs
        ]
        # Expand dimensions of next critic obs
        next_groupmate_obs = [[_obs.unsqueeze(0) for _obs in _list_obs]
                              for _list_obs in next_groupmate_obs]

        if agent_id in self.value_memory_dict:
            # The agent_id should always be in both since they are added together
            _init_value_mem = self.value_memory_dict[agent_id]
            _init_baseline_mem = self.baseline_memory_dict[agent_id]
        else:
            _init_value_mem = (torch.zeros((1, 1, self.critic.memory_size))
                               if self.policy.use_recurrent else None)
            _init_baseline_mem = (torch.zeros((1, 1, self.critic.memory_size))
                                  if self.policy.use_recurrent else None)

        all_obs = ([current_obs] + groupmate_obs
                   if groupmate_obs is not None else [current_obs])
        all_next_value_mem: Optional[AgentBufferField] = None
        all_next_baseline_mem: Optional[AgentBufferField] = None
        with torch.no_grad():
            if self.policy.use_recurrent:
                (
                    value_estimates,
                    baseline_estimates,
                    all_next_value_mem,
                    all_next_baseline_mem,
                    next_value_mem,
                    next_baseline_mem,
                ) = self._evaluate_by_sequence_team(
                    current_obs,
                    groupmate_obs,
                    groupmate_actions,
                    _init_value_mem,
                    _init_baseline_mem,
                )
            else:
                value_estimates, next_value_mem = self.critic.critic_pass(
                    all_obs,
                    _init_value_mem,
                    sequence_length=batch.num_experiences)
                groupmate_obs_and_actions = (groupmate_obs, groupmate_actions)
                baseline_estimates, next_baseline_mem = self.critic.baseline(
                    current_obs,
                    groupmate_obs_and_actions,
                    _init_baseline_mem,
                    sequence_length=batch.num_experiences,
                )
        # Store the memory for the next trajectory
        self.value_memory_dict[agent_id] = next_value_mem
        self.baseline_memory_dict[agent_id] = next_baseline_mem

        all_next_obs = ([next_obs] + next_groupmate_obs
                        if next_groupmate_obs is not None else [next_obs])

        next_value_estimates, _ = self.critic.critic_pass(all_next_obs,
                                                          next_value_mem,
                                                          sequence_length=1)

        for name, estimate in baseline_estimates.items():
            baseline_estimates[name] = ModelUtils.to_numpy(estimate)

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)

        # the baseline and V should not be on the same done flag
        for name, estimate in next_value_estimates.items():
            next_value_estimates[name] = ModelUtils.to_numpy(estimate)

        if done:
            for k in next_value_estimates:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimates[k][-1] = 0.0

        return (
            value_estimates,
            baseline_estimates,
            next_value_estimates,
            all_next_value_mem,
            all_next_baseline_mem,
        )
Example #28
    def update(self, batch: AgentBuffer,
               num_sequences: int) -> Dict[str, float]:
        """
        Performs update on model.
        :param batch: Batch of experiences.
        :param num_sequences: Number of sequences to process.
        :return: Results of update.
        """
        # Get decayed parameters
        decay_lr = self.decay_learning_rate.get_value(
            self.policy.get_current_step())
        decay_eps = self.decay_epsilon.get_value(
            self.policy.get_current_step())
        decay_bet = self.decay_beta.get_value(self.policy.get_current_step())
        returns = {}
        old_values = {}
        for name in self.reward_signals:
            old_values[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.value_estimates_key(name)])
            returns[name] = ModelUtils.list_to_tensor(
                batch[RewardSignalUtil.returns_key(name)])

        n_obs = len(self.policy.behavior_spec.observation_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)
        # Convert to tensors
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

        act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK])
        actions = AgentAction.from_buffer(batch)

        memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) for i in
            range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length)
        ]
        if len(memories) > 0:
            memories = torch.stack(memories).unsqueeze(0)

        # Get value memories
        value_memories = [
            ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i])
            for i in range(0, len(batch[BufferKey.CRITIC_MEMORY]),
                           self.policy.sequence_length)
        ]
        if len(value_memories) > 0:
            value_memories = torch.stack(value_memories).unsqueeze(0)

        log_probs, entropy = self.policy.evaluate_actions(
            current_obs,
            masks=act_masks,
            actions=actions,
            memories=memories,
            seq_len=self.policy.sequence_length,
        )
        values, _ = self.critic.critic_pass(
            current_obs,
            memories=value_memories,
            sequence_length=self.policy.sequence_length,
        )
        old_log_probs = ActionLogProbs.from_buffer(batch).flatten()
        log_probs = log_probs.flatten()
        loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS],
                                               dtype=torch.bool)
        value_loss = self.ppo_value_loss(values, old_values, returns,
                                         decay_eps, loss_masks)
        policy_loss = self.ppo_policy_loss(
            ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]),
            log_probs,
            old_log_probs,
            loss_masks,
        )
        loss = (policy_loss + 0.5 * value_loss -
                decay_bet * ModelUtils.masked_mean(entropy, loss_masks))

        # Set optimizer learning rate
        ModelUtils.update_learning_rate(self.optimizer, decay_lr)
        self.optimizer.zero_grad()
        loss.backward()

        self.optimizer.step()
        update_stats = {
            # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow.
            # TODO: After PyTorch is default, change to something more correct.
            "Losses/Policy Loss": torch.abs(policy_loss).item(),
            "Losses/Value Loss": value_loss.item(),
            "Policy/Learning Rate": decay_lr,
            "Policy/Epsilon": decay_eps,
            "Policy/Beta": decay_bet,
        }

        for reward_provider in self.reward_signals.values():
            update_stats.update(reward_provider.update(batch))

        return update_stats
Example #29
    def update(self, batch: AgentBuffer,
               num_sequences: int) -> Dict[str, float]:
        """
        Updates model using buffer.
        :param batch: Experience mini-batch.
        :param num_sequences: Number of trajectories in batch.
        :return: Output from update process.
        """
        rewards = {}
        for name in self.reward_signals:
            rewards[name] = ModelUtils.list_to_tensor(batch[f"{name}_rewards"])

        n_obs = len(self.policy.behavior_spec.sensor_specs)
        current_obs = ObsUtil.from_buffer(batch, n_obs)
        # Convert to tensors
        current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs]

        next_obs = ObsUtil.from_buffer_next(batch, n_obs)
        # Convert to tensors
        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

        act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
        actions = AgentAction.from_dict(batch)

        memories_list = [
            ModelUtils.list_to_tensor(batch["memory"][i]) for i in range(
                0, len(batch["memory"]), self.policy.sequence_length)
        ]
        # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
        offset = 1 if self.policy.sequence_length > 1 else 0
        next_memories_list = [
            ModelUtils.list_to_tensor(
                batch["memory"][i]
                [self.policy.m_size //
                 2:])  # only pass value part of memory to target network
            for i in range(offset, len(batch["memory"]),
                           self.policy.sequence_length)
        ]

        if len(memories_list) > 0:
            memories = torch.stack(memories_list).unsqueeze(0)
            next_memories = torch.stack(next_memories_list).unsqueeze(0)
        else:
            memories = None
            next_memories = None
        # Q network memories are 0'ed out, since we don't have them during inference.
        q_memories = (torch.zeros_like(next_memories)
                      if next_memories is not None else None)

        # Copy normalizers from policy
        self.value_network.q1_network.network_body.copy_normalization(
            self.policy.actor_critic.network_body)
        self.value_network.q2_network.network_body.copy_normalization(
            self.policy.actor_critic.network_body)
        self.target_network.network_body.copy_normalization(
            self.policy.actor_critic.network_body)
        (
            sampled_actions,
            log_probs,
            _,
            value_estimates,
            _,
        ) = self.policy.actor_critic.get_action_stats_and_value(
            current_obs,
            masks=act_masks,
            memories=memories,
            sequence_length=self.policy.sequence_length,
        )

        cont_sampled_actions = sampled_actions.continuous_tensor
        cont_actions = actions.continuous_tensor
        q1p_out, q2p_out = self.value_network(
            current_obs,
            cont_sampled_actions,
            memories=q_memories,
            sequence_length=self.policy.sequence_length,
            q2_grad=False,
        )
        q1_out, q2_out = self.value_network(
            current_obs,
            cont_actions,
            memories=q_memories,
            sequence_length=self.policy.sequence_length,
        )

        if self._action_spec.discrete_size > 0:
            disc_actions = actions.discrete_tensor
            q1_stream = self._condense_q_streams(q1_out, disc_actions)
            q2_stream = self._condense_q_streams(q2_out, disc_actions)
        else:
            q1_stream, q2_stream = q1_out, q2_out

        with torch.no_grad():
            target_values, _ = self.target_network(
                next_obs,
                memories=next_memories,
                sequence_length=self.policy.sequence_length,
            )
        masks = ModelUtils.list_to_tensor(batch["masks"], dtype=torch.bool)
        dones = ModelUtils.list_to_tensor(batch["done"])

        q1_loss, q2_loss = self.sac_q_loss(q1_stream, q2_stream, target_values,
                                           dones, rewards, masks)
        value_loss = self.sac_value_loss(log_probs, value_estimates, q1p_out,
                                         q2p_out, masks)
        policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks)
        entropy_loss = self.sac_entropy_loss(log_probs, masks)

        total_value_loss = q1_loss + q2_loss + value_loss

        decay_lr = self.decay_learning_rate.get_value(
            self.policy.get_current_step())
        ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr)
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        ModelUtils.update_learning_rate(self.value_optimizer, decay_lr)
        self.value_optimizer.zero_grad()
        total_value_loss.backward()
        self.value_optimizer.step()

        ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr)
        self.entropy_optimizer.zero_grad()
        entropy_loss.backward()
        self.entropy_optimizer.step()

        # Update target network
        ModelUtils.soft_update(self.policy.actor_critic.critic,
                               self.target_network, self.tau)
        update_stats = {
            "Losses/Policy Loss":
            policy_loss.item(),
            "Losses/Value Loss":
            value_loss.item(),
            "Losses/Q1 Loss":
            q1_loss.item(),
            "Losses/Q2 Loss":
            q2_loss.item(),
            "Policy/Discrete Entropy Coeff":
            torch.mean(torch.exp(self._log_ent_coef.discrete)).item(),
            "Policy/Continuous Entropy Coeff":
            torch.mean(torch.exp(self._log_ent_coef.continuous)).item(),
            "Policy/Learning Rate":
            decay_lr,
        }

        return update_stats
Example #30
    def get_trajectory_value_estimates(
        self,
        batch: AgentBuffer,
        next_obs: List[np.ndarray],
        done: bool,
        agent_id: str = "",
    ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]:
        """
        Get value estimates and memories for a trajectory, in batch form.
        :param batch: An AgentBuffer that consists of a trajectory.
        :param next_obs: the next observation (after the trajectory). Used for bootstrapping
            if this is not a terminal trajectory.
        :param done: Set true if this is a terminal trajectory.
        :param agent_id: Agent ID of the agent that this trajectory belongs to.
        :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)],
            the final value estimate as a Dict of [name, float], and optionally (if using memories)
            an AgentBufferField of initial critic memories to be used during update.
        """
        n_obs = len(self.policy.behavior_spec.observation_specs)

        if agent_id in self.critic_memory_dict:
            memory = self.critic_memory_dict[agent_id]
        else:
            memory = (
                torch.zeros((1, 1, self.critic.memory_size))
                if self.policy.use_recurrent
                else None
            )

        # Convert to tensors
        current_obs = [
            ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs)
        ]
        next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs]

        next_obs = [obs.unsqueeze(0) for obs in next_obs]

        # If we're using LSTM, we want to get all the intermediate memories.
        all_next_memories: Optional[AgentBufferField] = None

        # To prevent memory leak and improve performance, evaluate with no_grad.
        with torch.no_grad():
            if self.policy.use_recurrent:
                (
                    value_estimates,
                    all_next_memories,
                    next_memory,
                ) = self._evaluate_by_sequence(current_obs, memory)
            else:
                value_estimates, next_memory = self.critic.critic_pass(
                    current_obs, memory, sequence_length=batch.num_experiences
                )

        # Store the memory for the next trajectory. This should NOT have a gradient.
        self.critic_memory_dict[agent_id] = next_memory

        next_value_estimate, _ = self.critic.critic_pass(
            next_obs, next_memory, sequence_length=1
        )

        for name, estimate in value_estimates.items():
            value_estimates[name] = ModelUtils.to_numpy(estimate)
            next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name])

        if done:
            for k in next_value_estimate:
                if not self.reward_signals[k].ignore_done:
                    next_value_estimate[k] = 0.0
            if agent_id in self.critic_memory_dict:
                self.critic_memory_dict.pop(agent_id)
        return value_estimates, next_value_estimate, all_next_memories