Example #1
    def run_step(self):
        """
        Run one training step, which is composed of M gradient steps.

        For each of the M gradient steps:
        - sample a batch of transitions from the replay buffer
        - perform one gradient step on all three networks (policy, qf1 and qf2)
        """

        total_losses = [0, 0, 0]
        for _ in range(self._gradient_steps):
            if self.replay_buffer.n_transitions_stored >= self._min_buffer_size:
                samples = as_torch_dict(self.replay_buffer.sample_transitions(
                    self._buffer_batch_size
                ))
                policy_loss, qf1_loss, qf2_loss = self.optimize_policy(samples)
                total_losses[0] += policy_loss
                total_losses[1] += qf1_loss
                total_losses[2] += qf2_loss
                self._update_targets()

        # Average the losses over the number of gradient steps
        total_losses = [loss / self._gradient_steps for loss in total_losses]

        return total_losses
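
Example #1 also calls `self._update_targets()`, which is not shown. Below is a minimal sketch of what such a soft-update helper might look like, assuming Polyak-averaged target critics; the attribute names `self._target_qf1`, `self._target_qf2`, `self._qf1`, `self._qf2` and the coefficient `self._tau` are assumptions, not taken from the snippet above.

    def _update_targets(self):
        """Soft-update both target Q-networks toward their online networks."""
        for target_qf, qf in [(self._target_qf1, self._qf1),
                              (self._target_qf2, self._qf2)]:
            for t_param, param in zip(target_qf.parameters(),
                                      qf.parameters()):
                # Polyak averaging: target <- (1 - tau) * target + tau * online
                t_param.data.copy_((1.0 - self._tau) * t_param.data +
                                   self._tau * param.data)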
Example #2
    def _train_once(self, itr):
        """Perform one iteration of training.

        Args:
            itr (int): Iteration number.

        """
        for grad_step_timer in range(self._grad_steps_per_env_step):
            if (self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                # Sample from buffer
                samples = self._replay_buffer.sample_transitions(
                    self._buffer_batch_size)
                samples = as_torch_dict(samples)

                # Optimize
                qf_loss, y, q, policy_loss = torch_to_np(
                    self._optimize_policy(samples, grad_step_timer))

                self._episode_policy_losses.append(policy_loss)
                self._episode_qf_losses.append(qf_loss)
                self._epoch_ys.append(y)
                self._epoch_qs.append(q)

        if itr % self._steps_per_epoch == 0:
            logger.log('Training finished')
            epoch = itr // self._steps_per_epoch

            if (self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                tabular.record('Epoch', epoch)
                self._log_statistics()
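
Example #2 passes the optimizer outputs through `torch_to_np` before logging them. A minimal sketch of such a conversion helper, assuming it takes a tuple of tensors and returns a matching tuple of numpy arrays (the exact signature is an assumption, not shown above):

def torch_to_np(tensors):
    """Detach a tuple of torch tensors and convert them to numpy arrays.

    Minimal sketch; assumes the tensors live on (or can be moved to) the CPU.
    """
    return tuple(v.detach().cpu().numpy() for v in tensors)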
Example #3
    def optimize_policy(self, samples_data):
        """Perform algorithm optimizing.

        Args:
            samples_data (dict): Processed batch data.

        Returns:
            action_loss: Loss of action predicted by the policy network.
            qval_loss: Loss of Q-value predicted by the Q-network.
            ys: y_s.
            qval: Q-value predicted by the Q-network.

        """
        transitions = as_torch_dict(samples_data)

        observations = transitions['observations']
        rewards = transitions['rewards'].reshape(-1, 1)
        actions = transitions['actions']
        next_observations = transitions['next_observations']
        terminals = transitions['terminals'].reshape(-1, 1)

        next_inputs = next_observations
        inputs = observations
        with torch.no_grad():
            next_actions = self._target_policy(next_inputs)
            target_qvals = self._target_qf(next_inputs, next_actions)

        clip_range = (-self._clip_return,
                      0. if self._clip_pos_returns else self._clip_return)

        y_target = rewards + (1.0 - terminals) * self._discount * target_qvals
        y_target = torch.clamp(y_target, clip_range[0], clip_range[1])

        # optimize critic
        qval = self._qf(inputs, actions)
        qf_loss = torch.nn.MSELoss()
        qval_loss = qf_loss(qval, y_target)
        zero_optim_grads(self._qf_optimizer)
        qval_loss.backward()
        self._qf_optimizer.step()

        # optimize actor
        actions = self.policy(inputs)
        action_loss = -1 * self._qf(inputs, actions).mean()
        zero_optim_grads(self._policy_optimizer)
        action_loss.backward()
        self._policy_optimizer.step()

        # update target networks
        self.update_target()
        return (qval_loss.detach(), y_target, qval.detach(),
                action_loss.detach())
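
Example #3 clears gradients with `zero_optim_grads` instead of calling `zero_grad` on the optimizers directly. A minimal sketch of such a helper; the `set_to_none` default and behavior are assumptions:

def zero_optim_grads(optimizer, set_to_none=True):
    """Zero the gradients of every parameter handled by `optimizer`."""
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is None:
                continue
            if set_to_none:
                # Dropping the gradient tensor avoids a redundant memory
                # write on the next backward pass.
                param.grad = None
            else:
                param.grad.detach_()
                param.grad.zero_()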
Example #4
    def train_once(self, itr=None, paths=None):
        """Complete 1 training iteration of SAC.

        Args:
            itr (int): Iteration number. This argument is deprecated.
            paths (list[dict]): A list of collected paths.
                This argument is deprecated.

        Returns:
            torch.Tensor: Loss from the actor/policy network after
                optimization, or None if the replay buffer has not yet
                reached its minimum size.
            torch.Tensor: Loss from the 1st Q-function after optimization.
            torch.Tensor: Loss from the 2nd Q-function after optimization.

        """
        del itr
        del paths
        # Losses stay None until the buffer holds enough transitions to
        # sample a full batch.
        policy_loss, qf1_loss, qf2_loss = None, None, None
        if self.replay_buffer.n_transitions_stored >= self._min_buffer_size:
            samples = self.replay_buffer.sample_transitions(
                self._buffer_batch_size)
            samples = as_torch_dict(samples)
            policy_loss, qf1_loss, qf2_loss = self.optimize_policy(samples)
            self._update_targets()

        return policy_loss, qf1_loss, qf2_loss
Example #5
import numpy as np
import torch

from garage.torch import as_torch_dict  # import path assumed


def test_as_torch_dict():
    """Test that a dict of numpy arrays is converted to torch tensors in place."""
    dic = {'a': np.zeros(1), 'b': np.ones(1)}
    as_torch_dict(dic)
    for dic_value in dic.values():
        assert isinstance(dic_value, torch.Tensor)
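
The test above expects `as_torch_dict` to replace the dict's numpy values with tensors in place. A minimal sketch of a helper consistent with that behavior (returning the dict and casting to float32 are assumptions):

def as_torch_dict(array_dict):
    """Convert each numpy-array value in a dict to a torch.Tensor, in place."""
    for key, value in array_dict.items():
        # The float32 cast is an assumption; it forces a copy for
        # float64 inputs such as np.zeros(1).
        array_dict[key] = torch.as_tensor(value, dtype=torch.float32)
    return array_dict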