Example #1
    def _train_once(self, trainer, epoch):
        """Obtain samplers and train for one epoch.

        Args:
            trainer (Trainer): Experiment trainer, which may be used to
                obtain samples.
            epoch (int): The current epoch.

        Returns:
            List[float]: Losses computed on each minibatch.

        """
        batch = self._obtain_samples(trainer, epoch)
        indices = np.random.permutation(len(batch.actions))
        minibatches = np.array_split(indices, self._minibatches_per_epoch)
        losses = []
        for minibatch in minibatches:
            observations = np_to_torch(batch.observations[minibatch])
            actions = np_to_torch(batch.actions[minibatch])
            self._optimizer.zero_grad()
            loss = self._compute_loss(observations, actions)
            loss.backward()
            losses.append(loss.item())
            self._optimizer.step()
        return losses
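
For context, every example on this page converts NumPy batches with `np_to_torch` before calling the networks. Below is a rough sketch of what such a helper is expected to do, assuming it only needs to yield a float32 tensor; the explicit `device` argument is an assumption made to keep the sketch self-contained, since garage's real helper handles device placement itself.

import numpy as np
import torch


def np_to_torch(array, device='cpu'):
    """Sketch of an np_to_torch-style helper: NumPy array in, float32 tensor out."""
    tensor = torch.from_numpy(np.asarray(array))
    if tensor.dtype != torch.float32:
        tensor = tensor.float()
    return tensor.to(device)


# The permuted indices in _train_once select a NumPy minibatch, which is
# converted like this before the loss is computed.
batch_obs = np.random.randn(32, 4)
obs_tensor = np_to_torch(batch_obs)
print(obs_tensor.dtype, obs_tensor.shape)  # torch.float32 torch.Size([32, 4])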
Example #2
File: dqn.py Project: ziyiwu9494/garage
    def _optimize_qf(self, timesteps):
        """Perform algorithm optimizing.

        Args:
            timesteps (TimeStepBatch): Processed batch data.

        Returns:
            qval_loss: Loss of the Q-values predicted by the Q-network.
            y_target: Bellman target values used to compute the loss.
            selected_qs: Q-values predicted by the Q-network for the actions
                taken in the batch.

        """
        observations = np_to_torch(timesteps.observations)
        rewards = np_to_torch(timesteps.rewards).reshape(-1, 1)
        rewards *= self._reward_scale
        actions = np_to_torch(timesteps.actions)
        next_observations = np_to_torch(timesteps.next_observations)
        terminals = np_to_torch(timesteps.terminals).reshape(-1, 1)

        next_inputs = next_observations
        inputs = observations
        with torch.no_grad():
            if self._double_q:
                # Use online qf to get optimal actions
                selected_actions = torch.argmax(self._qf(next_inputs), axis=1)
                # use target qf to get Q values for those actions
                selected_actions = selected_actions.long().unsqueeze(1)
                best_qvals = torch.gather(self._target_qf(next_inputs),
                                          dim=1,
                                          index=selected_actions)
            else:
                target_qvals = self._target_qf(next_inputs)
                best_qvals, _ = torch.max(target_qvals, 1)
                best_qvals = best_qvals.unsqueeze(1)

        rewards_clipped = rewards
        if self._clip_reward is not None:
            rewards_clipped = torch.clamp(rewards, -1 * self._clip_reward,
                                          self._clip_reward)
        y_target = (rewards_clipped +
                    (1.0 - terminals) * self._discount * best_qvals)
        y_target = y_target.squeeze(1)

        # optimize qf
        qvals = self._qf(inputs)
        selected_qs = torch.sum(qvals * actions, axis=1)
        qval_loss = F.smooth_l1_loss(selected_qs, y_target)

        self._qf_optimizer.zero_grad()
        qval_loss.backward()

        # optionally clip the gradients
        if self._clip_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                           self._clip_grad)
        self._qf_optimizer.step()

        return (qval_loss.detach(), y_target, selected_qs.detach())
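
The double-Q branch above builds the standard double DQN target, y = r + discount * (1 - terminal) * Q_target(s', argmax_a Q(s', a)). Here is a self-contained sketch of just that target computation, with toy linear networks standing in for `_qf` and `_target_qf`; the dimensions and networks are illustrative assumptions, not garage's API.

import torch

torch.manual_seed(0)
obs_dim, n_actions, batch = 4, 3, 8
qf = torch.nn.Linear(obs_dim, n_actions)         # stand-in online Q-network
target_qf = torch.nn.Linear(obs_dim, n_actions)  # stand-in target Q-network

next_obs = torch.randn(batch, obs_dim)
rewards = torch.randn(batch, 1)
terminals = torch.zeros(batch, 1)
discount = 0.99

with torch.no_grad():
    # Double DQN: choose greedy actions with the online network...
    greedy_actions = qf(next_obs).argmax(dim=1, keepdim=True)
    # ...then evaluate those actions with the target network.
    best_qvals = target_qf(next_obs).gather(1, greedy_actions)

y_target = (rewards + (1.0 - terminals) * discount * best_qvals).squeeze(1)
print(y_target.shape)  # torch.Size([8])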
Example #3
def test_double_dqn_loss(setup):
    algo, env, buff, _, batch_size = setup

    algo._double_q = True
    trainer = Trainer(snapshot_config)
    trainer.setup(algo, env, sampler_cls=LocalSampler)

    paths = trainer.obtain_episodes(0, batch_size=batch_size)
    buff.add_episode_batch(paths)
    timesteps = buff.sample_timesteps(algo._buffer_batch_size)
    timesteps_copy = copy.deepcopy(timesteps)

    observations = np_to_torch(timesteps.observations)
    rewards = np_to_torch(timesteps.rewards).reshape(-1, 1)
    actions = np_to_torch(timesteps.actions)
    next_observations = np_to_torch(timesteps.next_observations)
    terminals = np_to_torch(timesteps.terminals).reshape(-1, 1)

    next_inputs = next_observations
    inputs = observations
    with torch.no_grad():
        # double Q loss
        selected_actions = torch.argmax(algo._qf(next_inputs), axis=1)
        # use target qf to get Q values for those actions
        selected_actions = selected_actions.long().unsqueeze(1)
        best_qvals = torch.gather(algo._target_qf(next_inputs),
                                  dim=1,
                                  index=selected_actions)

    rewards_clipped = rewards
    y_target = (rewards_clipped +
                (1.0 - terminals) * algo._discount * best_qvals)
    y_target = y_target.squeeze(1)

    # optimize qf
    qvals = algo._qf(inputs)
    selected_qs = torch.sum(qvals * actions, axis=1)
    qval_loss = F.smooth_l1_loss(selected_qs, y_target)

    algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf(
        timesteps_copy)
    env.close()

    assert (qval_loss.detach() == algo_loss).all()
    assert (y_target == algo_targets).all()
    assert (selected_qs == algo_selected_qs).all()
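
The test above only exercises the double-Q branch; the else branch in example #2 (vanilla DQN) instead takes the maximum over the target network's own Q-values. A minimal sketch of that alternative target, using the same kind of illustrative toy setup as before:

import torch

torch.manual_seed(0)
target_qf = torch.nn.Linear(4, 3)  # stand-in target Q-network
next_obs = torch.randn(8, 4)
rewards = torch.randn(8, 1)
terminals = torch.zeros(8, 1)
discount = 0.99

with torch.no_grad():
    # Vanilla DQN: action selection and evaluation both use the target network.
    best_qvals, _ = target_qf(next_obs).max(dim=1, keepdim=True)

y_target = (rewards + (1.0 - terminals) * discount * best_qvals).squeeze(1)
print(y_target.shape)  # torch.Size([8])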
Example #4
    def get_actions(self, observations):
        """Get actions given observations.

        Args:
            observations (np.ndarray): Batch of observations, should
                have shape :math:`(N, O)`.

        Returns:
            np.ndarray: Predicted actions. Array has shape :math:`(N, A)`.
            dict: Empty since this policy does not produce a distribution.
        """
        with torch.no_grad():
            return self(np_to_torch(observations)).cpu().numpy(), dict()
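
Finally, a hedged usage sketch of the get_actions pattern above. TinyDeterministicPolicy is an illustrative stand-in, not a garage class, and torch.as_tensor is used in place of np_to_torch only to keep the sketch self-contained.

import numpy as np
import torch
from torch import nn


class TinyDeterministicPolicy(nn.Module):
    """Toy stand-in for a deterministic policy (illustrative only)."""

    def __init__(self, obs_dim, action_dim):
        super().__init__()
        self._net = nn.Linear(obs_dim, action_dim)

    def forward(self, observations):
        return torch.tanh(self._net(observations))

    def get_actions(self, observations):
        # Same pattern as above: convert NumPy observations to a tensor, run
        # the network without tracking gradients, and return NumPy actions
        # plus an empty info dict.
        with torch.no_grad():
            obs = torch.as_tensor(observations, dtype=torch.float32)
            return self(obs).cpu().numpy(), dict()


policy = TinyDeterministicPolicy(obs_dim=4, action_dim=2)
actions, info = policy.get_actions(np.random.randn(5, 4))
print(actions.shape)  # (5, 2)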