Example #1
    def step(self, states):
        stats = Statistics()
        self._step += 1

        if not self.eval:
            self._sample_noise()
        states_tensor = torch.from_numpy(states).float().to(self._device)
        self._policy_net.train(False)
        with torch.no_grad():
            q_values = self._policy_net(states_tensor)
        policy = self._greedy_policy if self.eval else self._policy
        actions = policy.get_action(q_values.cpu().numpy())

        if not self.eval:  # During training
            # Do logging
            q = torch.max(q_values).detach()
            stats.set('q', q)
            self._policy_net.log_scalars(stats.set)

            try:
                stats.set('epsilon', self._policy.get_epsilon())
            except AttributeError:
                pass

        return actions
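Every example on this page reports metrics through a Statistics accumulator, but the class itself is never shown. The sketch below is a minimal, assumed reconstruction based only on how the examples call it (set, set_all, sum, avg, get, plus the optional SummaryWriter/iteration arguments seen in Example #3); the original class will likely differ in details.

# Minimal sketch of the Statistics helper, inferred from its usage in the
# examples on this page. Names and behaviour here are assumptions.
import collections


class Statistics:
    def __init__(self, summary_writer=None, iteration=0):
        self._data = collections.defaultdict(list)
        self._summary_writer = summary_writer  # assumed: optional TensorBoard writer
        self._iteration = iteration

    def set(self, key, value):
        # Values may be Python numbers or 0-dim torch tensors; float() handles both.
        self._data[key].append(float(value))
        if self._summary_writer is not None:
            self._summary_writer.add_scalar(key, float(value), self._iteration)

    def set_all(self, other):
        # Merge another Statistics object (e.g. the one returned by _optimize()).
        for key, values in other._data.items():
            for value in values:
                self.set(key, value)

    def sum(self, key):
        return sum(self._data[key])

    def avg(self, key):
        values = self._data[key]
        return sum(values) / len(values) if values else 0.0

    def get(self, keys):
        # Assumption: a list of keys returns a filtered Statistics (consumable by
        # set_all, as in Example #3), and a single key returns the running average.
        if isinstance(keys, (list, tuple)):
            out = Statistics()
            for key in keys:
                out._data[key] = list(self._data[key])
            return out
        return self.avg(keys)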
Example #2
    def transitions(self, states, actions, rewards, next_states, dones):
        stats = Statistics()
        assert not self.eval
        for idx in range(len(states)):
            self._buffer.push(state=states[idx],
                              action=actions[idx],
                              reward=rewards[idx],
                              next_state=next_states[idx],
                              done=dones[idx])
        stats.set("replay_buffer_size", len(self._buffer))
        if len(self._buffer) >= self._min_replay_buffer_size:
            t0 = time.time()  # time spent for optimization
            stats.set_all(self._optimize())
            stats.set("optimization_time", time.time() - t0)
        return stats
Example #3
    def _run_one_iteration(self):
        stats = Statistics(self._summary_writer, self._iteration)

        phase_stats, agent_stats = self._run_one_phase(is_training=True)
        stats.set("training_episodes", phase_stats.sum("episodes"))
        stats.set("training_steps", phase_stats.sum("steps"))
        stats.set_all(phase_stats.get(["agent_time", "step_time", "env_time"]))
        stats.set_all(agent_stats)

        if self._evaluation_steps != 0:
            phase_stats, _ = self._run_one_phase(is_training=False)
            stats.set("eval_episodes", phase_stats.sum("episodes"))
        stats.set("episode_reward", phase_stats.get("rewards"))
        stats.set("episode_steps", phase_stats.get("steps"))

        return stats
Example #4
    def step(self, actions):
        stats = Statistics()

        t0 = time.time()
        next_states = []
        states = []
        rewards = []
        dones = []
        if isinstance(actions, np.ndarray):
            actions = actions.tolist()

        step_promises = []
        for env_idx, env in enumerate(self._envs):
            count = env.n_envs * env.n_agents
            start = env_idx * count
            end = start + count
            env_actions = actions[start:end]
            step_promises.append(env.step(env_actions))

        reset_states = []
        for env_idx, step_promise in enumerate(step_promises):
            env_next_states, env_rewards, env_dones = step_promise()

            env = self._envs[env_idx]
            env_states = env_next_states
            env_stat = self._env_stats[env]
            env_stat.set("steps", 1)
            env_stat.set("rewards", sum(env_rewards))
            if env_dones.any():
                stats.set("steps", env_stat.sum("steps"))
                avg_reward = env_stat.sum("rewards") / (
                        env.n_envs * env.n_agents)
                stats.set("rewards", avg_reward)
                stats.set("episodes", 1)
                self._env_stats[env] = Statistics()
                reset_states.append((env_idx, env.reset()))

            next_states.append(env_next_states)
            states.append(env_states)
            rewards.append(env_rewards)
            dones.append(env_dones)

        for env_idx, step_promise in reset_states:
            states[env_idx] = step_promise()

        rewards = np.concatenate(rewards, axis=0)
        dones = np.concatenate(dones, axis=0)
        next_states = np.concatenate(next_states, axis=0)
        self.states = np.concatenate(states, axis=0)

        stats.set("env_time", time.time() - t0)

        return rewards, next_states, dones, stats
Example #5
    def _run_one_phase(self, is_training):
        stats = Statistics()
        agent_stats = Statistics()

        self._agent.eval = not is_training
        min_steps = (self._training_steps if is_training else
                     self._evaluation_steps) * self._env.n_agents

        self._env.reset()
        while stats.sum("steps") < min_steps:
            step_time0 = time.time()

            states = np.copy(self._env.states)
            actions = self._agent.step(states)

            rewards, next_states, dones, env_stats = \
                self._env.step(actions)
            stats.set_all(env_stats)

            if self._traj_buffer is not None:
                self._traj_buffer.push(states, actions, rewards, next_states,
                                       dones)

            if is_training:
                t0 = time.time()
                agent_stats.set_all(
                    self._agent.transitions(states, actions, rewards,
                                            next_states, dones))
                stats.set("agent_time", time.time() - t0)
                stats.set("step_time", time.time() - step_time0)

            sys.stdout.write(
                "Iteration {} ({}). ".format(
                    self._iteration, "train" if is_training else "eval") +
                "Steps executed: {} ".format(stats.sum("steps")) +
                "Episode length: {} ".format(int(stats.avg("steps"))) +
                "Return: {:.4f}      \r".format(stats.avg("rewards")))
            sys.stdout.flush()
        print()
        self._agent.episodes_end()
        return stats, agent_stats
Example #6
    def transitions(self, states, actions, rewards, next_states, term):
        stats = Statistics()
        if self.eval:
            return stats
        t0 = time.time()
        self._net.train(True)

        states = torch.from_numpy(states).float().to(self._device)
        actions = torch.tensor(actions).long().to(self._device)
        actions = torch.unsqueeze(actions, dim=1)
        rewards = torch.from_numpy(rewards).float().to(self._device)
        rewards = torch.unsqueeze(rewards, dim=1)
        term_mask = torch.from_numpy(term.astype(np.uint8)).to(self._device)
        term_mask = torch.unsqueeze(term_mask, dim=1)
        next_states = torch.from_numpy(next_states).float().to(self._device)

        _, v_next = self._net(next_states)
        v_next = v_next * (1 - term_mask).float()  # 0 -> term
        action_logits, v = self._net(states)

        # it's used as:
        # 1. loss for the critic
        # 2. advantage for the actor
        delta = rewards + self._gamma * v_next - v

        critic_loss = delta.abs().mean()

        log_softmax = torch.nn.LogSoftmax(dim=1)
        log_probs_all = log_softmax(action_logits)  # log-probs of every action
        action_log_probs = log_probs_all.gather(dim=1, index=actions)

        # The minus sign is here because the optimizer *minimizes* the
        # loss. If we were updating the weights manually (without the
        # optimizer), we would drop the minus sign.
        actor_loss = -(delta * action_log_probs).mean()

        loss = actor_loss + critic_loss

        # Optimize
        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()

        stats.set('loss', loss.detach())
        stats.set('loss_critic', critic_loss.detach())
        stats.set('loss_actor', actor_loss.detach())
        stats.set('optimization_time', time.time() - t0)

        # Log the policy entropy (high entropy = low confidence).
        # The entropy needs the log-probs of *all* actions, not only the
        # gathered log-probs of the taken actions.
        action_probs = torch.nn.Softmax(dim=1)(action_logits)
        entropy = -(action_probs * log_probs_all).sum(dim=1).mean()
        stats.set('entropy', entropy.detach())

        # Log gradients
        for p in self._net.parameters():
            if p.grad is not None:
                stats.set('grad_max', p.grad.abs().max().detach())
                stats.set('grad_mean', (p.grad**2).mean().sqrt().detach())

        # Log Kullback-Leibler divergence between the new
        # and the old policy.
        new_action_logits, _ = self._net(states)
        new_action_probs = torch.nn.Softmax(dim=1)(new_action_logits)
        kl = -((new_action_probs / action_probs).log() *
               action_probs).sum(dim=1).mean()
        stats.set('kl', kl.detach())
        return stats
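The hand-written KL at the end of Example #6 computes KL(pi_old || pi_new) = sum_a pi_old(a) * log(pi_old(a) / pi_new(a)). As a standalone sanity check (my own snippet, not part of the original code), the same value can be obtained from torch.distributions, which is the route Example #7 takes via kl_divergence (note that Example #7 passes its arguments in the opposite order, i.e. KL(new || old)).

# Standalone check (not from the original repo): the manual KL in Example #6
# matches torch.distributions.kl_divergence for categorical policies.
import torch

old_logits = torch.randn(4, 6)  # pretend "old" action logits, batch of 4
new_logits = torch.randn(4, 6)  # pretend "new" action logits

old_probs = torch.softmax(old_logits, dim=1)
new_probs = torch.softmax(new_logits, dim=1)

# Formula used in Example #6: -sum(old * log(new / old)) = KL(old || new)
kl_manual = -((new_probs / old_probs).log() * old_probs).sum(dim=1)

# Same quantity via torch.distributions
old_dist = torch.distributions.Categorical(probs=old_probs)
new_dist = torch.distributions.Categorical(probs=new_probs)
kl_lib = torch.distributions.kl.kl_divergence(old_dist, new_dist)

assert torch.allclose(kl_manual, kl_lib, atol=1e-5)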
Example #7
    def _optimize(self):
        stats = Statistics()
        t0 = time.time()
        stats.set('replay_buffer_size', len(self._buffer))
        if not self._buffer.ready():
            return stats

        self.net.train(True)

        # Create tensors: state, action, next_state, term
        states, actions, target_v, advantage = self._buffer.sample()
        batch_size = len(states)
        assert batch_size == self._buffer.capacity()

        states = torch.from_numpy(states).float().to(self._device)
        actions = torch.from_numpy(actions).to(self._device)
        if self._is_continous:
            actions_shape = (batch_size, ) + self._action_space.shape
            actions = actions.float()
        else:
            actions_shape = (batch_size, )
            actions = actions.long()
        assert actions.shape == actions_shape, actions.shape
        target_v = torch.from_numpy(target_v).float().to(self._device)
        target_v = torch.unsqueeze(target_v, dim=1)

        advantage = torch.from_numpy(advantage).float().to(self._device)
        advantage = (advantage - advantage.mean()) / advantage.std()
        advantage = advantage.detach()
        assert not torch.isnan(advantage).any(), advantage
        if self._is_continous:
            advantage = torch.unsqueeze(advantage, dim=1)
            assert advantage.shape == (batch_size, 1), advantage.shape
        else:
            assert advantage.shape == (batch_size, )
        stats.set('advantage', advantage.abs().mean())

        # Iteratively optimize the network
        critic_loss_fn = nn.MSELoss()

        # Log-probabilities and distribution of the policy before optimization
        old_log_probs = None
        old_dist = None

        for _ in range(self._epochs):

            # Calculate Actor Loss
            if self._is_continous:
                actions_mu, actions_var, v = self.net(states)
                assert actions_var.shape == actions_shape, actions_var.shape
                assert actions_mu.shape == actions_shape, actions_mu.shape
                assert len(self._action_space.shape) == 1
                log_probs_arr = []
                for action_idx in range(self._action_space.shape[0]):
                    action_mu = actions_mu[:, action_idx]
                    action_var = actions_var[:, action_idx]
                    assert action_mu.shape == (batch_size, ), action_mu.shape
                    assert action_var.shape == (batch_size, ), action_var.shape
                    dist = torch.distributions.Normal(action_mu, action_var)
                    sub_actions = actions[:, action_idx]
                    assert sub_actions.shape == (batch_size, )
                    log_probs = dist.log_prob(sub_actions)
                    log_probs_arr.append(log_probs)
                log_probs = torch.stack(log_probs_arr, dim=1)
            else:
                action_logits, _, v = self.net(states)
                assert action_logits.shape == (batch_size,
                                               self._action_space.n)
                dist = torch.distributions.categorical.Categorical(
                    logits=action_logits)
                log_probs = dist.log_prob(actions)
            assert log_probs.shape == actions_shape, log_probs.shape

            if old_log_probs is None:
                old_log_probs = log_probs.detach()
                old_dist = dist

            r = (log_probs - old_log_probs).exp()

            assert not torch.isnan(r).any(), r
            assert r.shape == actions_shape, r.shape
            obj = torch.min(
                r * advantage,
                torch.clamp(r, 1. - self._epsilon, 1. + self._epsilon) *
                advantage)
            assert obj.shape == actions_shape, obj.shape

            # The minus sign is here because the optimizer *minimizes* the
            # loss. If we were updating the weights manually (without the
            # optimizer), we would drop the minus sign.
            actor_loss = -obj.mean()

            # Calculate Critic Loss
            assert v.shape == (batch_size, 1)
            assert target_v.shape == (batch_size, 1)

            critic_loss = critic_loss_fn(v, target_v)

            # Optimize
            loss = critic_loss + actor_loss
            self._optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.net.parameters(), 30.0)
            self._optimizer.step()

            stats.set('loss_actor', actor_loss.detach())
            stats.set('loss_critic', critic_loss.detach())
            # Log gradients
            for p in self.net.parameters():
                if p.grad is not None:
                    stats.set('grad_max', p.grad.abs().max().detach())
                    stats.set('grad_mean', (p.grad**2).mean().sqrt().detach())

        self._buffer.reset()

        # Log stats
        stats.set('optimization_time', time.time() - t0)
        stats.set('ppo_optimization_epochs', self._epochs)
        stats.set('ppo_optimization_samples', batch_size)

        # Log the policy entropy (high entropy = low confidence)
        if self._is_continous:
            action_mu, action_var, _ = self.net(states)
            stats.set('action_variance', action_var.mean().detach())
            stats.set('action_mu_mean', (action_mu**2).mean().sqrt().detach())
            stats.set('action_mu_max', action_mu.abs().max().detach())

        stats.set('entropy', dist.entropy().mean().detach())

        # Log Kullback-Leibler divergence between the new
        # and the old policy.
        kl = torch.distributions.kl.kl_divergence(dist, old_dist)
        stats.set('kl', kl.mean().detach())

        return stats
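The r / torch.clamp / obj block inside the epoch loop of Example #7 is the standard PPO clipped surrogate objective (with actor_loss = -obj.mean() because the optimizer minimizes). In the usual notation:

% PPO clipped surrogate objective implemented by the ratio/clamp block above
L^{CLIP}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \operatorname{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\Big],
\qquad
r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}
            = \exp\big(\log\pi_\theta(a_t \mid s_t) - \log\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)\big)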
Example #8
    def _optimize(self):
        stats = Statistics()
        if self.eval:
            return stats
        if self._step % self._train_freq != 0:
            return stats
        self._policy_net.train(True)

        # Increase ReplayBuffer beta parameter 0.4 → 1.0
        # (These numbers are taken from the Rainbow paper)
        beta0 = 0.4
        beta1 = 1.0
        bonus = min(1.0, self._optimization_step / self._beta_decay)
        beta = beta0 + (beta1 - beta0) * bonus
        try:
            self._buffer.set_beta(beta)
            stats.set('replay_beta', beta)
        except AttributeError:
            # In case it's not a PriorityReplayBuffer
            pass
        states, actions, rewards, next_states, term, ids = self._buffer.sample(
            self._batch_size)

        # Make Replay Buffer values consumable by PyTorch
        states = torch.from_numpy(states).float().to(self._device)
        actions = torch.from_numpy(actions).long().to(self._device)
        actions = torch.unsqueeze(actions, dim=1)
        rewards = torch.from_numpy(rewards).float().to(self._device)
        rewards = torch.unsqueeze(rewards, dim=1)
        # For term states the Q value is calculated differently:
        #   Q(term_state) = R
        term_mask = torch.from_numpy(term).to(self._device)
        term_mask = torch.unsqueeze(term_mask, dim=1)
        term_mask = (1 - term_mask).float()
        next_states = torch.from_numpy(next_states).float().to(self._device)

        # Calculate TD Target
        self._sample_noise()
        if self._double:
            # Double DQN: use target_net for Q values estimation of the
            # next_state and policy_net for choosing the action
            # in the next_state.
            next_q_pnet = self._policy_net(next_states).detach()
            next_actions = torch.argmax(next_q_pnet, dim=1).unsqueeze(dim=1)
        else:
            next_q_tnet = self._target_net(next_states).detach()
            next_actions = torch.argmax(next_q_tnet, dim=1).unsqueeze(dim=1)
        self._sample_noise()
        next_q = self._target_net(next_states).gather(
            1, next_actions).detach()  # detach → don't backpropagate

        next_q = next_q * term_mask  # term_mask is already (1 - term): zero Q for terminal states

        target_q = rewards + self._gamma * next_q

        self._sample_noise()
        q = self._policy_net(states).gather(dim=1, index=actions)

        loss = self._loss_fn(q, target_q)
        try:
            w = self._buffer.importance_sampling_weights(ids)
            w = torch.from_numpy(w).float().to(self._device)
            loss = w * loss
        except AttributeError:
            # Not a priority replay buffer
            pass
        loss = torch.mean(loss)

        stats.set('loss', loss.detach())

        self._optimizer.zero_grad()
        loss.backward()
        for param in self._policy_net.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self._optimizer.step()

        self._update_target_net()

        self._optimization_step += 1

        # Update priorities in the Replay Buffer
        with torch.no_grad():
            buffer_loss = self._buffer_loss_fn(q, target_q)
            buffer_loss = torch.squeeze(buffer_loss)
            buffer_loss = buffer_loss.cpu().numpy()
            try:
                self._buffer.update_priorities(ids, buffer_loss)
            except AttributeError:
                # That's not a priority replay buffer
                pass

        return stats
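The target construction in Example #8 is the usual TD target, with the Double DQN branch selecting the next action under the online (policy) network and evaluating it under the target network. Written out, with term = 1 marking a terminal transition:

% (Double) DQN TD target built in Example #8
y = r + \gamma\,(1 - \mathrm{term})\, Q_{\theta^{-}}\big(s',\ \arg\max_{a'} Q_{\theta}(s', a')\big)

In the non-double branch the argmax is taken under the target network Q_{\theta^-} as well; the loss is then self._loss_fn(q, target_q), weighted by importance-sampling weights when a prioritized replay buffer is used.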