Example #1
 def q(self, observation, prev_action, prev_reward, action):
     model_inputs = buffer_to(
         (observation, prev_action, prev_reward, action),
         device=self.device)
     q1 = self.q_model(*model_inputs)
     q2 = self.q2_model(*model_inputs)
     return q1.cpu(), q2.cpu()
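
The examples above and below all route their inputs through rlpyt's `buffer_to` helper before calling the model. As a point of reference, here is a minimal, self-contained sketch of what such a helper conceptually does (an illustrative stand-in with the hypothetical name `simple_buffer_to`, not the library's actual implementation): it recursively moves every tensor leaf of a possibly nested tuple/namedtuple onto the target device.

import torch
from collections import namedtuple

def simple_buffer_to(buffer_, device=None):
    """Recursively call .to(device) on tensor leaves; pass through None."""
    if buffer_ is None:
        return None
    if isinstance(buffer_, torch.Tensor):
        return buffer_.to(device)
    if isinstance(buffer_, tuple):  # includes namedtuples
        contents = tuple(simple_buffer_to(b, device) for b in buffer_)
        # Rebuild namedtuples with their original type, plain tuples as-is.
        return type(buffer_)(*contents) if hasattr(buffer_, "_fields") else contents
    return buffer_

Inputs = namedtuple("Inputs", ["observation", "prev_action", "prev_reward"])
inputs = Inputs(torch.zeros(4, 3), torch.zeros(4, 2), torch.zeros(4))
inputs = simple_buffer_to(inputs, device="cpu")  # or "cuda" when available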
Example #2
 def target(self, observation, prev_action, prev_reward):
     """Returns the target Q-values for states/observations."""
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     target_q = self.target_model(*model_inputs)
     return target_q.cpu()
 def target_q(self, observation, prev_action, prev_reward, action):
     model_inputs = buffer_to(
         (observation, prev_action, prev_reward, action),
         device=self.device)
     target_q1 = self.target_q1_model(*model_inputs)
     target_q2 = self.target_q2_model(*model_inputs)
     return target_q1.cpu(), target_q2.cpu()
Example #4
 def __call__(self, observation, prev_action, prev_reward):
     """Returns Q-values for states/observations (with grad)."""
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     q = self.model(*model_inputs)
     return q.cpu()
Example #5
 def step(self, observation, prev_action, prev_reward):
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     mu, log_std, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state)
     dist_info = DistInfoStd(mean=mu, log_std=log_std)
     action = self.distribution.sample(dist_info)
     # Model handles None, but Buffer does not, make zeros if needed:
     prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state, torch.zeros_like)
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case: model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     agent_info = AgentInfoRnn(dist_info=dist_info, value=value,
         prev_rnn_state=prev_rnn_state)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)
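
The comment about transposing the RNN state from [N,B,H] to [B,N,H] recurs in several examples below. A small stand-alone illustration with made-up sizes: recurrent states come out of the model layer-first, while per-timestep storage buffers are indexed batch-first, so the state is transposed before being written out.

import torch

N, B, H = 2, 8, 16                    # layers, batch, hidden (example values)
rnn_state = torch.zeros(N, B, H)      # layer-first, as returned by the model
stored = rnn_state.transpose(0, 1)    # [B, N, H] for batch-first storage
assert stored.shape == (B, N, H)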
Example #6
    def step(self, observation, prev_action, prev_reward):
        prev_action = self.distribution.to_onehot(prev_action)
        agent_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)

        pi, value, rnn_state = self.model(*agent_inputs, self.prev_rnn_state)
        dist_info = DistInfo(prob=pi)

        if self.dual_model:
            int_pi, int_value, int_rnn_state = self.model_int(
                *agent_inputs, self.prev_int_rnn_state)
            dist_int_info = DistInfo(prob=int_pi)
            if self._mode == "eval":
                action = self.distribution.sample(dist_info)
            else:
                action = self.distribution.sample(dist_int_info)
        else:
            action = self.distribution.sample(dist_info)

        # Model handles None, but Buffer does not, make zeros if needed:
        prev_rnn_state = self.prev_rnn_state or buffer_func(
            rnn_state, torch.zeros_like)
        # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
        # (Special case: model should always leave B dimension in.)
        prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)

        if self.dual_model:
            prev_int_rnn_state = self.prev_int_rnn_state or buffer_func(
                int_rnn_state, torch.zeros_like)
            prev_int_rnn_state = buffer_method(prev_int_rnn_state, "transpose",
                                               0, 1)
            agent_info = AgentInfoRnnTwin(
                dist_info=dist_info,
                value=value,
                prev_rnn_state=prev_rnn_state,
                dist_int_info=dist_int_info,
                int_value=int_value,
                prev_int_rnn_state=prev_int_rnn_state)
        else:
            agent_info = AgentInfoRnn(dist_info=dist_info,
                                      value=value,
                                      prev_rnn_state=prev_rnn_state)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        self.advance_rnn_state(rnn_state)  # Keep on device.
        if self.dual_model:
            self.advance_int_rnn_state(int_rnn_state)
        return AgentStep(action=action, agent_info=agent_info)
Example #7
    def loss(self, samples):
        """
        Computes losses for twin Q-values against the min of twin target Q-values
        and an entropy term.  Computes reparameterized policy loss, and loss for
        tuning entropy weighting, alpha.  
        
        Input samples have leading batch dimension [B,..] (but not time).
        """
        agent_inputs, target_inputs, action = buffer_to(
            (samples.agent_inputs, samples.target_inputs, samples.action))

        if self.mid_batch_reset and not self.agent.recurrent:
            valid = torch.ones_like(samples.done, dtype=torch.float)  # or None
        else:
            valid = valid_from_done(samples.done)
        if self.bootstrap_timelimit:
            # To avoid non-use of bootstrap when environment is 'done' due to
            # time-limit, turn off training on these samples.
            valid *= (1 - samples.timeout_n.float())

        q1, q2 = self.agent.q(*agent_inputs, action)
        with torch.no_grad():
            target_action, target_log_pi, _ = self.agent.pi(*target_inputs)
            target_q1, target_q2 = self.agent.target_q(*target_inputs, target_action)
        min_target_q = torch.min(target_q1, target_q2)
        target_value = min_target_q - self._alpha * target_log_pi
        disc = self.discount ** self.n_step_return
        y = (self.reward_scale * samples.return_ +
            (1 - samples.done_n.float()) * disc * target_value)

        q1_loss = 0.5 * valid_mean((y - q1) ** 2, valid)
        q2_loss = 0.5 * valid_mean((y - q2) ** 2, valid)

        new_action, log_pi, (pi_mean, pi_log_std) = self.agent.pi(*agent_inputs)
        if not self.reparameterize:
            new_action = new_action.detach()  # No grad.
        log_target1, log_target2 = self.agent.q(*agent_inputs, new_action)
        min_log_target = torch.min(log_target1, log_target2)
        prior_log_pi = self.get_action_prior(new_action.cpu())

        if self.reparameterize:
            pi_losses = self._alpha * log_pi - min_log_target - prior_log_pi
        else:
            raise NotImplementedError

        # if self.policy_output_regularization > 0:
        #     pi_losses += self.policy_output_regularization * torch.mean(
        #         0.5 * pi_mean ** 2 + 0.5 * pi_log_std ** 2, dim=-1)
        pi_loss = valid_mean(pi_losses, valid)

        if self.target_entropy is not None and self.fixed_alpha is None:
            alpha_losses = - self._log_alpha * (log_pi.detach() + self.target_entropy)
            alpha_loss = valid_mean(alpha_losses, valid)
        else:
            alpha_loss = None

        losses = (q1_loss, q2_loss, pi_loss, alpha_loss)
        values = tuple(val.detach() for val in (q1, q2, pi_mean, pi_log_std))
        return losses, values
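
For reference, the Bellman target built in the loss above is y = reward_scale * return_n + (1 - done_n) * discount^n * (min(Q1', Q2') - alpha * log_pi'). Below is a stand-alone numeric sketch of that computation with illustrative values only (no claim about the surrounding agent or sampler; valid-masking is omitted).

import torch

discount, n_step, alpha, reward_scale = 0.99, 1, 0.2, 1.0
return_n = torch.tensor([1.0, 0.5])          # n-step returns
done_n = torch.tensor([0.0, 1.0])            # second sample terminated
target_q1 = torch.tensor([10.0, 3.0])
target_q2 = torch.tensor([9.5, 4.0])
target_log_pi = torch.tensor([-1.2, -0.7])

target_value = torch.min(target_q1, target_q2) - alpha * target_log_pi
disc = discount ** n_step
y = reward_scale * return_n + (1 - done_n) * disc * target_value
q1 = torch.tensor([8.0, 0.4])
q1_loss = 0.5 * ((y - q1) ** 2).mean()       # valid-masking omitted here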
Example #8
    def optimize_agent(self, itr, samples):
        """
        Train the agent, for multiple epochs over minibatches taken from the
        input samples.  Organizes agent inputs from the training data, and
        moves them to device (e.g. GPU) up front, so that minibatches are
        formed within device, without further data transfer.
        """
        recurrent = self.agent.recurrent
        agent_inputs = AgentInputs(  # Move inputs to device once, index there.
            observation=samples.env.observation,
            prev_action=samples.agent.prev_action,
            prev_reward=samples.env.prev_reward,
        )
        agent_inputs = buffer_to(agent_inputs, device=self.agent.device)
        if hasattr(self.agent, "update_obs_rms"):
            self.agent.update_obs_rms(agent_inputs.observation)
        return_, advantage, valid = self.process_returns(samples, self.normalize_rewards)
        loss_inputs = LossInputs(  # So can slice all.
            agent_inputs=agent_inputs,
            action=samples.agent.action,
            return_=return_,
            advantage=advantage,
            valid=valid,
            old_dist_info=samples.agent.agent_info.dist_info,
        )
        if recurrent:
            # Leave in [B,N,H] for slicing to minibatches.
            init_rnn_state = samples.agent.agent_info.prev_rnn_state[0]  # T=0.
        T, B = samples.env.reward.shape[:2]
        opt_info = OptInfo(*([] for _ in range(len(OptInfo._fields))))
        # If recurrent, use whole trajectories, only shuffle B; else shuffle all.
        batch_size = B if self.agent.recurrent else T * B
        mb_size = batch_size // self.minibatches
        for _ in range(self.epochs):
            for idxs in iterate_mb_idxs(batch_size, mb_size, shuffle=True):
                T_idxs = slice(None) if recurrent else idxs % T
                B_idxs = idxs if recurrent else idxs // T
                self.optimizer.zero_grad()
                rnn_state = init_rnn_state[B_idxs] if recurrent else None
                # NOTE: if not recurrent, will lose leading T dim, should be OK.
                loss, entropy, perplexity = self.loss(
                    *loss_inputs[T_idxs, B_idxs], rnn_state)
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    self.agent.parameters(), self.clip_grad_norm)
                self.optimizer.step()

                opt_info.loss.append(loss.item())
                opt_info.gradNorm.append(grad_norm)
                opt_info.entropy.append(entropy.item())
                opt_info.perplexity.append(perplexity.item())
                self.update_counter += 1
        if self.linear_lr_schedule:
            self.lr_scheduler.step()
            self.ratio_clip = self._ratio_clip * (self.n_itr - itr) / self.n_itr
        # if self.vae_lr_scheduler:
        #     self.vae_lr_scheduler.step()

        return opt_info
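
The minibatch loop above relies on an index iterator (`iterate_mb_idxs`). A simplified stand-in is shown below only to make the control flow concrete; the hypothetical name `simple_iterate_mb_idxs` and the exact behavior are assumptions, and the library helper may differ in details. The idea: shuffle the flat index range once per pass and yield fixed-size chunks of indices.

import numpy as np

def simple_iterate_mb_idxs(data_length, minibatch_size, shuffle=True):
    """Yield arrays of indices covering the data in fixed-size minibatches."""
    idxs = np.arange(data_length)
    if shuffle:
        np.random.shuffle(idxs)
    for start in range(0, data_length - minibatch_size + 1, minibatch_size):
        yield idxs[start:start + minibatch_size]

for mb in simple_iterate_mb_idxs(data_length=12, minibatch_size=4):
    print(mb)  # three minibatches of 4 shuffled indices each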
Example #9
    def inverse_loss(self, samples):
        observation = samples.observation[0]  # [T,B,C,H,W]->[B,C,H,W]
        last_observation = samples.observation[-1]

        if self.random_shift_prob > 0.:
            observation = random_shift(
                imgs=observation,
                pad=self.random_shift_pad,
                prob=self.random_shift_prob,
            )
            last_observation = random_shift(
                imgs=last_observation,
                pad=self.random_shift_pad,
                prob=self.random_shift_prob,
            )

        action = samples.action  # [T,B,A]
        # if self.onehot_actions:
        #     action = to_onehot(action, self._act_dim, dtype=torch.float)
        observation, last_observation, action = buffer_to(
            (observation, last_observation, action), device=self.device)

        _, conv_obs = self.encoder(observation)
        _, conv_last = self.encoder(last_observation)

        valid = valid_from_done(samples.done).type(torch.bool)  # [T,B]
        # All timesteps invalid if the last_observation is:
        valid = valid[-1].repeat(self.delta_T, 1).transpose(1, 0)  # [B,T-1]

        if self.onehot_actions:
            logits = self.inverse_model(conv_obs, conv_last)  # [B,T-1,A]
            labels = action[:-1].transpose(1, 0)  # [B,T-1], not the last action
            labels[~valid] = IGNORE_INDEX

            b, t, a = logits.shape
            logits = logits.view(b * t, a)
            labels = labels.reshape(b * t)
            logits = logits - torch.max(logits, dim=1, keepdim=True)[0]
            inv_loss = self.c_e_loss(logits, labels)

            valid = valid.reshape(b * t).to(self.device)
            dist_info = DistInfo(prob=F.softmax(logits, dim=1))
            entropy = self.distribution.mean_entropy(
                dist_info=dist_info,
                valid=valid,
            )
            entropy_loss = -self.entropy_loss_coeff * entropy

            correct = torch.argmax(logits.detach(), dim=1) == labels
            accuracy = torch.mean(correct[valid].float())

        else:
            raise NotImplementedError

        perplexity = self.distribution.mean_perplexity(dist_info,
                                                       valid.to(self.device))

        return inv_loss, entropy_loss, accuracy, perplexity, conv_obs
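
The masking of invalid timesteps above works by overwriting their labels with IGNORE_INDEX before the cross-entropy call. Below is a brief, self-contained illustration of that mechanism using PyTorch's `ignore_index` option; the sentinel value and shapes here are assumptions for the example.

import torch
import torch.nn as nn

IGNORE = -100                                  # assumed sentinel value
loss_fn = nn.CrossEntropyLoss(ignore_index=IGNORE)
logits = torch.randn(6, 4)                     # [B*T, A]
labels = torch.tensor([0, 2, 1, IGNORE, 3, IGNORE])  # masked entries skipped
loss = loss_fn(logits, labels)                 # averaged over valid entries only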
Example #10
 def __call__(self,
              observation,
              prev_action,
              prev_reward,
              sampled_option,
              device="cpu"):
     """Performs forward pass on training data, for algorithm. Returns sampled distinfo, q, beta, and piomega distinfo"""
     model_inputs = buffer_to(
         (observation, prev_action, prev_reward, sampled_option),
         device=self.device)
     mu, log_std, beta, q, pi = self.model(*model_inputs[:-1])
     # Need gradients from intra-option (DistInfoStd), q_o (q), termination (beta), and pi_omega (DistInfo)
     return buffer_to(
         (DistInfoStd(mean=select_at_indexes(sampled_option, mu),
                      log_std=select_at_indexes(sampled_option, log_std)),
          q, beta, DistInfo(prob=pi)),
         device=device)
    def pi(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)

        actions, dist_info = [], []
        log_pi_total = 0
        self.model.start()
        while self.model.has_next():
            dist_info.append(self.model.next(actions, *model_inputs))
            action, log_pi = self.model.sample_loglikelihood(dist_info[-1])

            log_pi_total += log_pi
            actions.append(action)

        log_pi_total = buffer_to(log_pi_total, device="cpu")
        action = torch.cat(actions, dim=-1)
        return action, log_pi_total, None  # Action stays on device for q models.
Example #12
 def step(self, observation, prev_action, prev_reward):
     """Computes Q-values for states/observations and selects actions by
     epsilon-greedy (no grad).  Advances RNN state."""
     prev_action = self.distribution.to_onehot(prev_action)
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     q, rnn_state = self.model(*agent_inputs, self.prev_rnn_state)  # Model handles None.
     q = q.cpu()
     action = self.distribution.sample(q)
     prev_rnn_state = self.prev_rnn_state if self.prev_rnn_state is not None else buffer_func(rnn_state, torch.zeros_like)
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case, model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     prev_rnn_state = buffer_to(prev_rnn_state, device="cpu")
     agent_info = AgentInfo(q=q, prev_rnn_state=prev_rnn_state)
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)
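
Action selection in this example happens inside `self.distribution.sample(q)`, which for a DQN-style agent is epsilon-greedy over the Q-values. A minimal, self-contained sketch of that rule follows; the function name `epsilon_greedy_sample` is hypothetical, and the library's distribution object additionally manages epsilon schedules and per-environment epsilons.

import torch

def epsilon_greedy_sample(q, epsilon=0.1):
    """q: [B, A] Q-values -> [B] integer actions."""
    greedy = torch.argmax(q, dim=-1)
    random = torch.randint(low=0, high=q.shape[-1], size=greedy.shape)
    explore = torch.rand(greedy.shape) < epsilon
    return torch.where(explore, random, greedy)

actions = epsilon_greedy_sample(torch.randn(5, 4), epsilon=0.1)  # [5] actions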
Example #13
 def step(self, observation, prev_action, prev_reward):
     """Computes Q-values for states/observations and selects actions by
     epsilon-greedy (no grad)."""
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     q = self.model(*model_inputs)
     return self.to_agent_step(q)
Example #14
 def value(self, observation, prev_action, prev_reward):
     """
     Compute the value estimate for the environment state, e.g. for the
     bootstrap value, V(s_{T+1}), in the sampler.  (no grad)
     """
     model_inputs = buffer_to((observation, ), device=self.device)[0]
     _, _, value, _ = self.model(model_inputs)
     return value.to("cpu")
Example #15
 def step(self, observation, prev_action, prev_reward):
     """Compute the discrete distribution for the Q-value for each
     action for each state/observation (no grad)."""
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     p = self.model(*model_inputs)
     return self.to_agent_step(p)
Example #16
 def q_at_mu(self, observation, prev_action, prev_reward):
     """Compute Q-value for input state/observation, through the mu_model
     (with grad)."""
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     mu = self.model(*model_inputs)
     q = self.q_model(*model_inputs, mu)
     return q.cpu()
Example #17
 def target(self, observation, prev_action, prev_reward, init_rnn_state):
     # Assume init_rnn_state already shaped: [N,B,H]
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to(
         (observation, prev_action, prev_reward, init_rnn_state),
         device=self.device)
     target_q, rnn_state = self.target_model(*model_inputs)
     return target_q.cpu(), rnn_state  # Leave rnn state on device.
Example #18
 def target_q_at_mu(self, observation, prev_action, prev_reward):
     """Compute target Q-value for input state/observation, through the
     target mu_model."""
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     target_mu = self.target_model(*model_inputs)
     target_q_at_mu = self.target_q_model(*model_inputs, target_mu)
     return target_q_at_mu.cpu()
Example #19
 def target_q(self, observation, prev_action, prev_reward):
     """Compute twin target Q-values for state/observation and input
     action."""
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     target_q1 = self.target_q1_model(*model_inputs)
     target_q2 = self.target_q2_model(*model_inputs)
     return target_q1.cpu(), target_q2.cpu()
Example #20
 def step(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     q, rnn_state = self.model(*agent_inputs,
                               self.prev_rnn_state)  # Model handles None.
     q = q.cpu()
     action = self.distribution.sample(q)
     prev_rnn_state = self.prev_rnn_state or buffer_func(
         rnn_state, torch.zeros_like)
     # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
     # (Special case, model should always leave B dimension in.)
     prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
     prev_rnn_state = buffer_to(prev_rnn_state, device="cpu")
     agent_info = AgentInfo(q=q, prev_rnn_state=prev_rnn_state)
     self.advance_rnn_state(rnn_state)  # Keep on device.
     return AgentStep(action=action, agent_info=agent_info)
Example #21
 def reconstructions(self, observation, prev_action, prev_reward):
     prev_action = self.distribution.to_onehot(prev_action)
     observation = observation.type(torch.float)
     observation = observation.mul_(1. / 255)
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     _pi, _value, _latent, reconstruction = self.model(*model_inputs)
     return reconstruction.to("cpu")
Example #22
 def q(self, observation, prev_action, prev_reward):
     """Compute twin Q-values for state/observation and input action 
     (with grad)."""
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     q1 = self.q1_model(*model_inputs)
     q2 = self.q2_model(*model_inputs)
     return q1.cpu(), q2.cpu()
Example #23
 def step(self, observation, prev_action, prev_reward):
     """"
     Compute policy's action distribution from inputs, and sample an
     action. Calls the model to produce mean, log_std, value estimate, and
     next recurrent state.  Moves inputs to device and returns outputs back
     to CPU, for the sampler.  Advances the recurrent state of the agent.
     (no grad)
     """
     model_inputs = buffer_to((observation, prev_action), device=self.device)
     action, state = self.model(*model_inputs, self.prev_rnn_state)
     action = self.exploration(action)
     # Model handles None, but Buffer does not, make zeros if needed:
     prev_state = self.prev_rnn_state or buffer_func(state, torch.zeros_like)
     self.advance_rnn_state(state)
     agent_info = DreamerAgentInfo(prev_state=prev_state)
     agent_step = AgentStep(action=action, agent_info=agent_info)
     return buffer_to(agent_step, device='cpu')
Example #24
 def step(self, observation, prev_action, prev_reward):
     """Computes Q-values for states/observations and selects actions by
     epsilon-greedy (no grad).  Advances RNN state."""
     prev_action = self.distribution.to_onehot(prev_action)
     agent_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     output = self.model(*agent_inputs, self.prev_rnn_state)  # Model handles None.
     return self.to_agent_step(output)
Example #25
 def predict_next_obs_at_mu(self, observation, prev_action, prev_reward):
     """Compute Q-value for input state/observation, through the mu_model
     (with grad)."""
     model_inputs = buffer_to((observation, prev_action, prev_reward),
         device=self.device)
     mu = self.model(*model_inputs)
     next_obs = self.d_model(
         *model_inputs, mu) + model_inputs[0] # model_inputs[0] is the observation
     return next_obs.cpu()
Example #26
 def target_q_at_mu(self, observation, prev_action, prev_reward):
     model_inputs = buffer_to((observation, prev_action, prev_reward),
                              device=self.device)
     target_mu = self.target_model(*model_inputs)
     target_action = self.target_distribution.sample(
         DistInfo(mean=target_mu))
     target_q1_at_mu = self.target_q_model(*model_inputs, target_action)
     target_q2_at_mu = self.target_q2_model(*model_inputs, target_action)
     return target_q1_at_mu.cpu(), target_q2_at_mu.cpu()
Example #27
 def value(self, observation, prev_action, prev_reward):
     """
     Compute the value estimate for the environment state using the
     currently held recurrent state, without advancing the recurrent state,
     e.g. for the bootstrap value V(s_{T+1}), in the sampler.  (no grad)
     """
     agent_inputs = buffer_to((observation, prev_action), device=self.device)
     action, action_dist, value, reward, state = self.model(*agent_inputs, self.prev_rnn_state)
     return value.to("cpu")
Example #28
 def to_agent_step(self, output):
     """Convert the output of the NN model into step info for the agent.
     """
     q = output
     # q = q.cpu()
     action = self.distribution.sample(q)
     agent_info = AgentInfo(q=q)
     action, agent_info = buffer_to((action, agent_info), device="cpu")
     return AgentStep(action=action, agent_info=agent_info)
Example #29
 def predict_obs_delta(self, observation, prev_action, prev_reward, action, train=True):
     """Compute the next state for input state/observation and action (with grad)."""
     model_inputs = buffer_to((observation, prev_action, prev_reward,
         action), device=self.device)
     predict_obs_delta = self.d_model(*model_inputs, train=train)
     # Warning: ideally, the agent's output should always be on CPU, but due
     # to the complexity of migrating the GP output from GPU to CPU, it is
     # left on device and data synchronization is deferred to the algorithm.
     return predict_obs_delta
Example #30
 def __call__(self, observation, prev_action, prev_reward):
     """
     __call__ makes a class instance callable like a function: if `agent` is a
     DqnAgent object, then agent(observation, prev_action, prev_reward) is
     equivalent to calling agent.__call__(observation, prev_action, prev_reward).
     """
     prev_action = self.distribution.to_onehot(prev_action)
     model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device)
     q = self.model(*model_inputs)  # Instance of a torch.nn.Module subclass; calling it uses torch.nn.Module.__call__, which computes the model output (a Tensor).
     return q.cpu()  # Move the tensor to CPU (host memory).
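
A tiny illustration of the point made in the docstring, using a hypothetical `Doubler` class: defining `__call__` lets an instance be invoked like a function.

class Doubler:
    def __call__(self, x):
        return 2 * x

d = Doubler()
assert d(3) == d.__call__(3) == 6  # instance call dispatches to __call__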