Example #1
    def value_loss(self, trajectory, act=None):
        """ Computes value loss. """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        if "value_targets" not in trajectory:
            raise ValueError("trajectory does not contain 'value_targets'")

        value_targets = self.torch_from_numpy(trajectory["value_targets"])
        old_value_preds = self.torch_from_numpy(trajectory["values"])
        values = act["values"]

        if values.shape != value_targets.shape:
            raise ValueError("trajectory has mismatched shapes "
                             f"values.shape={values.shape} "
                             f"value_targets.shape={value_targets.shape}")

        value_loss = torch.pow(values - value_targets, 2)
        if self.cliprange is not None:
            values_clipped = old_value_preds + torch.clamp(
                values - old_value_preds, -self.cliprange, self.cliprange)
            value_loss_clipped = torch.pow(values_clipped - value_targets, 2)
            value_loss = torch.max(value_loss, value_loss_clipped)

        value_loss = torch.mean(value_loss)
        if summary.should_record():
            summaries = dict(value_loss=value_loss,
                             value_targets=torch.mean(value_targets),
                             value_preds=torch.mean(values),
                             r_squared=r_squared(value_targets, values))
            for key, val in summaries.items():
                summary.add_scalar(f"ppo/{key}",
                                   val,
                                   global_step=self.call_count)
        return value_loss
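
The loss above is the PPO-style clipped value loss: the new value prediction is kept within `cliprange` of the old prediction, and the elementwise maximum of the clipped and unclipped squared errors is averaged. A minimal self-contained sketch of the same computation (the function and argument names here are illustrative, not part of the library):

import torch

def clipped_value_loss(values, old_values, targets, cliprange=0.2):
    # Unclipped squared error.
    loss = (values - targets) ** 2
    # Squared error with the new prediction clipped around the old one.
    values_clipped = old_values + torch.clamp(values - old_values,
                                              -cliprange, cliprange)
    loss_clipped = (values_clipped - targets) ** 2
    # Pessimistic elementwise maximum, then mean over the batch.
    return torch.max(loss, loss_clipped).mean()
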
Example #2
    def value_loss(self, trajectory, act=None):
        """ Compute value loss. """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        values = act["values"]
        value_targets = self.torch_from_numpy(trajectory["value_targets"])

        if values.shape != value_targets.shape:
            raise ValueError("trajectory has mismatched shapes "
                             f"values.shape={values.shape} "
                             f"value_targets.shape={value_targets.shape}")

        value_loss = torch.mean(torch.pow(values - value_targets, 2))

        if summary.should_record():
            summaries = dict(value_targets=torch.mean(value_targets),
                             value_preds=torch.mean(values),
                             value_loss=value_loss,
                             r_squared=r_squared(values, value_targets))
            for key, val in summaries.items():
                summary.add_scalar(f"{self.name}/{key}",
                                   val,
                                   global_step=self.call_count)

        return value_loss
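
Both value-loss variants log an `r_squared` diagnostic, i.e. the explained variance of the value predictions. Its implementation is not shown in these examples; a plausible sketch under the standard definition 1 - Var[targets - preds] / Var[targets] (note that the two examples above pass the arguments in different orders, so the signature below is a guess):

import torch

def r_squared(targets, preds):
    # Fraction of the targets' variance explained by the predictions:
    # 1 for a perfect fit, 0 or below for a poor one.
    return 1. - torch.var(targets - preds) / torch.var(targets)
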
Example #3
    def policy_loss(self, trajectory, act=None):
        """ Compute policiy loss including entropy regularization. """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        log_prob = act["distribution"].log_prob(
            self.torch_from_numpy(trajectory["actions"]))
        advantages = self.torch_from_numpy(trajectory["advantages"])

        if log_prob.shape != advantages.shape:
            raise ValueError("trajectory has mismatched shapes: "
                             f"log_prob.shape={log_prob.shape} "
                             f"advantages.shape={advantages.shape}")

        policy_loss = -torch.mean(log_prob * advantages)
        entropy = torch.mean(act["distribution"].entropy())

        if summary.should_record():
            summaries = dict(advantages=torch.mean(advantages),
                             entropy=entropy,
                             policy_loss=policy_loss)
            for key, val in summaries.items():
                summary.add_scalar(f"{self.name}/{key}",
                                   val,
                                   global_step=self.call_count)

        return policy_loss - self.entropy_coef * entropy
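
This is the unclipped (A2C-style) policy gradient objective, -mean(log_prob * advantages), with an entropy bonus subtracted to encourage exploration. A toy standalone illustration of the same quantities with a categorical policy (all data below is made up):

import torch
from torch.distributions import Categorical

logits = torch.randn(4, 3, requires_grad=True)  # 4 states, 3 actions
dist = Categorical(logits=logits)
actions = torch.tensor([0, 2, 1, 0])
advantages = torch.tensor([1.0, -0.5, 0.3, 2.0])

policy_loss = -(dist.log_prob(actions) * advantages).mean()
entropy = dist.entropy().mean()
loss = policy_loss - 0.01 * entropy  # entropy_coef assumed to be 0.01
loss.backward()
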
Example #4
    def __call__(self, data):
        obs, actions, rewards, resets, next_obs = (self.torch_from_numpy(
            data[k]) for k in ("observations", "actions", "rewards", "resets",
                               "next_observations"))

        qtargets = self.compute_targets(rewards, resets, next_obs)
        qvalues = self.make_predictions(obs, actions)
        if "update_priorities" in data:
            data["update_priorities"](
                torch.abs(qtargets - qvalues).cpu().detach().numpy())

        weights = None
        if "weights" in data:
            weights = self.torch_from_numpy(data["weights"])
        loss = huber_loss(qtargets, qvalues, weights=weights)

        if summary.should_record():
            summary.add_scalar(f"{self.name}/r_squared",
                               r_squared(qtargets, qvalues),
                               global_step=self.call_count)
            summary.add_scalar(f"{self.name}/loss",
                               loss,
                               global_step=self.call_count)
        self.call_count += 1
        return loss
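
The `huber_loss` helper used above is not shown in these examples. A sketch of what a weighted Huber loss could look like, assuming optional per-element importance-sampling weights and the usual transition point delta=1 (the signature and defaults are assumptions):

import torch

def huber_loss(targets, predictions, weights=None, delta=1.0):
    # Quadratic for small errors, linear for large ones.
    abs_error = torch.abs(targets - predictions)
    quadratic = torch.clamp(abs_error, max=delta)
    linear = abs_error - quadratic
    loss = 0.5 * quadratic ** 2 + delta * linear
    if weights is not None:
        loss = loss * weights  # importance-sampling correction
    return loss.mean()
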
Example #5
    def __call__(self, data):
        act = self.policy.act(data, training=True)
        policy_loss = self.policy_loss(data, act)
        value_loss = self.value_loss(data, act)
        loss = policy_loss + self.value_loss_coef * value_loss
        if summary.should_record():
            summary.add_scalar("ppo/loss", loss, global_step=self.call_count)
        self.call_count += 1
        return loss
Example #6
    def step(self, alg):
        """ Performs a single training step of the given algorithm. """
        alg.accumulate_gradients()
        self.preprocess_gradients(alg.model.parameters(), alg.name)
        for anneal in self.anneals:
            if summary.should_record():
                anneal.summarize(alg.runner.step_count)
            anneal.step_to(alg.runner.step_count)
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.step_count += 1
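
The method above follows the standard PyTorch update pattern: gradients are accumulated by backward passes, optionally preprocessed (e.g. clipped), and then applied and cleared. The same pattern in plain PyTorch on a toy model (everything below is illustrative, not the library's API):

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()                                            # accumulate gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)    # preprocess gradients
optimizer.step()
optimizer.zero_grad()
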
Example #7
    def preprocess_gradients(self, parameters, name):
        """ Applies gradient preprocessing. """
        grad_norm = None
        if self.max_grad_norm is not None:
            grad_norm = torch.nn.utils.clip_grad_norm_(parameters,
                                                       self.max_grad_norm)
        if summary.should_record():
            if grad_norm is None:
                grad_norm = total_norm(p.grad for p in parameters
                                       if p.grad is not None)
            summary.add_scalar(f"{name}/grad_norm",
                               grad_norm,
                               global_step=self.step_count)
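
The `total_norm` helper is also not shown. A plausible sketch, assuming it computes the same global L2 norm over the gradients that `clip_grad_norm_` would report:

import torch

def total_norm(tensors):
    """ Global L2 norm of an iterable of tensors. """
    return torch.sqrt(sum(torch.sum(t ** 2) for t in tensors))
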
Example #8
    def run(self, obs=None):
        if not self.initialized_storage:
            obs = self.initialize_storage(obs=obs)
        for interactions in self.runner.run(obs=obs):
            interactions = [
                interactions[k]
                for k in ("observations", "actions", "rewards", "resets")
            ]
            self.storage.add_batch(*interactions)
            for anneal in self.anneals:
                if summary.should_record():
                    anneal.summarize(self.step_count)
                anneal.step_to(self.step_count)
            yield self.storage.sample(self.batch_size)
Example #9
    def policy_loss(self, trajectory, act=None):
        """ Compute policy loss (including entropy regularization). """
        if act is None:
            act = self.policy.act(trajectory, training=True)
        if "advantages" not in trajectory:
            raise ValueError("trajectory does not contain 'advantages'")

        old_log_prob = self.torch_from_numpy(trajectory["log_prob"])
        advantages = self.torch_from_numpy(trajectory["advantages"])
        actions = self.torch_from_numpy(trajectory["actions"])

        log_prob = act["distribution"].log_prob(actions)
        if log_prob.shape != old_log_prob.shape:
            raise ValueError("trajectory has mismatched shapes: "
                             f"log_prob.shape={log_prob.shape} "
                             f"old_log_prob.shape={old_log_prob.shape}")
        if log_prob.shape != advantages.shape:
            raise ValueError("trajectory has mismatched shapes: "
                             f"log_prob.shape={log_prob.shape} "
                             f"advantages.shape={advantages.shape}")

        ratio = torch.exp(log_prob - old_log_prob)
        policy_loss = -ratio * advantages
        if self.cliprange is not None:
            ratio_clipped = torch.clamp(ratio, 1. - self.cliprange,
                                        1. + self.cliprange)
            policy_loss_clipped = -ratio_clipped * advantages
            policy_loss = torch.max(policy_loss, policy_loss_clipped)

        policy_loss = torch.mean(policy_loss)
        entropy = torch.mean(act["distribution"].entropy())

        if summary.should_record():
            summaries = dict(advantages=torch.mean(advantages),
                             policy_loss=policy_loss,
                             entropy=entropy)
            for key, val in summaries.items():
                summary.add_scalar(f"ppo/{key}",
                                   val,
                                   global_step=self.call_count)

        return policy_loss - self.entropy_coef * entropy
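
This is the PPO clipped surrogate objective: the probability ratio exp(log_prob - old_log_prob) is clipped to [1 - cliprange, 1 + cliprange] and the elementwise maximum of the two negated surrogate terms is averaged. A tiny numeric illustration of the clipping (the tensors below are made up):

import torch

cliprange = 0.2
log_prob = torch.tensor([-0.9, -1.2, -0.3])
old_log_prob = torch.tensor([-1.0, -1.0, -1.0])
advantages = torch.tensor([1.0, -1.0, 0.5])

ratio = torch.exp(log_prob - old_log_prob)
ratio_clipped = torch.clamp(ratio, 1. - cliprange, 1. + cliprange)
policy_loss = torch.max(-ratio * advantages,
                        -ratio_clipped * advantages).mean()
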
Example #10
    def should_add_summaries(self):
        """ Returns `True` if it is time to write summaries. """
        return summary.should_record() and np.all(self.had_ended_episodes)