def value_loss(self, trajectory, act=None):
    """ Computes value loss. """
    if act is None:
        act = self.policy.act(trajectory, training=True)
    if "value_targets" not in trajectory:
        raise ValueError("trajectory does not contain 'value_targets'")

    value_targets = self.torch_from_numpy(trajectory["value_targets"])
    old_value_preds = self.torch_from_numpy(trajectory["values"])
    values = act["values"]

    if values.shape != value_targets.shape:
        raise ValueError("trajectory has mismatched shapes "
                         f"values.shape={values.shape} "
                         f"value_targets.shape={value_targets.shape}")

    value_loss = torch.pow(values - value_targets, 2)
    if self.cliprange is not None:
        values_clipped = old_value_preds + torch.clamp(
            values - old_value_preds, -self.cliprange, self.cliprange)
        value_loss_clipped = torch.pow(values_clipped - value_targets, 2)
        value_loss = torch.max(value_loss, value_loss_clipped)

    value_loss = torch.mean(value_loss)
    if summary.should_record():
        summaries = dict(value_loss=value_loss,
                         value_targets=torch.mean(value_targets),
                         value_preds=torch.mean(values),
                         r_squared=r_squared(value_targets, values))
        for key, val in summaries.items():
            summary.add_scalar(f"ppo/{key}", val,
                               global_step=self.call_count)
    return value_loss
def value_loss(self, trajectory, act=None):
    """ Compute value loss. """
    if act is None:
        act = self.policy.act(trajectory, training=True)

    values = act["values"]
    value_targets = self.torch_from_numpy(trajectory["value_targets"])
    if values.shape != value_targets.shape:
        raise ValueError("trajectory has mismatched shapes "
                         f"values.shape={values.shape} "
                         f"value_targets.shape={value_targets.shape}")

    value_loss = torch.mean(torch.pow(values - value_targets, 2))
    if summary.should_record():
        summaries = dict(value_targets=torch.mean(value_targets),
                         value_preds=torch.mean(values),
                         value_loss=value_loss,
                         r_squared=r_squared(values, value_targets))
        for key, val in summaries.items():
            summary.add_scalar(f"{self.name}/{key}", val,
                               global_step=self.call_count)
    return value_loss
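# Both value losses above log an `r_squared` diagnostic for the value head.
# A minimal sketch of such a helper is given below, assuming it computes the
# coefficient of determination between value targets and value predictions;
# the exact argument order, name and zero-variance handling are assumptions
# for illustration and may differ from the helper used in this codebase.
import torch

def r_squared(targets, predictions):
    """ Returns 1 - Var[targets - predictions] / Var[targets]. """
    residual_variance = torch.var(targets - predictions)
    targets_variance = torch.var(targets)
    return 1. - residual_variance / targets_variance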
def policy_loss(self, trajectory, act=None):
    """ Compute policy loss including entropy regularization. """
    if act is None:
        act = self.policy.act(trajectory, training=True)

    log_prob = act["distribution"].log_prob(
        self.torch_from_numpy(trajectory["actions"]))
    advantages = self.torch_from_numpy(trajectory["advantages"])
    if log_prob.shape != advantages.shape:
        raise ValueError("trajectory has mismatched shapes: "
                         f"log_prob.shape={log_prob.shape} "
                         f"advantages.shape={advantages.shape}")

    policy_loss = -torch.mean(log_prob * advantages)
    entropy = torch.mean(act["distribution"].entropy())
    if summary.should_record():
        summaries = dict(advantages=torch.mean(advantages),
                         entropy=entropy,
                         policy_loss=policy_loss)
        for key, val in summaries.items():
            summary.add_scalar(f"{self.name}/{key}", val,
                               global_step=self.call_count)
    return policy_loss - self.entropy_coef * entropy
def __call__(self, data):
    obs, actions, rewards, resets, next_obs = (
        self.torch_from_numpy(data[k])
        for k in ("observations", "actions", "rewards",
                  "resets", "next_observations"))

    qtargets = self.compute_targets(rewards, resets, next_obs)
    qvalues = self.make_predictions(obs, actions)
    if "update_priorities" in data:
        data["update_priorities"](
            torch.abs(qtargets - qvalues).cpu().detach().numpy())

    weights = None
    if "weights" in data:
        weights = self.torch_from_numpy(data["weights"])
    loss = huber_loss(qtargets, qvalues, weights=weights)

    if summary.should_record():
        summary.add_scalar(f"{self.name}/r_squared",
                           r_squared(qtargets, qvalues),
                           global_step=self.call_count)
        summary.add_scalar(f"{self.name}/loss", loss,
                           global_step=self.call_count)
    self.call_count += 1
    return loss
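# The Q-learning loss above delegates to a `huber_loss` helper with optional
# per-sample importance weights (e.g. as produced by prioritized replay).
# Below is a minimal sketch under the assumed signature
# huber_loss(targets, predictions, weights=None); the delta value and the
# mean reduction are assumptions and may differ from the library's own helper.
import torch

def huber_loss(targets, predictions, weights=None, delta=1.):
    """ Huber (smooth-L1) loss, optionally weighted per sample. """
    abs_error = torch.abs(targets - predictions)
    quadratic = torch.clamp(abs_error, max=delta)
    linear = abs_error - quadratic
    losses = 0.5 * quadratic ** 2 + delta * linear
    if weights is not None:
        losses = losses * weights
    return torch.mean(losses)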
def __call__(self, data):
    act = self.policy.act(data, training=True)
    policy_loss = self.policy_loss(data, act)
    value_loss = self.value_loss(data, act)
    loss = policy_loss + self.value_loss_coef * value_loss
    if summary.should_record():
        summary.add_scalar("ppo/loss", loss, global_step=self.call_count)
    self.call_count += 1
    return loss
def step(self, alg):
    """ Performs a single training step of the given algorithm. """
    alg.accumulate_gradients()
    self.preprocess_gradients(alg.model.parameters(), alg.name)
    for anneal in self.anneals:
        if summary.should_record():
            anneal.summarize(alg.runner.step_count)
        anneal.step_to(alg.runner.step_count)
    self.optimizer.step()
    self.optimizer.zero_grad()
    self.step_count += 1
def preprocess_gradients(self, parameters, name):
    """ Applies gradient preprocessing. """
    grad_norm = None
    if self.max_grad_norm is not None:
        grad_norm = torch.nn.utils.clip_grad_norm_(parameters,
                                                   self.max_grad_norm)
    if summary.should_record():
        if grad_norm is None:
            grad_norm = total_norm(p.grad for p in parameters
                                   if p.grad is not None)
        summary.add_scalar(f"{name}/grad_norm", grad_norm,
                           global_step=self.step_count)
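# `total_norm` above is only needed when gradients are not clipped, so the
# norm is not already reported by clip_grad_norm_. A minimal sketch is given
# below, assuming it returns the global L2 norm over an iterable of gradient
# tensors (mirroring what clip_grad_norm_ reports); the actual helper in this
# codebase may be implemented differently.
import torch

def total_norm(tensors, norm_type=2.):
    """ Computes the global norm of an iterable of tensors. """
    norms = [torch.norm(tensor.detach(), norm_type) for tensor in tensors]
    return torch.norm(torch.stack(norms), norm_type)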
def run(self, obs=None):
    if not self.initialized_storage:
        obs = self.initialize_storage(obs=obs)
    for interactions in self.runner.run(obs=obs):
        interactions = [
            interactions[k]
            for k in ("observations", "actions", "rewards", "resets")
        ]
        self.storage.add_batch(*interactions)
        for anneal in self.anneals:
            if summary.should_record():
                anneal.summarize(self.step_count)
            anneal.step_to(self.step_count)
        yield self.storage.sample(self.batch_size)
def policy_loss(self, trajectory, act=None):
    """ Compute policy loss (including entropy regularization). """
    if act is None:
        act = self.policy.act(trajectory, training=True)
    if "advantages" not in trajectory:
        raise ValueError("trajectory does not contain 'advantages'")

    old_log_prob = self.torch_from_numpy(trajectory["log_prob"])
    advantages = self.torch_from_numpy(trajectory["advantages"])
    actions = self.torch_from_numpy(trajectory["actions"])

    log_prob = act["distribution"].log_prob(actions)
    if log_prob.shape != old_log_prob.shape:
        raise ValueError("trajectory has mismatched shapes: "
                         f"log_prob.shape={log_prob.shape} "
                         f"old_log_prob.shape={old_log_prob.shape}")
    if log_prob.shape != advantages.shape:
        raise ValueError("trajectory has mismatched shapes: "
                         f"log_prob.shape={log_prob.shape} "
                         f"advantages.shape={advantages.shape}")

    ratio = torch.exp(log_prob - old_log_prob)
    policy_loss = -ratio * advantages
    if self.cliprange is not None:
        ratio_clipped = torch.clamp(ratio, 1. - self.cliprange,
                                    1. + self.cliprange)
        policy_loss_clipped = -ratio_clipped * advantages
        policy_loss = torch.max(policy_loss, policy_loss_clipped)

    policy_loss = torch.mean(policy_loss)
    entropy = torch.mean(act["distribution"].entropy())
    if summary.should_record():
        summaries = dict(advantages=torch.mean(advantages),
                         policy_loss=policy_loss,
                         entropy=entropy)
        for key, val in summaries.items():
            summary.add_scalar(f"ppo/{key}", val,
                               global_step=self.call_count)
    return policy_loss - self.entropy_coef * entropy
def should_add_summaries(self):
    """ Returns `True` if it is time to write summaries. """
    return summary.should_record() and np.all(self.had_ended_episodes)