def process_fn(
    self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray
) -> Batch:
    """Attach critic values, returns and advantages to ``batch``.

    The critic is evaluated without gradient tracking on ``obs`` and
    ``obs_next`` in chunks of ``self._batch``.  The flattened ``obs`` values
    are stored as ``batch.v_s``.  Returns and advantages come from
    ``self.compute_episodic_return``; when ``self._rew_norm`` is set the
    returns are divided by ``sqrt(self.ret_rms.var + self._eps)`` and
    ``self.ret_rms`` is updated with the unnormalized returns.

    :param batch: data to annotate; modified in place and returned.
    :param buffer: forwarded to ``compute_episodic_return``.
    :param indice: forwarded to ``compute_episodic_return``.
    :return: the same ``batch`` with ``v_s``, ``returns``, ``adv`` set and
        ``act``/``returns``/``adv`` converted via ``to_torch_as`` using
        ``batch.v_s`` as the reference tensor.
    """
    with torch.no_grad():
        # Evaluate obs first, then obs_next, for every mini-batch in order.
        value_pairs = [
            (self.critic(minibatch.obs), self.critic(minibatch.obs_next))
            for minibatch in batch.split(
                self._batch, shuffle=False, merge_last=True)
        ]
    batch.v_s = torch.cat(
        [current for current, _ in value_pairs], dim=0).flatten()  # old value
    values = to_numpy(batch.v_s)
    next_values = to_numpy(
        torch.cat([following for _, following in value_pairs], dim=0).flatten())

    # When normalizing values we deliberately do not subtract
    # self.ret_rms.mean, to stay numerically consistent with the OpenAI
    # baselines value-normalization pipeline.  Empirical study also suggests
    # that subtracting the mean hurts performance slightly for unknown
    # reasons (on MuJoCo envs; not confident, though).
    if self._rew_norm:
        # Undo the normalization of the critic outputs.  Out-of-place
        # multiplication on purpose: ``values`` may share memory with
        # ``batch.v_s``.
        scale = np.sqrt(self.ret_rms.var + self._eps)
        values = values * scale
        next_values = next_values * scale

    unnormalized_returns, advantages = self.compute_episodic_return(
        batch, buffer, indice, next_values, values,
        gamma=self._gamma, gae_lambda=self._lambda)

    if not self._rew_norm:
        batch.returns = unnormalized_returns
    else:
        # Scale with the statistics from *before* this update.
        batch.returns = unnormalized_returns / np.sqrt(
            self.ret_rms.var + self._eps)
        self.ret_rms.update(unnormalized_returns)

    reference = batch.v_s
    batch.act = to_torch_as(batch.act, reference)
    batch.returns = to_torch_as(batch.returns, reference)
    batch.adv = to_torch_as(advantages, reference)
    return batch
def process_fn(self, batch: Batch, buffer: ReplayBuffer,
               indice: np.ndarray) -> Batch:
    """Prepare ``batch`` for the policy update.

    Optionally standardizes ``batch.rew``, evaluates the critic and the old
    action log-probabilities without gradient tracking, computes returns via
    ``self.compute_episodic_return`` and derives ``batch.adv`` as
    ``returns - v`` (optionally standardized as well).

    :param batch: data to annotate; fields are overwritten in place.
    :param buffer: unused by this implementation.
    :param indice: unused by this implementation.
    :return: the batch returned by ``compute_episodic_return`` with ``v``,
        ``act``, ``logp_old``, ``returns`` and ``adv`` set.
    """
    if self._rew_norm:
        rew_mean, rew_std = batch.rew.mean(), batch.rew.std()
        # NOTE(review): the 1e-2 is np.isclose's *relative* tolerance, which
        # contributes nothing when comparing against 0, so only the default
        # absolute tolerance applies.  Kept as-is to preserve behaviour.
        if not np.isclose(rew_std, 0, rtol=1e-2):
            batch.rew = (batch.rew - rew_mean) / rew_std

    values, next_values, logp_chunks = [], [], []
    with torch.no_grad():
        for minibatch in batch.split(self._batch, shuffle=False):
            next_values.append(self.critic(minibatch.obs_next))
            values.append(self.critic(minibatch.obs))
            log_prob = self(minibatch).dist.log_prob
            logp_chunks.append(
                log_prob(to_torch_as(minibatch.act, values[0])))

    batch = self.compute_episodic_return(
        batch,
        to_numpy(torch.cat(next_values, dim=0)),
        gamma=self._gamma,
        gae_lambda=self._lambda,
        rew_norm=self._rew_norm,
    )

    reference = values[0]  # reference tensor for to_torch_as conversions
    batch.v = torch.cat(values, dim=0).flatten()  # old value
    batch.act = to_torch_as(batch.act, reference)
    batch.logp_old = torch.cat(logp_chunks, dim=0)
    batch.returns = to_torch_as(batch.returns, reference)
    batch.adv = batch.returns - batch.v

    if self._rew_norm:
        adv_mean, adv_std = batch.adv.mean(), batch.adv.std()
        # Same tolerance caveat as for the reward check above.
        if not np.isclose(adv_std.item(), 0, rtol=1e-2):
            batch.adv = (batch.adv - adv_mean) / adv_std
    return batch
def process_fn(
    self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray
) -> Batch:
    """Attach critic values, old log-probs, returns and advantages to ``batch``.

    The critic and the current policy are evaluated without gradient tracking
    in chunks of ``self._batch``.  When ``self._rew_norm`` is set, critic
    outputs are un-normalized with the running statistics in
    ``self.ret_rms`` before ``self.compute_episodic_return`` is called, the
    resulting returns are re-normalized with those same statistics,
    ``self.ret_rms`` is updated, and the advantages are standardized per
    batch.

    :param batch: data to annotate; modified in place and returned.
    :param buffer: forwarded to ``compute_episodic_return``.
    :param indice: forwarded to ``compute_episodic_return``.
    :return: the same ``batch`` with ``v_s``, ``logp_old``, ``returns`` and
        ``adv`` set, and ``act`` converted via ``to_torch_as``.
    """
    v_s, v_s_, old_log_prob = [], [], []
    with torch.no_grad():
        for b in batch.split(self._batch, shuffle=False, merge_last=True):
            v_s.append(self.critic(b.obs))
            v_s_.append(self.critic(b.obs_next))
            old_log_prob.append(
                self(b).dist.log_prob(to_torch_as(b.act, v_s[0])))
    batch.v_s = torch.cat(v_s, dim=0).flatten()  # old value
    v_s = to_numpy(batch.v_s)
    v_s_ = to_numpy(torch.cat(v_s_, dim=0).flatten())
    if self._rew_norm:  # unnormalize v_s & v_s_
        v_s = v_s * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean
        v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean
    unnormalized_returns, advantages = self.compute_episodic_return(
        batch, buffer, indice, v_s_, v_s,
        gamma=self._gamma, gae_lambda=self._lambda)
    if self._rew_norm:
        # Normalize with the statistics from *before* this update.
        batch.returns = (unnormalized_returns - self.ret_rms.mean) / \
            np.sqrt(self.ret_rms.var + self._eps)
        self.ret_rms.update(unnormalized_returns)
        # Per-batch advantage standardization.  Always centre, but only
        # rescale when the spread is non-degenerate: dividing by a zero std
        # (e.g. a single-sample batch or identical advantages) would fill
        # the advantages with NaN/inf and poison the subsequent update.
        adv_mean, adv_std = np.mean(advantages), np.std(advantages)
        if adv_std > self._eps:
            advantages = (advantages - adv_mean) / adv_std
        else:
            advantages = advantages - adv_mean
    else:
        batch.returns = unnormalized_returns
    batch.act = to_torch_as(batch.act, batch.v_s)
    batch.logp_old = torch.cat(old_log_prob, dim=0)
    batch.returns = to_torch_as(batch.returns, batch.v_s)
    batch.adv = to_torch_as(advantages, batch.v_s)
    return batch
def process_fn(self, batch: Batch, buffer: ReplayBuffer,
               indice: np.ndarray) -> Batch:
    """Compute returns and the old action log-probabilities for ``batch``.

    :param batch: data to annotate.
    :param buffer: forwarded to ``self._compute_returns``; also cached on
        ``self`` when ``self._recompute_adv`` is set.
    :param indice: forwarded to ``self._compute_returns``; cached likewise.
    :return: the batch produced by ``self._compute_returns`` with ``act``
        converted via ``to_torch_as`` and ``logp_old`` set.
    """
    if self._recompute_adv:
        # Keep `buffer` and `indice` around so `learn()` can use them.
        self._buffer = buffer
        self._indice = indice

    batch = self._compute_returns(batch, buffer, indice)
    batch.act = to_torch_as(batch.act, batch.v_s)

    with torch.no_grad():
        # Log-probabilities of the stored actions under the current policy,
        # evaluated chunk by chunk in the original order.
        logp_chunks = [
            self(minibatch).dist.log_prob(minibatch.act)
            for minibatch in batch.split(
                self._batch, shuffle=False, merge_last=True)
        ]
    batch.logp_old = torch.cat(logp_chunks, dim=0)
    return batch
def learn(self, batch: Batch, **kwargs) -> Dict[str, float]:
    """Run one gradient step on each critic and on the actor.

    The bootstrap target is built without gradient tracking from the minimum
    of the two target critics at the next observation, minus
    ``self._alpha`` times the log-probability of the next action.  Each
    critic is regressed onto that target with an MSE loss, then the actor is
    updated and ``self.sync_weight()`` is called.

    :param batch: training data; ``batch.act`` is replaced by a float tensor
        on the device of the policy output.
    :param kwargs: accepted for interface compatibility; unused here.
    :return: scalar losses under the keys ``loss/actor``, ``loss/critic1``
        and ``loss/critic2``.
    """
    with torch.no_grad():
        next_out = self(batch, input='obs_next')
        next_act = next_out.act
        device = next_act.device
        batch.act = torch.tensor(batch.act, dtype=torch.float, device=device)
        q1_next = self.critic1_old(batch.obs_next, next_act)
        q2_next = self.critic2_old(batch.obs_next, next_act)
        next_value = torch.min(q1_next, q2_next) \
            - self._alpha * next_out.log_prob
    # [:, None] adds a trailing axis to the reward / done vectors.
    reward = torch.tensor(
        batch.rew, dtype=torch.float, device=device)[:, None]
    terminal = torch.tensor(
        batch.done, dtype=torch.float, device=device)[:, None]
    target_q = reward + (1. - terminal) * self._gamma * next_value

    # Critic 1 is fully updated before critic 2 is evaluated.
    critic_updates = (
        (self.critic1, self.critic1_optim),
        (self.critic2, self.critic2_optim),
    )
    critic_losses = []
    for critic, optimizer in critic_updates:
        critic_loss = F.mse_loss(critic(batch.obs, batch.act), target_q)
        optimizer.zero_grad()
        critic_loss.backward()
        optimizer.step()
        critic_losses.append(critic_loss)
    critic1_loss, critic2_loss = critic_losses

    # Actor: the action is taken from the current policy output.
    policy_out = self(batch)
    policy_act = policy_out.act
    q1_pi = self.critic1(batch.obs, policy_act)
    q2_pi = self.critic2(batch.obs, policy_act)
    actor_loss = (
        self._alpha * policy_out.log_prob - torch.min(q1_pi, q2_pi)
    ).mean()
    self.actor_optim.zero_grad()
    actor_loss.backward()
    self.actor_optim.step()

    self.sync_weight()
    return {
        'loss/actor': actor_loss.item(),
        'loss/critic1': critic1_loss.item(),
        'loss/critic2': critic2_loss.item(),
    }
def process_fn(self, batch: Batch, buffer: ReplayBuffer,
               indice: np.ndarray) -> Batch:
    """Compute returns for ``batch`` and convert its actions to a tensor.

    :param batch: data to annotate.
    :param buffer: forwarded to ``self._compute_returns``.
    :param indice: forwarded to ``self._compute_returns``.
    :return: the batch produced by ``self._compute_returns`` with ``act``
        converted via ``to_torch_as`` using ``v_s`` as the reference tensor.
    """
    processed = self._compute_returns(batch, buffer, indice)
    processed.act = to_torch_as(processed.act, processed.v_s)
    return processed
def learn(self, batch: Batch, batch_size: int, repeat: int,
          **kwargs) -> Dict[str, List[float]]:
    """Run ``repeat`` passes of clipped-surrogate updates over ``batch``.

    Before optimizing, the critic values and the action log-probabilities of
    the current policy are recorded without gradient tracking.  Returns and
    advantages are standardized when ``self._rew_norm`` is set and their
    standard deviation exceeds ``self.__eps``.

    :param batch: training data; ``v``, ``act``, ``logp_old``, ``returns``
        and ``adv`` are (over)written on it.
    :param batch_size: mini-batch size; also stored in ``self._batch``.
    :param repeat: number of passes over ``batch``.
    :param kwargs: accepted for interface compatibility; unused here.
    :return: per-mini-batch loss histories under the keys ``loss``,
        ``loss/clip``, ``loss/vf`` and ``loss/ent``.
    """
    self._batch = batch_size
    losses, clip_losses, vf_losses, ent_losses = [], [], [], []
    v = []
    old_log_prob = []
    with torch.no_grad():
        for b in batch.split(batch_size, shuffle=False):
            v.append(self.critic(b.obs))
            old_log_prob.append(
                self(b).dist.log_prob(
                    torch.tensor(b.act, device=v[0].device)))
    batch.v = torch.cat(v, dim=0)  # old value
    dev = batch.v.device
    batch.act = torch.tensor(batch.act, dtype=torch.float, device=dev)
    batch.logp_old = torch.cat(old_log_prob, dim=0)
    batch.returns = torch.tensor(
        batch.returns, dtype=torch.float, device=dev
    ).reshape(batch.v.shape)
    if self._rew_norm:
        mean, std = batch.returns.mean(), batch.returns.std()
        if std > self.__eps:  # skip when the spread is degenerate
            batch.returns = (batch.returns - mean) / std
    batch.adv = batch.returns - batch.v
    if self._rew_norm:
        mean, std = batch.adv.mean(), batch.adv.std()
        if std > self.__eps:
            batch.adv = (batch.adv - mean) / std
    for _ in range(repeat):
        for b in batch.split(batch_size):
            dist = self(b).dist
            value = self.critic(b.obs)
            # Probability ratio between the current and the recorded policy.
            ratio = (dist.log_prob(b.act) - b.logp_old).exp().float()
            surr1 = ratio * b.adv
            surr2 = ratio.clamp(
                1. - self._eps_clip, 1. + self._eps_clip) * b.adv
            if self._dual_clip:
                # The dual-clip lower bound c * A must only be applied where
                # the advantage is negative.  Applying it to positive
                # advantages would replace the surrogate by the constant
                # c * A, which carries no gradient w.r.t. the policy.
                clip1 = torch.min(surr1, surr2)
                clip2 = torch.max(clip1, self._dual_clip * b.adv)
                clip_loss = -torch.where(b.adv < 0, clip2, clip1).mean()
            else:
                clip_loss = -torch.min(surr1, surr2).mean()
            clip_losses.append(clip_loss.item())
            if self._value_clip:
                # Keep the new value within eps_clip of the recorded value
                # and take the worse of the two squared errors.
                v_clip = b.v + (value - b.v).clamp(
                    -self._eps_clip, self._eps_clip)
                vf1 = (b.returns - value).pow(2)
                vf2 = (b.returns - v_clip).pow(2)
                vf_loss = .5 * torch.max(vf1, vf2).mean()
            else:
                vf_loss = .5 * (b.returns - value).pow(2).mean()
            vf_losses.append(vf_loss.item())
            e_loss = dist.entropy().mean()
            ent_losses.append(e_loss.item())
            loss = clip_loss + self._w_vf * vf_loss - self._w_ent * e_loss
            losses.append(loss.item())
            self.optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(
                list(self.actor.parameters())
                + list(self.critic.parameters()),
                self._max_grad_norm)
            self.optim.step()
    return {
        'loss': losses,
        'loss/clip': clip_losses,
        'loss/vf': vf_losses,
        'loss/ent': ent_losses,
    }