def _step(self, replay_buffer, optimizers, env, model, step, writer=None):
    # Sample a single transition (plus its buffer index) and update the critic.
    (obs, action, reward, next_obs, done), sample_idx = replay_buffer.sample()
    not_done = int(not done)

    critic_loss = self.update_critic(obs, action, reward, next_obs, not_done,
                                     env, model, optimizers)
    # Report the critic loss back to the buffer so it can reprioritize this sample.
    replay_buffer.report_sample_loss(critic_loss, sample_idx)

    # Delayed actor/temperature update.
    if step % self.cfg.actor_update_frequency == 0:
        actor_loss, alpha_loss = self.update_actor_and_alpha(obs, env, model, optimizers)
        if writer is not None:
            writer.add_scalar("loss/actor", actor_loss,
                              self.global_writer_loss_count.value())
            writer.add_scalar("loss/temperature", alpha_loss,
                              self.global_writer_loss_count.value())

    # Polyak-average the online critic into the target critic.
    if step % self.cfg.critic_target_update_frequency == 0:
        soft_update_params(model.module.critic, model.module.critic_tgt,
                           self.critic_tau)

    if writer is not None:
        writer.add_scalar("loss/critic", critic_loss,
                          self.global_writer_loss_count.value())

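# `soft_update_params` is called above but not defined in this excerpt. A minimal
# sketch, assuming it performs the usual Polyak averaging of the online network
# into the target network with interpolation factor `tau`:
import torch


def soft_update_params(net, target_net, tau):
    """target <- tau * net + (1 - tau) * target, without tracking gradients."""
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
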
def _step(self, step):
    actor_loss, min_entropy, loc_mean = None, None, None
    # Sample a transition (plus its index) from the prioritized memory.
    (obs, action, reward), sample_idx = self.memory.sample()

    critic_loss, mean_reward = self.update_critic(obs, action, reward)
    self.memory.report_sample_loss(critic_loss + mean_reward, sample_idx)
    self.mov_sum_losses.critic.apply(critic_loss)
    wandb.log({"loss/critic": critic_loss}, step=self.global_counter)

    # Delayed actor update, only after the warm-up phase.
    if self.cfg.actor_update_after < step and step % self.cfg.actor_update_frequency == 0:
        actor_loss, min_entropy, loc_mean = self.update_actor(obs, reward, action)
        self.mov_sum_losses.actor.apply(actor_loss)
        wandb.log({"loss/actor": actor_loss}, step=self.global_counter)

    # Periodic logging of smoothed losses and learning rates.
    if step % self.cfg.post_stats_frequency == 0:
        if min_entropy is not None and min_entropy != "nl":
            wandb.log({"min_entropy": min_entropy}, step=self.global_counter)
        wandb.log({"mov_avg/critic": self.mov_sum_losses.critic.avg}, step=self.global_counter)
        wandb.log({"mov_avg/actor": self.mov_sum_losses.actor.avg}, step=self.global_counter)
        wandb.log({"lr/critic": self.optimizers.critic_shed.optimizer.param_groups[0]['lr']},
                  step=self.global_counter)
        wandb.log({"lr/actor": self.optimizers.actor_shed.optimizer.param_groups[0]['lr']},
                  step=self.global_counter)

    self.global_counter = self.global_counter + 1

    # Polyak-average the online critic into the target critic.
    if step % self.cfg.critic_target_update_frequency == 0:
        soft_update_params(self.model.critic, self.model.critic_tgt, self.cfg.critic_tau)

    return critic_loss, actor_loss, loc_mean

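# The `mov_sum_losses` entries above expose an `apply(value)` / `avg` interface used to
# smooth the raw losses (and, in the commented-out scheduler calls in other variants, to
# drive plateau-style LR schedulers). A minimal sketch of such a tracker; the class name
# and the exponential-moving-average formulation are assumptions, not the project's code:
class RunningLossAverage:
    """Exponential moving average of a scalar loss: `apply` adds a sample, `avg` reads it."""

    def __init__(self, momentum=0.99):
        self.momentum = momentum
        self.avg = None

    def apply(self, value):
        value = float(value)  # accepts Python numbers and single-element tensors
        if self.avg is None:
            self.avg = value
        else:
            self.avg = self.momentum * self.avg + (1.0 - self.momentum) * value
        return self.avg
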
def _step(self, replay_buffer, optimizers, mov_sum_loss, env, model, step, writer=None):
    (obs, action, reward, next_obs, done), sample_idx = replay_buffer.sample()
    not_done = int(not done)

    # Embedding (feature-extractor) updates only happen once `step` exceeds n_prep_steps,
    # and then only every fe.update_frequency steps.
    n_prep_steps = self.cfg.trainer.t_max - self.cfg.fe.update_after_steps
    embeddings_opt = (step - n_prep_steps > 0
                      and (step - n_prep_steps) % self.cfg.fe.update_frequency == 0)

    if "extra" in self.cfg.fe.optim:
        # Separate optimizer for the embeddings: on embedding steps, skip the RL update.
        if embeddings_opt:
            embedd_loss = self.update_embeddings(obs, env, model, optimizers)
            mov_sum_loss.embeddings.apply(embedd_loss)
            optimizers.embed_shed.step(mov_sum_loss.embeddings.avg)
            if writer is not None:
                writer.add_scalar("loss/embedd", embedd_loss,
                                  self.global_writer_loss_count.value())
            return

    critic_loss, mean_reward = self.update_critic(obs, action, reward, next_obs,
                                                  not_done, env, model, optimizers)
    mov_sum_loss.critic.apply(critic_loss)
    # optimizers.critic_shed.step(mov_sum_loss.critic.avg)
    replay_buffer.report_sample_loss(critic_loss + mean_reward, sample_idx)

    if step % self.cfg.sac.actor_update_frequency == 0:
        actor_loss, alpha_loss = self.update_actor_and_alpha(obs, env, model,
                                                             optimizers, embeddings_opt)
        mov_sum_loss.actor.apply(actor_loss)
        mov_sum_loss.temperature.apply(alpha_loss)
        # optimizers.temp_shed.step(mov_sum_loss.actor.avg)
        # optimizers.temp_shed.step(mov_sum_loss.temperature.avg)
        if writer is not None:
            writer.add_scalar("loss/actor", actor_loss,
                              self.global_writer_loss_count.value())
            writer.add_scalar("loss/temperature", alpha_loss,
                              self.global_writer_loss_count.value())

    # Polyak-average the online critic into the target critic.
    if step % self.cfg.sac.critic_target_update_frequency == 0:
        soft_update_params(model.module.critic, model.module.critic_tgt,
                           self.cfg.sac.critic_tau)

    if writer is not None:
        writer.add_scalar("loss/critic", critic_loss,
                          self.global_writer_loss_count.value())

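# The `*_shed` objects used above are stepped with a smoothed loss
# (`optimizers.embed_shed.step(mov_sum_loss.embeddings.avg)`) and expose
# `.optimizer.param_groups` for logging the current learning rate, which matches
# PyTorch's ReduceLROnPlateau interface. A minimal sketch of how such a bundle could be
# built; the function name, the `model.actor`/`model.critic` attributes and the
# hyperparameters are illustrative assumptions, not this project's setup code:
from types import SimpleNamespace

import torch


def make_optimizers(model, lr=1e-4):
    """Per-module Adam optimizers plus metric-driven LR schedulers (`*_shed`)."""
    actor_opt = torch.optim.Adam(model.actor.parameters(), lr=lr)
    critic_opt = torch.optim.Adam(model.critic.parameters(), lr=lr)
    return SimpleNamespace(
        actor=actor_opt,
        critic=critic_opt,
        # ReduceLROnPlateau.step(metric) lowers the LR when the smoothed loss stops
        # improving; the scheduler keeps a reference to its optimizer in `.optimizer`.
        actor_shed=torch.optim.lr_scheduler.ReduceLROnPlateau(actor_opt, patience=100),
        critic_shed=torch.optim.lr_scheduler.ReduceLROnPlateau(critic_opt, patience=100),
    )
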
def _step_episodic_mem(self, replay_buffer, optimizers, env, model, step, writer=None):
    loss_critic, loss_actor, loss_alpha = 0, 0, 0

    # Draw a batch of stored episodes from the episodic replay buffer.
    batch = []
    for it in range(self.batch_size):
        batch.append(replay_buffer.sample())

    # Critic update: average the per-episode losses over the batch.
    for episode in batch:
        loss = self.update_critic_episodic(episode.episode, env, model)
        loss_critic = loss_critic + loss
    loss_critic = loss_critic / self.batch_size
    optimizers.critic.zero_grad()
    loss_critic.backward()
    optimizers.critic.step()

    # Delayed actor (and optional temperature) update.
    if step % self.cfg.actor_update_frequency == 0:
        for episode in batch:
            loss_a, loss_t = self.update_actor_and_alpha_episodic(episode.episode, env, model)
            loss_actor = loss_actor + loss_a
            loss_alpha = loss_alpha + loss_t
        loss_actor = loss_actor / self.batch_size
        loss_alpha = loss_alpha / self.batch_size
        optimizers.actor.zero_grad()
        loss_actor.backward()
        optimizers.actor.step()
        if self.cfg.temperature_regulation == 'optimize':
            optimizers.temperature.zero_grad()
            loss_alpha.backward()
            optimizers.temperature.step()

    # Polyak-average the online critic into the target critic.
    if step % self.cfg.critic_target_update_frequency == 0:
        soft_update_params(model.module.critic, model.module.critic_tgt, self.critic_tau)

    if writer is not None:
        writer.add_scalar("loss/critic", loss_critic.item(),
                          self.global_writer_loss_count.value())
        if step % self.cfg.actor_update_frequency == 0:
            # Actor/temperature losses only exist on actor-update steps.
            writer.add_scalar("loss/actor", loss_actor.item(),
                              self.global_writer_loss_count.value())
            if self.cfg.temperature_regulation == 'optimize':
                writer.add_scalar("loss/temperature", loss_alpha.item(),
                                  self.global_writer_loss_count.value())
                writer.add_scalar("value/temperature", self.alpha.detach().item(),
                                  self.global_writer_loss_count.value())

def _step(self, step):
    actor_loss, alpha_loss, min_entropy, loc_mean = None, None, None, None
    (obs, action, reward), sample_idx = self.memory.sample()

    # Move the sampled action and reward tensors onto the training device.
    action = action.to(self.device)
    for i in range(len(reward)):
        reward[i] = reward[i].to(self.device)

    critic_loss, mean_reward = self.update_critic(obs, action, reward)
    self.memory.report_sample_loss(critic_loss + mean_reward, sample_idx)
    self.mov_sum_losses.critic.apply(critic_loss)
    # self.optimizers.critic_shed.step(self.mov_sum_losses.critic.avg)
    wandb.log({"loss/critic": critic_loss})

    # Delayed actor/temperature update, only after the warm-up phase.
    if self.cfg.actor_update_after < step and step % self.cfg.actor_update_frequency == 0:
        actor_loss, alpha_loss, min_entropy, loc_mean = self.update_actor_and_alpha(
            obs, reward, action)
        self.mov_sum_losses.actor.apply(actor_loss)
        self.mov_sum_losses.temperature.apply(alpha_loss)
        # self.optimizers.actor_shed.step(self.mov_sum_losses.actor.avg)
        # self.optimizers.temp_shed.step(self.mov_sum_losses.actor.avg)
        wandb.log({"loss/actor": actor_loss})
        wandb.log({"loss/alpha": alpha_loss})

    # Periodic logging of smoothed losses and learning rates.
    if step % self.cfg.post_stats_frequency == 0:
        if min_entropy is not None and min_entropy != "nl":
            wandb.log({"min_entropy": min_entropy})
        wandb.log({"mov_avg/critic": self.mov_sum_losses.critic.avg})
        wandb.log({"mov_avg/actor": self.mov_sum_losses.actor.avg})
        wandb.log({"mov_avg/temperature": self.mov_sum_losses.temperature.avg})
        wandb.log({"lr/critic": self.optimizers.critic_shed.optimizer.param_groups[0]['lr']})
        wandb.log({"lr/actor": self.optimizers.actor_shed.optimizer.param_groups[0]['lr']})
        wandb.log({"lr/temperature": self.optimizers.temp_shed.optimizer.param_groups[0]['lr']})

    # Polyak-average the online critic into the target critic.
    if step % self.cfg.critic_target_update_frequency == 0:
        soft_update_params(self.model.critic, self.model.critic_tgt, self.cfg.critic_tau)

    return [critic_loss, actor_loss, alpha_loss, loc_mean]

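# `self.memory.sample()` above returns a transition together with its buffer index, and
# `report_sample_loss` feeds the critic loss back for that index, i.e. a loss-prioritized
# replay memory. A minimal sketch with that interface; the class name, capacity handling
# and priority scheme are assumptions, not the project's actual buffer:
import random


class LossPrioritizedMemory:
    """Replay memory whose sampling weights follow the last reported loss per entry."""

    def __init__(self, capacity=1000):
        self.capacity = capacity
        self.transitions = []
        self.priorities = []

    def push(self, transition):
        if len(self.transitions) >= self.capacity:
            self.transitions.pop(0)
            self.priorities.pop(0)
        self.transitions.append(transition)
        # New entries start at the current maximum priority so they get sampled soon.
        self.priorities.append(max(self.priorities, default=1.0))

    def sample(self):
        idx = random.choices(range(len(self.transitions)), weights=self.priorities, k=1)[0]
        return self.transitions[idx], idx

    def report_sample_loss(self, loss, idx):
        # Non-negative weight so random.choices stays valid even for negative losses.
        self.priorities[idx] = max(float(loss), 0.0) + 1e-6
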
def _step(self, replay_buffer, optimizers, env, model, step, writer=None):
    loss_critic, loss_actor, loss_alpha = 0, 0, 0

    # Draw a batch of transitions from the replay buffer.
    batch = []
    for it in range(self.batch_size):
        batch.append(replay_buffer.sample())

    # Critic update: average the per-transition losses over the batch.
    for obs, action, reward, next_obs, done in batch:
        not_done = int(not done)
        loss = self.update_critic(obs, action, reward, next_obs, not_done, env, model)
        loss_critic = loss_critic + loss
    loss_critic = loss_critic / self.batch_size
    optimizers.critic.zero_grad()
    loss_critic.backward()
    optimizers.critic.step()

    # Delayed actor and temperature update.
    if step % self.cfg.actor_update_frequency == 0:
        for obs, action, reward, next_obs, done in batch:
            loss_a, loss_t = self.update_actor_and_alpha(obs, env, model)
            loss_actor = loss_actor + loss_a
            loss_alpha = loss_alpha + loss_t
        loss_actor = loss_actor / self.batch_size
        loss_alpha = loss_alpha / self.batch_size
        optimizers.actor.zero_grad()
        loss_actor.backward()
        optimizers.actor.step()
        optimizers.temperature.zero_grad()
        loss_alpha.backward()
        optimizers.temperature.step()
        if writer is not None:
            writer.add_scalar("loss/actor", loss_actor.item(),
                              self.global_writer_loss_count.value())
            writer.add_scalar("loss/temperature", loss_alpha.item(),
                              self.global_writer_loss_count.value())

    # Polyak-average the online critic into the target critic.
    if step % self.cfg.critic_target_update_frequency == 0:
        soft_update_params(model.module.critic, model.module.critic_tgt, self.critic_tau)

    if writer is not None:
        writer.add_scalar("loss/critic", loss_critic.item(),
                          self.global_writer_loss_count.value())