Example #1
    def _step(self, replay_buffer, optimizers, env, model, step, writer=None):

        # Draw a single transition (plus its buffer index) from the replay buffer.
        (obs, action, reward, next_obs,
         done), sample_idx = replay_buffer.sample()
        not_done = int(not done)  # 1 while the episode continues, 0 on termination

        # Critic update; the resulting loss is reported back to the buffer as
        # the sampling priority of this transition.
        critic_loss = self.update_critic(obs, action, reward, next_obs,
                                         not_done, env, model, optimizers)
        replay_buffer.report_sample_loss(critic_loss, sample_idx)

        # Delayed actor and temperature (alpha) updates.
        if step % self.cfg.actor_update_frequency == 0:
            actor_loss, alpha_loss = self.update_actor_and_alpha(
                obs, env, model, optimizers)
            if writer is not None:
                writer.add_scalar("loss/actor", actor_loss,
                                  self.global_writer_loss_count.value())
                writer.add_scalar("loss/temperature", alpha_loss,
                                  self.global_writer_loss_count.value())

        # Soft (Polyak) update of the target critic.
        if step % self.cfg.critic_target_update_frequency == 0:
            soft_update_params(model.module.critic, model.module.critic_tgt,
                               self.critic_tau)

        if writer is not None:
            writer.add_scalar("loss/critic", critic_loss,
                              self.global_writer_loss_count.value())
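
Every example on this page calls soft_update_params to blend the online critic into the target critic. The helper itself is not shown here; the following is a minimal sketch of the usual Polyak/soft update, inferred from the call sites rather than taken from the project:

import torch

def soft_update_params(net, target_net, tau):
    # Assumed soft update: target <- tau * online + (1 - tau) * target.
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)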
Example #2
File: a2c.py  Project: paulhfu/RLForSeg
    def _step(self, step):
        actor_loss, min_entropy, loc_mean = None, None, None

        # Sample a transition, update the critic and feed the loss back to the
        # memory as a sampling priority.
        (obs, action, reward), sample_idx = self.memory.sample()

        critic_loss, mean_reward = self.update_critic(obs, action, reward)
        self.memory.report_sample_loss(critic_loss + mean_reward, sample_idx)
        self.mov_sum_losses.critic.apply(critic_loss)
        wandb.log({"loss/critic": critic_loss}, step=self.global_counter)

        # Actor updates start only after a warm-up phase and then run at a reduced frequency.
        if self.cfg.actor_update_after < step and step % self.cfg.actor_update_frequency == 0:
            actor_loss, min_entropy, loc_mean = self.update_actor(obs, reward, action)
            self.mov_sum_losses.actor.apply(actor_loss)
            wandb.log({"loss/actor": actor_loss}, step=self.global_counter)

        if step % self.cfg.post_stats_frequency == 0:
            if min_entropy != "nl":
                wandb.log({"min_entropy": min_entropy}, step=self.global_counter)
            wandb.log({"mov_avg/critic": self.mov_sum_losses.critic.avg}, step=self.global_counter)
            wandb.log({"mov_avg/actor": self.mov_sum_losses.actor.avg}, step=self.global_counter)
            wandb.log({"lr/critic": self.optimizers.critic_shed.optimizer.param_groups[0]['lr']}, step=self.global_counter)
            wandb.log({"lr/actor": self.optimizers.actor_shed.optimizer.param_groups[0]['lr']}, step=self.global_counter)

        self.global_counter = self.global_counter + 1

        if step % self.cfg.critic_target_update_frequency == 0:
            soft_update_params(self.model.critic, self.model.critic_tgt, self.cfg.critic_tau)

        return critic_loss, actor_loss, loc_mean
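
Example #2 (and Examples #3 and #5) smooths each loss through mov_sum_losses.<name>.apply(...) and later reads back .avg. The tracker class is not part of the snippet; a minimal sketch with that interface, assumed from the usage above, could be:

class RunningAverage:
    # Hypothetical exponential moving average exposing apply()/avg, matching
    # how mov_sum_losses.critic / .actor are used above.
    def __init__(self, weight=0.99):
        self.weight = weight
        self.avg = None

    def apply(self, value):
        value = float(value)
        self.avg = value if self.avg is None else self.weight * self.avg + (1 - self.weight) * value
        return self.avg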
Example #3
    def _step(self,
              replay_buffer,
              optimizers,
              mov_sum_loss,
              env,
              model,
              step,
              writer=None):

        (obs, action, reward, next_obs,
         done), sample_idx = replay_buffer.sample()
        not_done = int(not done)
        # Feature-extractor (embedding) updates only begin after a preparation
        # phase and then run every fe.update_frequency steps.
        n_prep_steps = self.cfg.trainer.t_max - self.cfg.fe.update_after_steps
        embeddings_opt = step - n_prep_steps > 0 and (
            step - n_prep_steps) % self.cfg.fe.update_frequency == 0

        if "extra" in self.cfg.fe.optim:
            if embeddings_opt:
                embedd_loss = self.update_embeddings(obs, env, model,
                                                     optimizers)
                mov_sum_loss.embeddings.apply(embedd_loss)
                optimizers.embed_shed.step(mov_sum_loss.embeddings.avg)
                if writer is not None:
                    writer.add_scalar("loss/embedd", embedd_loss,
                                      self.global_writer_loss_count.value())
                return

        critic_loss, mean_reward = self.update_critic(obs, action, reward,
                                                      next_obs, not_done, env,
                                                      model, optimizers)
        mov_sum_loss.critic.apply(critic_loss)
        # optimizers.critic_shed.step(mov_sum_loss.critic.avg)
        replay_buffer.report_sample_loss(critic_loss + mean_reward, sample_idx)

        if step % self.cfg.sac.actor_update_frequency == 0:
            actor_loss, alpha_loss = self.update_actor_and_alpha(
                obs, env, model, optimizers, embeddings_opt)
            mov_sum_loss.actor.apply(actor_loss)
            mov_sum_loss.temperature.apply(alpha_loss)
            # optimizers.temp_shed.step(mov_sum_loss.actor.avg)
            # optimizers.temp_shed.step(mov_sum_loss.temperature.avg)
            if writer is not None:
                writer.add_scalar("loss/actor", actor_loss,
                                  self.global_writer_loss_count.value())
                writer.add_scalar("loss/temperature", alpha_loss,
                                  self.global_writer_loss_count.value())

        if step % self.cfg.sac.critic_target_update_frequency == 0:
            soft_update_params(model.module.critic, model.module.critic_tgt,
                               self.cfg.sac.critic_tau)

        if writer is not None:
            writer.add_scalar("loss/critic", critic_loss,
                              self.global_writer_loss_count.value())
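
Examples #1, #2, #3 and #5 rely on a buffer whose sample() returns a (transition, index) pair and whose report_sample_loss() turns the fresh loss into a sampling priority for that index. A small prioritized buffer with that interface might look like the sketch below (an assumption based on the calls above, not the project's implementation):

import numpy as np

class PrioritizedReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage, self.priorities = [], []

    def push(self, transition):
        # New samples start with the current maximum priority.
        self.storage.append(transition)
        self.priorities.append(max(self.priorities, default=1.0))
        if len(self.storage) > self.capacity:
            self.storage.pop(0)
            self.priorities.pop(0)

    def sample(self):
        probs = np.asarray(self.priorities) / sum(self.priorities)
        idx = int(np.random.choice(len(self.storage), p=probs))
        return self.storage[idx], idx

    def report_sample_loss(self, loss, idx):
        # Larger loss -> higher probability of being drawn again.
        self.priorities[idx] = abs(float(loss)) + 1e-6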
Example #4
    def _step_episodic_mem(self,
                           replay_buffer,
                           optimizers,
                           env,
                           model,
                           step,
                           writer=None):
        loss_critic, loss_actor, loss_alpha = 0, 0, 0
        # Sample a batch of whole episodes from the episodic replay buffer.
        batch = []
        for it in range(self.batch_size):
            batch.append(replay_buffer.sample())

        for episode in batch:
            loss = self.update_critic_episodic(episode.episode, env, model)
            loss_critic = loss_critic + loss

        loss_critic = loss_critic / self.batch_size
        optimizers.critic.zero_grad()
        loss_critic.backward()
        optimizers.critic.step()

        if step % self.cfg.actor_update_frequency == 0:
            for episode in batch:
                loss_a, loss_t = self.update_actor_and_alpha_episodic(
                    episode.episode, env, model)
                loss_actor = loss_actor + loss_a
                loss_alpha = loss_alpha + loss_t

            loss_actor = loss_actor / self.batch_size
            loss_alpha = loss_alpha / self.batch_size

            optimizers.actor.zero_grad()
            loss_actor.backward()
            optimizers.actor.step()
            if self.cfg.temperature_regulation == 'optimize':
                optimizers.temperature.zero_grad()
                loss_alpha.backward()
                optimizers.temperature.step()

            # Note: this target update is nested inside the actor-update branch,
            # so it only runs on steps where the actor is updated as well.
            if step % self.cfg.critic_target_update_frequency == 0:
                soft_update_params(model.module.critic,
                                   model.module.critic_tgt, self.critic_tau)

        if writer is not None:
            writer.add_scalar("loss/critic", loss_critic.item(),
                              self.global_writer_loss_count.value())
            writer.add_scalar("loss/actor", loss_actor.item(),
                              self.global_writer_loss_count.value())
            if self.cfg.temperature_regulation == 'optimize':
                writer.add_scalar("loss/temperature", loss_alpha.item(),
                                  self.global_writer_loss_count.value())
            writer.add_scalar("value/temperature",
                              self.alpha.detach().item(),
                              self.global_writer_loss_count.value())
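
Example #4 optimizes a learned temperature when cfg.temperature_regulation == 'optimize' and logs self.alpha afterwards. The exact loss_alpha is computed inside update_actor_and_alpha_episodic and is not shown on this page; for reference, the standard SAC temperature objective (an assumed shape, not necessarily what the project computes) is:

import torch

def temperature_loss(log_alpha, log_prob, target_entropy):
    # J(alpha) = E[ alpha * (-log pi(a|s) - target_entropy) ]
    alpha = log_alpha.exp()
    return (alpha * (-log_prob - target_entropy).detach()).mean()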
Example #5
    def _step(self, step):
        actor_loss, alpha_loss, min_entropy, loc_mean = None, None, None, None

        (obs, action, reward), sample_idx = self.memory.sample()
        # Move the sampled action and the reward tensors to the training device.
        action = action.to(self.device)
        for i in range(len(reward)):
            reward[i] = reward[i].to(self.device)
        critic_loss, mean_reward = self.update_critic(obs, action, reward)
        self.memory.report_sample_loss(critic_loss + mean_reward, sample_idx)
        self.mov_sum_losses.critic.apply(critic_loss)
        # self.optimizers.critic_shed.step(self.mov_sum_losses.critic.avg)
        wandb.log({"loss/critic": critic_loss})

        if self.cfg.actor_update_after < step and step % self.cfg.actor_update_frequency == 0:
            actor_loss, alpha_loss, min_entropy, loc_mean = self.update_actor_and_alpha(
                obs, reward, action)
            self.mov_sum_losses.actor.apply(actor_loss)
            self.mov_sum_losses.temperature.apply(alpha_loss)
            # self.optimizers.actor_shed.step(self.mov_sum_losses.actor.avg)
            # self.optimizers.temp_shed.step(self.mov_sum_losses.actor.avg)
            wandb.log({"loss/actor": actor_loss})
            wandb.log({"loss/alpha": alpha_loss})

        if step % self.cfg.post_stats_frequency == 0:
            if min_entropy != "nl":
                wandb.log({"min_entropy": min_entropy})
            wandb.log({"mov_avg/critic": self.mov_sum_losses.critic.avg})
            wandb.log({"mov_avg/actor": self.mov_sum_losses.actor.avg})
            wandb.log(
                {"mov_avg/temperature": self.mov_sum_losses.temperature.avg})
            wandb.log({
                "lr/critic":
                self.optimizers.critic_shed.optimizer.param_groups[0]['lr']
            })
            wandb.log({
                "lr/actor":
                self.optimizers.actor_shed.optimizer.param_groups[0]['lr']
            })
            wandb.log({
                "lr/temperature":
                self.optimizers.temp_shed.optimizer.param_groups[0]['lr']
            })

        if step % self.cfg.critic_target_update_frequency == 0:
            soft_update_params(self.model.critic, self.model.critic_tgt,
                               self.cfg.critic_tau)

        return [critic_loss, actor_loss, alpha_loss, loc_mean]
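
The commented-out calls such as self.optimizers.critic_shed.step(self.mov_sum_losses.critic.avg), together with reading critic_shed.optimizer.param_groups[0]['lr'] in the logging block, suggest that each *_shed object is a metric-driven learning-rate scheduler wrapping its optimizer. A plausible wiring, shown here as an assumption using PyTorch's ReduceLROnPlateau, would be:

import torch

critic = torch.nn.Linear(8, 1)  # stand-in for the real critic network
critic_opt = torch.optim.Adam(critic.parameters(), lr=3e-4)
critic_shed = torch.optim.lr_scheduler.ReduceLROnPlateau(critic_opt, mode="min", patience=10)

# Stepping the scheduler with a smoothed loss lowers the learning rate once the
# metric stops improving; the current rate is read exactly as in the logging code above.
critic_shed.step(0.42)
current_lr = critic_shed.optimizer.param_groups[0]["lr"]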
Example #6
    def _step(self, replay_buffer, optimizers, env, model, step, writer=None):
        loss_critic, loss_actor, loss_alpha = 0, 0, 0
        batch = []
        for it in range(self.batch_size):
            batch.append(replay_buffer.sample())

        # Accumulate the critic loss over the whole batch and take a single
        # optimizer step on the average.
        for obs, action, reward, next_obs, done in batch:
            not_done = int(not done)

            loss = self.update_critic(obs, action, reward, next_obs, not_done,
                                      env, model)
            loss_critic = loss_critic + loss

        loss_critic = loss_critic / self.batch_size
        optimizers.critic.zero_grad()
        loss_critic.backward()
        optimizers.critic.step()

        if step % self.cfg.actor_update_frequency == 0:
            for obs, action, reward, next_obs, done in batch:
                loss_a, loss_t = self.update_actor_and_alpha(obs, env, model)
                loss_actor = loss_actor + loss_a
                loss_alpha = loss_alpha + loss_t

            loss_actor = loss_actor / self.batch_size
            loss_alpha = loss_alpha / self.batch_size
            optimizers.actor.zero_grad()
            loss_actor.backward()
            optimizers.actor.step()
            optimizers.temperature.zero_grad()
            loss_alpha.backward()
            optimizers.temperature.step()
            if writer is not None:
                writer.add_scalar("loss/actor", loss_actor.item(),
                                  self.global_writer_loss_count.value())
                writer.add_scalar("loss/temperature", loss_alpha.item(),
                                  self.global_writer_loss_count.value())

        if step % self.cfg.critic_target_update_frequency == 0:
            soft_update_params(model.module.critic, model.module.critic_tgt,
                               self.critic_tau)

        if writer is not None:
            writer.add_scalar("loss/critic", loss_critic.item(),
                              self.global_writer_loss_count.value())
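
Several examples also index their TensorBoard logs with self.global_writer_loss_count.value(), and the model.module accesses indicate a model wrapped for multi-process training. A counter shared across worker processes with that value() interface could be sketched as follows (hypothetical; the project's own class is not shown here):

import multiprocessing as mp

class GlobalCounter:
    # Process-shared integer counter exposing value(), as used by
    # global_writer_loss_count above.
    def __init__(self):
        self._count = mp.Value("i", 0)

    def increment(self):
        with self._count.get_lock():
            self._count.value += 1

    def value(self):
        return self._count.value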