def training_step(self, batch: Tuple[Tensor, Tensor], _) -> OrderedDict:
    """Carries out a single step through the environment to update the replay buffer.
    Then calculates loss based on the minibatch received.

    Args:
        batch: current mini batch of replay data
        _: batch number, not used

    Returns:
        Training loss and log metrics
    """
    # calculates training loss
    loss = double_dqn_loss(batch, self.net, self.target_net, self.gamma)

    if self._use_dp_or_ddp2(self.trainer):
        loss = loss.unsqueeze(0)

    # Sync the target network with the online network every `sync_rate` steps (hard update)
    if self.global_step % self.sync_rate == 0:
        self.target_net.load_state_dict(self.net.state_dict())

    self.log_dict(
        {
            "total_reward": self.total_rewards[-1],
            "avg_reward": self.avg_rewards,
            "train_loss": loss,
            # "episodes": self.total_episode_steps,
        }
    )

    return OrderedDict(
        {
            "loss": loss,
            "avg_reward": self.avg_rewards,
        }
    )
def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict:
    """
    Carries out a single step through the environment to update the replay buffer.
    Then calculates loss based on the minibatch received.

    Args:
        batch: current mini batch of replay data
        _: batch number, not used

    Returns:
        Training loss and log metrics
    """
    # calculates training loss
    loss = double_dqn_loss(batch, self.net, self.target_net)

    if self.trainer.use_dp or self.trainer.use_ddp2:
        loss = loss.unsqueeze(0)

    # Sync the target network with the online network every `sync_rate` steps (hard update)
    if self.global_step % self.sync_rate == 0:
        self.target_net.load_state_dict(self.net.state_dict())

    log = {
        "total_reward": self.total_rewards[-1],
        "avg_reward": self.avg_rewards,
        "train_loss": loss,
        # "episodes": self.total_episode_steps,
    }
    status = {
        "steps": self.global_step,
        "avg_reward": self.avg_rewards,
        "total_reward": self.total_rewards[-1],
        "episodes": self.done_episodes,
        # "episode_steps": self.episode_steps,
        "epsilon": self.agent.epsilon,
    }

    return OrderedDict(
        {
            "loss": loss,
            "avg_reward": self.avg_rewards,
            "log": log,
            "progress_bar": status,
        }
    )
def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict:
    """
    Carries out a single step through the environment to update the replay buffer.
    Then calculates loss based on the minibatch received.

    Args:
        batch: current mini batch of replay data
        _: batch number, not used

    Returns:
        Training loss and log metrics
    """
    self.agent.update_epsilon(self.global_step)

    # step through environment with agent and add to buffer
    exp, reward, done = self.source.step(self.device)
    self.buffer.append(exp)

    self.episode_reward += reward
    self.episode_steps += 1

    # calculates training loss
    loss = double_dqn_loss(batch, self.net, self.target_net)

    if self.trainer.use_dp or self.trainer.use_ddp2:
        loss = loss.unsqueeze(0)

    if done:
        self.total_reward = self.episode_reward
        self.reward_list.append(self.total_reward)
        self.avg_reward = sum(self.reward_list[-100:]) / 100
        self.episode_count += 1
        self.episode_reward = 0
        self.total_episode_steps = self.episode_steps
        self.episode_steps = 0

    # Sync the target network with the online network every `sync_rate` steps (hard update)
    if self.global_step % self.sync_rate == 0:
        self.target_net.load_state_dict(self.net.state_dict())

    log = {
        "total_reward": self.total_reward,
        "avg_reward": self.avg_reward,
        "train_loss": loss,
        "episode_steps": self.total_episode_steps,
    }
    status = {
        "steps": self.global_step,
        "avg_reward": self.avg_reward,
        "total_reward": self.total_reward,
        "episodes": self.episode_count,
        "episode_steps": self.episode_steps,
        "epsilon": self.agent.epsilon,
    }

    return OrderedDict(
        {
            "loss": loss,
            "avg_reward": self.avg_reward,
            "log": log,
            "progress_bar": status,
        }
    )
def test_double_dqn_loss(self):
    """Test the double dqn loss function"""
    loss = double_dqn_loss(self.batch, self.net, self.target_net)

    self.assertIsInstance(loss, torch.Tensor)
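# For context, a minimal sketch of what `double_dqn_loss` might compute. This is an
# assumption, not the library's confirmed implementation: the replay batch is assumed to
# unpack as (states, actions, rewards, dones, next_states), and the defining double-DQN
# trait is that the online network selects the next action while the target network
# evaluates it.
from typing import Tuple

import torch
from torch import Tensor, nn


def double_dqn_loss(
    batch: Tuple[Tensor, Tensor, Tensor, Tensor, Tensor],
    net: nn.Module,
    target_net: nn.Module,
    gamma: float = 0.99,
) -> Tensor:
    """Double DQN loss: the online net picks the next action, the target net scores it."""
    states, actions, rewards, dones, next_states = batch

    # Q(s, a) of the actions actually taken, from the online network
    state_action_values = net(states).gather(1, actions.long().unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # online network chooses the greedy next action...
        next_actions = net(next_states).argmax(dim=1, keepdim=True)
        # ...but the target network provides its value (decoupled selection and evaluation)
        next_state_values = target_net(next_states).gather(1, next_actions).squeeze(-1)
        next_state_values[dones] = 0.0  # no bootstrapping past terminal states

    expected_state_action_values = rewards + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_state_action_values)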