class TestValueAgent(TestCase):

    def setUp(self) -> None:
        self.env = gym.make("CartPole-v0")
        self.net = Mock(return_value=torch.Tensor([[0.0, 100.0]]))
        self.state = torch.tensor(self.env.reset())
        self.device = self.state.device
        self.value_agent = ValueAgent(self.net, self.env.action_space.n)

    def test_value_agent(self):
        action = self.value_agent(self.state, self.device)
        self.assertIsInstance(action, int)

    def test_value_agent_get_action(self):
        action = self.value_agent.get_action(self.state, self.device)
        self.assertIsInstance(action, int)
        self.assertEqual(action, 1)

    def test_value_agent_random(self):
        action = self.value_agent.get_random_action()
        self.assertIsInstance(action, int)
class TestValueAgent(TestCase):

    def setUp(self) -> None:
        self.env = gym.make("CartPole-v0")
        self.net = Mock(return_value=Tensor([[0.0, 100.0]]))
        self.state = [self.env.reset()]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.value_agent = ValueAgent(self.net, self.env.action_space.n)

    def test_value_agent(self):
        action = self.value_agent(self.state, self.device)
        self.assertIsInstance(action, list)
        self.assertIsInstance(action[0], int)

    def test_value_agent_get_action(self):
        action = self.value_agent.get_action(self.state, self.device)
        self.assertIsInstance(action, np.ndarray)
        self.assertEqual(action[0], 1)

    def test_value_agent_random(self):
        action = self.value_agent.get_random_action(self.state)
        self.assertIsInstance(action[0], int)
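# --- Illustrative sketch (not the pl_bolts implementation) ------------------------------
# A minimal epsilon-greedy agent with the interface the tests above exercise: calling the
# agent returns a list of actions, get_action returns the argmax actions as an ndarray
# (index 1 for the mocked q-values [[0.0, 100.0]]), and get_random_action samples one
# random action per state. Class name and internals are assumptions made for illustration.
import numpy as np
import torch


class SketchValueAgent:

    def __init__(self, net, action_space: int, eps_start: float = 1.0,
                 eps_end: float = 0.02, eps_frames: int = 1000) -> None:
        self.net = net
        self.action_space = action_space
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_frames = eps_frames
        self.epsilon = eps_start

    def __call__(self, states, device: torch.device) -> list:
        # explore with probability epsilon, otherwise act greedily
        if np.random.random() < self.epsilon:
            return self.get_random_action(states)
        return [int(action) for action in self.get_action(states, device)]

    def get_action(self, states, device: torch.device) -> np.ndarray:
        # greedy action: argmax over the network's q-values for each state in the batch
        if not isinstance(states, torch.Tensor):
            states = torch.tensor(np.array(states), device=device)
        q_values = self.net(states.float())
        return q_values.argmax(dim=-1).cpu().numpy()

    def get_random_action(self, states) -> list:
        # one uniformly sampled action per state in the batch
        return [np.random.randint(0, self.action_space) for _ in states]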
class DQN(pl.LightningModule):
    """
    Basic DQN Model

    PyTorch Lightning implementation of `DQN <https://arxiv.org/abs/1312.5602>`_

    Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves,
    Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller.

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.dqn_model import DQN
        ...
        >>> model = DQN("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Note:
        This example is based on:
        https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter06/02_dqn_pong.py

    Note:
        Currently only supports CPU and single GPU training with `distributed_backend=dp`
    """

    def __init__(
        self,
        env: str,
        eps_start: float = 1.0,
        eps_end: float = 0.02,
        eps_last_frame: int = 150000,
        sync_rate: int = 1000,
        gamma: float = 0.99,
        learning_rate: float = 1e-4,
        batch_size: int = 32,
        replay_size: int = 100000,
        warm_start_size: int = 10000,
        avg_reward_len: int = 100,
        min_episode_reward: int = -21,
        seed: int = 123,
        batches_per_epoch: int = 1000,
        n_steps: int = 1,
        **kwargs,
    ):
        """
        Args:
            env: gym environment tag
            eps_start: starting value of epsilon for the epsilon-greedy exploration
            eps_end: final value of epsilon for the epsilon-greedy exploration
            eps_last_frame: the final frame for the decrease of epsilon. At this frame epsilon = eps_end
            sync_rate: the number of iterations between syncing up the target network with the train network
            gamma: discount factor
            learning_rate: learning rate
            batch_size: size of minibatch pulled from the DataLoader
            replay_size: total capacity of the replay buffer
            warm_start_size: how many random steps through the environment to be carried out at the start of
                training to fill the buffer with a starting point
            avg_reward_len: how many episodes to take into account when calculating the avg reward
            min_episode_reward: the minimum score that can be achieved in an episode. Used for filling the
                avg buffer before training begins
            seed: seed value for all RNG used
            batches_per_epoch: number of batches per epoch
            n_steps: size of n step look ahead
        """
        super().__init__()

        # Environment
        self.exp = None
        self.env = self.make_environment(env, seed)
        self.test_env = self.make_environment(env)

        self.obs_shape = self.env.observation_space.shape
        self.n_actions = self.env.action_space.n

        # Model Attributes
        self.buffer = None
        self.dataset = None

        self.net = None
        self.target_net = None
        self.build_networks()

        self.agent = ValueAgent(
            self.net,
            self.n_actions,
            eps_start=eps_start,
            eps_end=eps_end,
            eps_frames=eps_last_frame,
        )

        # Hyperparameters
        self.sync_rate = sync_rate
        self.gamma = gamma
        self.lr = learning_rate
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.warm_start_size = warm_start_size
        self.batches_per_epoch = batches_per_epoch
        self.n_steps = n_steps

        self.save_hyperparameters()

        # Metrics
        self.total_episode_steps = [0]
        self.total_rewards = [0]
        self.done_episodes = 0
        self.total_steps = 0

        # Average Rewards
        self.avg_reward_len = avg_reward_len

        for _ in range(avg_reward_len):
            self.total_rewards.append(torch.tensor(min_episode_reward, device=self.device))

        self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:]))

        self.state = self.env.reset()

    def run_n_episodes(self, env, n_episodes: int = 1, epsilon: float = 1.0) -> List[int]:
        """
        Carries out N episodes of the environment with the current agent

        Args:
            env: environment to use, either train environment or test environment
            n_episodes: number of episodes to run
            epsilon: epsilon value for DQN agent
        """
        total_rewards = []

        for _ in range(n_episodes):
            episode_state = env.reset()
            done = False
            episode_reward = 0

            while not done:
                self.agent.epsilon = epsilon
                action = self.agent(episode_state, self.device)
                next_state, reward, done, _ = env.step(action[0])
                episode_state = next_state
                episode_reward += reward

            total_rewards.append(episode_reward)

        return total_rewards

    def populate(self, warm_start: int) -> None:
        """Populates the buffer with initial experience"""
        if warm_start > 0:
            self.state = self.env.reset()

            for _ in range(warm_start):
                self.agent.epsilon = 1.0
                action = self.agent(self.state, self.device)
                next_state, reward, done, _ = self.env.step(action[0])
                exp = Experience(state=self.state, action=action[0], reward=reward, done=done, new_state=next_state)
                self.buffer.append(exp)
                self.state = next_state

                if done:
                    self.state = self.env.reset()

    def build_networks(self) -> None:
        """Initializes the DQN train and target networks"""
        self.net = CNN(self.obs_shape, self.n_actions)
        self.target_net = CNN(self.obs_shape, self.n_actions)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Passes in a state x through the network and gets the q_values of each action as an output

        Args:
            x: environment state

        Returns:
            q values
        """
        output = self.net(x)
        return output

    def train_batch(
        self,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Contains the logic for generating a new batch of data to be passed to the DataLoader

        Returns:
            yields an Experience tuple containing the state, action, reward, done and next_state.
        """
        episode_reward = 0
        episode_steps = 0

        while True:
            self.total_steps += 1
            action = self.agent(self.state, self.device)

            next_state, r, is_done, _ = self.env.step(action[0])

            episode_reward += r
            episode_steps += 1

            exp = Experience(state=self.state, action=action[0], reward=r, done=is_done, new_state=next_state)

            self.agent.update_epsilon(self.global_step)
            self.buffer.append(exp)
            self.state = next_state

            if is_done:
                self.done_episodes += 1
                self.total_rewards.append(episode_reward)
                self.total_episode_steps.append(episode_steps)
                self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:]))
                self.state = self.env.reset()
                episode_steps = 0
                episode_reward = 0

            states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size)

            for idx, _ in enumerate(dones):
                yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx]

            # Simulates epochs
            if self.total_steps % self.batches_per_epoch == 0:
                break

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict:
        """
        Carries out a single step through the environment to update the replay buffer.
        Then calculates loss based on the minibatch received.

        Args:
            batch: current mini batch of replay data
            _: batch number, not used

        Returns:
            Training loss and log metrics
        """
        # calculates training loss
        loss = dqn_loss(batch, self.net, self.target_net)

        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss = loss.unsqueeze(0)

        # Soft update of target network
        if self.global_step % self.sync_rate == 0:
            self.target_net.load_state_dict(self.net.state_dict())

        self.log_dict({
            "total_reward": self.total_rewards[-1],
            "avg_reward": self.avg_rewards,
            "train_loss": loss,
            "episodes": self.done_episodes,
            "episode_steps": self.total_episode_steps[-1],
        })

        return OrderedDict({
            "loss": loss,
            "avg_reward": self.avg_rewards,
        })

    def test_step(self, *args, **kwargs) -> Dict[str, torch.Tensor]:
        """Evaluate the agent for one episode"""
        test_reward = self.run_n_episodes(self.test_env, 1, 0)
        avg_reward = sum(test_reward) / len(test_reward)
        return {"test_reward": avg_reward}

    def test_epoch_end(self, outputs) -> Dict[str, torch.Tensor]:
        """Log the avg of the test results"""
        rewards = [x["test_reward"] for x in outputs]
        avg_reward = sum(rewards) / len(rewards)
        self.log("avg_test_reward", avg_reward)
        return {"avg_test_reward": avg_reward}

    def configure_optimizers(self) -> List[Optimizer]:
        """Initialize Adam optimizer"""
        optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
        return [optimizer]

    def _dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        self.buffer = MultiStepBuffer(self.replay_size, self.n_steps)
        self.populate(self.warm_start_size)

        self.dataset = ExperienceSourceDataset(self.train_batch)
        return DataLoader(dataset=self.dataset, batch_size=self.batch_size)

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return self._dataloader()

    def test_dataloader(self) -> DataLoader:
        """Get test loader"""
        return self._dataloader()

    @staticmethod
    def make_environment(env_name: str, seed: Optional[int] = None) -> Env:
        """
        Initialise gym environment

        Args:
            env_name: environment name or tag
            seed: value to seed the environment RNG for reproducibility

        Returns:
            gym environment
        """
        env = make_environment(env_name)
        if seed:
            env.seed(seed)
        return env

    @staticmethod
    def add_model_specific_args(
        arg_parser: argparse.ArgumentParser,
    ) -> argparse.ArgumentParser:
        """
        Adds arguments for DQN model

        Note:
            These params are fine tuned for Pong env.

        Args:
            arg_parser: parent parser
        """
        arg_parser.add_argument(
            "--sync_rate",
            type=int,
            default=1000,
            help="how many frames do we update the target network",
        )
        arg_parser.add_argument(
            "--replay_size",
            type=int,
            default=100000,
            help="capacity of the replay buffer",
        )
        arg_parser.add_argument(
            "--warm_start_size",
            type=int,
            default=10000,
            help="how many samples do we use to fill our buffer at the start of training",
        )
        arg_parser.add_argument(
            "--eps_last_frame",
            type=int,
            default=150000,
            help="what frame should epsilon stop decaying",
        )
        arg_parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon")
        arg_parser.add_argument("--eps_end", type=float, default=0.02, help="final value of epsilon")
        arg_parser.add_argument("--batches_per_epoch", type=int, default=10000, help="number of batches in an epoch")
        arg_parser.add_argument("--batch_size", type=int, default=32, help="size of the batches")
        arg_parser.add_argument("--lr", type=float, default=1e-4, help="learning rate")
        arg_parser.add_argument("--env", type=str, required=True, help="gym environment tag")
        arg_parser.add_argument("--gamma", type=float, default=0.99, help="discount factor")
        arg_parser.add_argument(
            "--avg_reward_len",
            type=int,
            default=100,
            help="how many episodes to include in avg reward",
        )
        arg_parser.add_argument(
            "--n_steps",
            type=int,
            default=1,
            help="size of the n-step look ahead",
        )

        return arg_parser
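# --- Illustrative usage sketch ----------------------------------------------------------
# How the model above could be trained from the command line, combining the documented
# example (DQN("PongNoFrameskip-v4"); Trainer().fit(model)) with add_model_specific_args.
# The Trainer flags and this entry-point layout are assumptions, not part of the module;
# argparse and pytorch_lightning (as pl) are assumed to be imported at module level.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = DQN.add_model_specific_args(parser)
    args = parser.parse_args()  # e.g. --env PongNoFrameskip-v4

    hparams = vars(args)
    # the parser exposes --lr while the constructor expects learning_rate
    hparams["learning_rate"] = hparams.pop("lr")

    model = DQN(**hparams)

    # CPU or single-GPU training, as noted in the class docstring
    trainer = pl.Trainer(max_epochs=1)
    trainer.fit(model)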
def __init__(
    self,
    env: str,
    eps_start: float = 1.0,
    eps_end: float = 0.02,
    eps_last_frame: int = 150000,
    sync_rate: int = 1000,
    gamma: float = 0.99,
    learning_rate: float = 1e-4,
    batch_size: int = 32,
    replay_size: int = 100000,
    warm_start_size: int = 10000,
    avg_reward_len: int = 100,
    min_episode_reward: int = -21,
    seed: int = 123,
    batches_per_epoch: int = 1000,
    n_steps: int = 1,
    **kwargs,
):
    """
    PyTorch Lightning implementation of `DQN <https://arxiv.org/abs/1312.5602>`_

    Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves,
    Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller.

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.dqn_model import DQN
        ...
        >>> model = DQN("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        eps_start: starting value of epsilon for the epsilon-greedy exploration
        eps_end: final value of epsilon for the epsilon-greedy exploration
        eps_last_frame: the final frame for the decrease of epsilon. At this frame epsilon = eps_end
        sync_rate: the number of iterations between syncing up the target network with the train network
        gamma: discount factor
        learning_rate: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        replay_size: total capacity of the replay buffer
        warm_start_size: how many random steps through the environment to be carried out at the start of
            training to fill the buffer with a starting point
        avg_reward_len: how many episodes to take into account when calculating the avg reward
        min_episode_reward: the minimum score that can be achieved in an episode. Used for filling the
            avg buffer before training begins
        seed: seed value for all RNG used
        batches_per_epoch: number of batches per epoch
        n_steps: size of n step look ahead

    Note:
        This example is based on:
        https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter06/02_dqn_pong.py

    Note:
        Currently only supports CPU and single GPU training with `distributed_backend=dp`
    """
    super().__init__()

    # Environment
    self.exp = None
    self.env = self.make_environment(env, seed)
    self.test_env = self.make_environment(env)

    self.obs_shape = self.env.observation_space.shape
    self.n_actions = self.env.action_space.n

    # Model Attributes
    self.buffer = None
    self.dataset = None

    self.net = None
    self.target_net = None
    self.build_networks()

    self.agent = ValueAgent(
        self.net,
        self.n_actions,
        eps_start=eps_start,
        eps_end=eps_end,
        eps_frames=eps_last_frame,
    )

    # Hyperparameters
    self.sync_rate = sync_rate
    self.gamma = gamma
    self.lr = learning_rate
    self.batch_size = batch_size
    self.replay_size = replay_size
    self.warm_start_size = warm_start_size
    self.batches_per_epoch = batches_per_epoch
    self.n_steps = n_steps

    self.save_hyperparameters()

    # Metrics
    self.total_episode_steps = [0]
    self.total_rewards = [0]
    self.done_episodes = 0
    self.total_steps = 0

    # Average Rewards
    self.avg_reward_len = avg_reward_len

    for _ in range(avg_reward_len):
        self.total_rewards.append(torch.tensor(min_episode_reward, device=self.device))

    self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:]))

    self.state = self.env.reset()
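# The eps_start / eps_end / eps_last_frame hyperparameters above parameterise the
# exploration schedule handed to ValueAgent. The helper below sketches the usual linear
# annealing; it is an assumption made for illustration, since ValueAgent.update_epsilon
# is not shown in this file.
def linear_epsilon(step: int, eps_start: float = 1.0, eps_end: float = 0.02,
                   eps_last_frame: int = 150000) -> float:
    """Linearly anneal epsilon from eps_start to eps_end over eps_last_frame steps, then hold."""
    fraction = min(step / eps_last_frame, 1.0)
    return eps_start + fraction * (eps_end - eps_start)


# e.g. linear_epsilon(0) == 1.0, linear_epsilon(75000) == 0.51, linear_epsilon(150000) == 0.02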
class DQN(pl.LightningModule):
    """ Basic DQN Model """

    def __init__(
        self,
        env: str,
        gpus: int = 0,
        eps_start: float = 1.0,
        eps_end: float = 0.02,
        eps_last_frame: int = 150000,
        sync_rate: int = 1000,
        gamma: float = 0.99,
        learning_rate: float = 1e-4,
        batch_size: int = 32,
        replay_size: int = 100000,
        warm_start_size: int = 10000,
        num_samples: int = 500,
        **kwargs,
    ):
        """
        PyTorch Lightning implementation of `DQN <https://arxiv.org/abs/1312.5602>`_

        Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves,
        Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller.

        Model implemented by:

            - `Donal Byrne <https://github.com/djbyrne>`

        Example:
            >>> from pl_bolts.models.rl.dqn_model import DQN
            ...
            >>> model = DQN("PongNoFrameskip-v4")

        Train::

            trainer = Trainer()
            trainer.fit(model)

        Args:
            env: gym environment tag
            gpus: number of gpus being used
            eps_start: starting value of epsilon for the epsilon-greedy exploration
            eps_end: final value of epsilon for the epsilon-greedy exploration
            eps_last_frame: the final frame for the decrease of epsilon. At this frame epsilon = eps_end
            sync_rate: the number of iterations between syncing up the target network with the train network
            gamma: discount factor
            learning_rate: learning rate
            batch_size: size of minibatch pulled from the DataLoader
            replay_size: total capacity of the replay buffer
            warm_start_size: how many random steps through the environment to be carried out at the start of
                training to fill the buffer with a starting point
            num_samples: the number of samples to pull from the dataset iterator and feed to the DataLoader

        .. note::
            This example is based on:
            https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition\
            /blob/master/Chapter06/02_dqn_pong.py

        .. note:: Currently only supports CPU and single GPU training with `distributed_backend=dp`
        """
        super().__init__()

        # Environment
        self.env = wrappers.make_env(env)
        self.env.seed(123)
        self.obs_shape = self.env.observation_space.shape
        self.n_actions = self.env.action_space.n

        # Model Attributes
        self.buffer = None
        self.source = None
        self.dataset = None

        self.net = None
        self.target_net = None
        self.build_networks()

        self.agent = ValueAgent(
            self.net,
            self.n_actions,
            eps_start=eps_start,
            eps_end=eps_end,
            eps_frames=eps_last_frame,
        )

        # Hyperparameters
        self.sync_rate = sync_rate
        self.gamma = gamma
        self.lr = learning_rate
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.warm_start_size = warm_start_size
        self.sample_len = num_samples

        self.save_hyperparameters()

        # Metrics
        self.total_reward = 0
        self.episode_reward = 0
        self.episode_count = 0
        self.episode_steps = 0
        self.total_episode_steps = 0

        self.reward_list = []
        for _ in range(100):
            self.reward_list.append(-21)
        self.avg_reward = -21

    def populate(self, warm_start: int) -> None:
        """Populates the buffer with initial experience"""
        if warm_start > 0:
            for _ in range(warm_start):
                self.source.agent.epsilon = 1.0
                exp, _, _ = self.source.step()
                self.buffer.append(exp)

    def build_networks(self) -> None:
        """Initializes the DQN train and target networks"""
        self.net = CNN(self.obs_shape, self.n_actions)
        self.target_net = CNN(self.obs_shape, self.n_actions)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Passes in a state x through the network and gets the q_values of each action as an output

        Args:
            x: environment state

        Returns:
            q values
        """
        output = self.net(x)
        return output

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict:
        """
        Carries out a single step through the environment to update the replay buffer.
        Then calculates loss based on the minibatch received.

        Args:
            batch: current mini batch of replay data
            _: batch number, not used

        Returns:
            Training loss and log metrics
        """
        self.agent.update_epsilon(self.global_step)

        # step through environment with agent and add to buffer
        exp, reward, done = self.source.step()
        self.buffer.append(exp)

        self.episode_reward += reward
        self.episode_steps += 1

        # calculates training loss
        loss = dqn_loss(batch, self.net, self.target_net)

        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss = loss.unsqueeze(0)

        if done:
            self.total_reward = self.episode_reward
            self.reward_list.append(self.total_reward)
            self.avg_reward = sum(self.reward_list[-100:]) / 100
            self.episode_count += 1
            self.episode_reward = 0
            self.total_episode_steps = self.episode_steps
            self.episode_steps = 0

        # Soft update of target network
        if self.global_step % self.sync_rate == 0:
            self.target_net.load_state_dict(self.net.state_dict())

        log = {
            "total_reward": self.total_reward,
            "avg_reward": self.avg_reward,
            "train_loss": loss,
            "episode_steps": self.total_episode_steps,
        }
        status = {
            "steps": self.global_step,
            "avg_reward": self.avg_reward,
            "total_reward": self.total_reward,
            "episodes": self.episode_count,
            "episode_steps": self.episode_steps,
            "epsilon": self.agent.epsilon,
        }

        return OrderedDict(
            {
                "loss": loss,
                "avg_reward": self.avg_reward,
                "log": log,
                "progress_bar": status,
            }
        )

    def test_step(self, *args, **kwargs) -> Dict[str, torch.Tensor]:
        """Evaluate the agent for one episode"""
        self.agent.epsilon = 0.0
        test_reward = self.source.run_episode()
        return {"test_reward": test_reward}

    def test_epoch_end(self, outputs) -> Dict[str, torch.Tensor]:
        """Log the avg of the test results"""
        rewards = [x["test_reward"] for x in outputs]
        avg_reward = sum(rewards) / len(rewards)
        tensorboard_logs = {"avg_test_reward": avg_reward}
        return {"avg_test_reward": avg_reward, "log": tensorboard_logs}

    def configure_optimizers(self) -> List[Optimizer]:
        """Initialize Adam optimizer"""
        optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
        return [optimizer]

    def prepare_data(self) -> None:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        device = torch.device(self.trainer.root_gpu) if self.trainer.num_gpus >= 1 else self.device
        self.source = ExperienceSource(self.env, self.agent, device)
        self.buffer = ReplayBuffer(self.replay_size)
        self.populate(self.warm_start_size)

        self.dataset = RLDataset(self.buffer, self.sample_len)

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return DataLoader(dataset=self.dataset, batch_size=self.batch_size)

    def test_dataloader(self) -> DataLoader:
        """Get test loader"""
        return DataLoader(dataset=self.dataset, batch_size=self.batch_size)

    @staticmethod
    def add_model_specific_args(arg_parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """
        Adds arguments for DQN model

        Note:
            These params are fine tuned for Pong env.

        Args:
            arg_parser: parent parser
        """
        arg_parser.add_argument(
            "--sync_rate",
            type=int,
            default=1000,
            help="how many frames do we update the target network",
        )
        arg_parser.add_argument(
            "--replay_size",
            type=int,
            default=100000,
            help="capacity of the replay buffer",
        )
        arg_parser.add_argument(
            "--warm_start_size",
            type=int,
            default=10000,
            help="how many samples do we use to fill our buffer at the start of training",
        )
        arg_parser.add_argument(
            "--eps_last_frame",
            type=int,
            default=150000,
            help="what frame should epsilon stop decaying",
        )
        arg_parser.add_argument("--eps_start", type=float, default=1.0, help="starting value of epsilon")
        arg_parser.add_argument("--eps_end", type=float, default=0.02, help="final value of epsilon")
        arg_parser.add_argument(
            "--warm_start_steps",
            type=int,
            default=10000,
            help="how many random steps to take at the start of training to fill the buffer",
        )

        return arg_parser
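# --- Illustrative sketch (not the pl_bolts implementation) ------------------------------
# Both DQN versions above call dqn_loss(batch, self.net, self.target_net) without defining
# it here. The helper below sketches the standard one-step DQN objective such a function
# typically computes: MSE between Q(s, a) and the bootstrapped target
# r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal states.
# The name and signature are assumptions; dones is assumed to be a boolean mask.
def dqn_loss_sketch(batch, net, target_net, gamma: float = 0.99) -> torch.Tensor:
    states, actions, rewards, dones, next_states = batch

    # Q-values of the actions that were actually taken
    state_action_values = net(states).gather(1, actions.long().unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # greedy value of the next state under the periodically synced target network
        next_state_values = target_net(next_states).max(1)[0]
        next_state_values[dones] = 0.0

    expected_state_action_values = rewards + gamma * next_state_values
    return torch.nn.functional.mse_loss(state_action_values, expected_state_action_values)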