def setUp(self) -> None:
    self.env = ToTensor(gym.make("CartPole-v0"))
    self.obs_shape = self.env.observation_space.shape
    self.n_actions = self.env.action_space.n
    self.net = MLP(self.obs_shape, self.n_actions)
    self.agent = Agent(self.net)
    self.exp_source = DiscountedExperienceSource(self.env, self.agent)

    parent_parser = argparse.ArgumentParser(add_help=False)
    parent_parser = Reinforce.add_model_specific_args(parent_parser)
    args_list = [
        "--env", "CartPole-v0",
        "--batch_size", "32",
        "--gamma", "0.99",
    ]
    self.hparams = parent_parser.parse_args(args_list)

    self.model = Reinforce(**vars(self.hparams))
    self.rl_dataloader = self.model.train_dataloader()
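# A sketch of how the dataloader built in setUp could be consumed
# (illustrative, not part of the test): per Reinforce.train_batch, each batch
# carries states, actions and discounted returns (qvals).
#
#     for states, actions, qvals in self.rl_dataloader:
#         loss = self.model.loss(states, actions, qvals)
#         break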
class TestDiscountedExperienceSource(TestCase):

    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = [gym.make("CartPole-v0") for _ in range(2)]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_steps = 3
        self.gamma = 0.9
        self.source = DiscountedExperienceSource(
            self.env, self.agent, n_steps=self.n_steps, gamma=self.gamma
        )

        self.state = torch.ones(3)
        self.next_state = torch.zeros(3)
        self.reward = 1

        self.exp1 = Experience(
            state=self.state,
            action=1,
            reward=self.reward,
            done=False,
            new_state=self.next_state,
        )
        self.exp2 = Experience(
            state=self.next_state,
            action=1,
            reward=self.reward,
            done=False,
            new_state=self.state,
        )

        self.env1 = Mock()
        self.env1.step = Mock(
            return_value=(self.next_state, self.reward, True, self.state)
        )

    def test_init(self):
        """Tests that the experience source is set up correctly"""
        self.assertEqual(self.source.n_steps, self.n_steps + 1)
        self.assertEqual(self.source.steps, self.n_steps)
        self.assertEqual(self.source.gamma, self.gamma)

    def test_source_step(self):
        """Tests that the source returns a single experience"""
        for idx, exp in enumerate(self.source.runner(self.device)):
            self.assertTrue(isinstance(exp, Experience))
            break

    def test_source_step_done(self):
        """Tests that the source returns a single experience"""
        self.source = DiscountedExperienceSource(
            self.env1, self.agent, n_steps=self.n_steps
        )
        self.source.histories[0].append(self.exp1)
        self.source.histories[0].append(self.exp2)

        for idx, exp in enumerate(self.source.runner(self.device)):
            self.assertTrue(isinstance(exp, Experience))
            self.assertTrue(torch.all(torch.eq(exp.new_state, self.next_state)))
            break

    def test_source_discounted_return(self):
        """
        Tests that the source returns a single experience with discounted rewards

        discounted returns: G(t) = R(t+1) + γ*R(t+2) + γ^2*R(t+3) + ... + γ^(N-1)*R(t+N)
        """
        self.source = DiscountedExperienceSource(
            self.env1, self.agent, n_steps=self.n_steps
        )
        self.source.histories[0] += [self.exp1, self.exp2]

        discounted_reward = (
            self.exp1.reward
            + (self.source.gamma * self.exp2.reward)
            + (self.source.gamma ** 2) * self.reward
        )

        for idx, exp in enumerate(self.source.runner(self.device)):
            self.assertTrue(isinstance(exp, Experience))
            self.assertEqual(exp.reward, discounted_reward)
            break
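# A quick numeric check of the discounted return asserted in
# test_source_discounted_return above; an illustrative sketch, not part of
# the test suite. With gamma = 0.9 and three unit rewards, as in setUp:
if __name__ == "__main__":
    gamma, reward = 0.9, 1
    # G(t) = R(t+1) + gamma * R(t+2) + gamma**2 * R(t+3)
    expected = reward + gamma * reward + gamma ** 2 * reward
    print(expected)  # 2.71 (up to float rounding)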
class DQN(pl.LightningModule):
    """Basic DQN Model"""

    def __init__(
        self,
        env: str,
        eps_start: float = 1.0,
        eps_end: float = 0.02,
        eps_last_frame: int = 150000,
        sync_rate: int = 1000,
        gamma: float = 0.99,
        learning_rate: float = 1e-4,
        batch_size: int = 32,
        replay_size: int = 100000,
        warm_start_size: int = 10000,
        avg_reward_len: int = 100,
        min_episode_reward: int = -21,
        n_steps: int = 1,
        seed: int = 123,
        num_envs: int = 1,
        **kwargs,
    ):
        """
        PyTorch Lightning implementation of `DQN <https://arxiv.org/abs/1312.5602>`_

        Paper authors: Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves,
        Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller.

        Model implemented by:

            - `Donal Byrne <https://github.com/djbyrne>`_

        Example:
            >>> from pl_bolts.models.rl.dqn_model import DQN
            ...
            >>> model = DQN("PongNoFrameskip-v4")

        Train::

            trainer = Trainer()
            trainer.fit(model)

        Args:
            env: gym environment tag
            eps_start: starting value of epsilon for the epsilon-greedy exploration
            eps_end: final value of epsilon for the epsilon-greedy exploration
            eps_last_frame: the final frame of the epsilon decay. At this frame epsilon = eps_end
            sync_rate: the number of iterations between syncing up the target network with the train network
            gamma: discount factor
            learning_rate: learning rate
            batch_size: size of minibatch pulled from the DataLoader
            replay_size: total capacity of the replay buffer
            warm_start_size: how many random steps through the environment to be carried out at the start of
                training to fill the buffer with a starting point
            avg_reward_len: how many episodes to take into account when calculating the avg reward
            min_episode_reward: the minimum score that can be achieved in an episode. Used for filling the avg
                buffer before training begins
            n_steps: number of steps used by the N-step (discounted) experience source
            seed: seed value for all RNG used
            num_envs: number of environments to run the agent in at once

        Note:
            This example is based on:
            https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter06/02_dqn_pong.py

        Note:
            Currently only supports CPU and single GPU training with `distributed_backend=dp`
        """
        super().__init__()

        # Environment
        self.exp = None
        self.env = [self.make_environment(env, seed) for _ in range(num_envs)]
        self.obs_shape = self.env[0].observation_space.shape
        self.n_actions = self.env[0].action_space.n

        # Model Attributes
        self.buffer = None
        self.source = None
        self.dataset = None

        self.net = None
        self.target_net = None
        self.build_networks()

        self.agent = ValueAgent(
            self.net,
            self.n_actions,
            eps_start=eps_start,
            eps_end=eps_end,
            eps_frames=eps_last_frame,
        )
        self.source = DiscountedExperienceSource(self.env, self.agent, n_steps=n_steps)

        # Hyperparameters
        self.num_envs = num_envs
        self.sync_rate = sync_rate
        self.gamma = gamma
        self.lr = learning_rate
        self.batch_size = batch_size * num_envs
        self.replay_size = replay_size
        self.warm_start_size = warm_start_size
        self.n_steps = n_steps

        self.save_hyperparameters()

        # Metrics
        self.total_reward = 0
        self.episode_reward = 0
        self.episode_count = 0
        self.episode_steps = [0]
        self.total_episode_steps = 0
        self.total_rewards = [0]
        self.done_episodes = 0
        self.avg_reward_len = avg_reward_len
        self.reward_list = []
        for _ in range(avg_reward_len):
            self.reward_list.append(torch.tensor(min_episode_reward, device=self.device))
        self.avg_rewards = 0

    def populate(self, warm_start: int) -> None:
        """Populates the buffer with initial experience"""
        if warm_start > 0:
            for _ in range(warm_start):
                self.source.agent.epsilon = 1.0
                exp = next(self.source.runner(self.device))
                self.buffer.append(exp)
    def build_networks(self) -> None:
        """Initializes the DQN train and target networks"""
        self.net = CNN(self.obs_shape, self.n_actions)
        self.target_net = CNN(self.obs_shape, self.n_actions)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Passes in a state x through the network and gets the q_values of each action as an output

        Args:
            x: environment state

        Returns:
            q values
        """
        output = self.net(x)
        return output

    def train_batch(
        self,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Contains the logic for generating a new batch of data to be passed to the DataLoader

        Returns:
            yields an Experience tuple containing the state, action, reward, done and next_state.
        """
        for step_idx, exp in enumerate(self.source.runner(self.device)):
            self.agent.update_epsilon(self.global_step)
            self.buffer.append(exp)

            episode_reward_steps = self.source.pop_rewards_steps()

            if episode_reward_steps:
                for reward, steps in episode_reward_steps:
                    self.done_episodes += 1
                    self.total_rewards.append(reward)
                    self.episode_steps.append(steps)
                    self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:]))

            states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size)

            for idx, _ in enumerate(dones):
                yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx]

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict:
        """
        Carries out a single step through the environment to update the replay buffer.
        Then calculates loss based on the minibatch received

        Args:
            batch: current mini batch of replay data
            _: batch number, not used

        Returns:
            Training loss and log metrics
        """
        # calculates training loss
        loss = dqn_loss(batch, self.net, self.target_net)

        if self.trainer.use_dp or self.trainer.use_ddp2:
            loss = loss.unsqueeze(0)

        # Sync the target network with the train network every sync_rate steps
        if self.global_step % self.sync_rate == 0:
            self.target_net.load_state_dict(self.net.state_dict())

        log = {
            "total_reward": self.total_rewards[-1],
            "avg_reward": self.avg_rewards,
            "train_loss": loss,
            "episodes": self.done_episodes,
        }
        status = {
            "steps": self.global_step,
            "avg_reward": self.avg_rewards,
            "total_reward": self.total_rewards[-1],
            "episodes": self.done_episodes,
            "epsilon": self.agent.epsilon,
        }

        return OrderedDict({
            "loss": loss,
            "avg_reward": self.avg_rewards,
            "log": log,
            "progress_bar": status,
        })

    def test_step(self, *args, **kwargs) -> Dict[str, torch.Tensor]:
        """Evaluate the agent for 10 episodes"""
        self.agent.epsilon = 0.0
        test_reward = self.source.run_episode()
        return {"test_reward": test_reward}

    def test_epoch_end(self, outputs) -> Dict[str, torch.Tensor]:
        """Log the avg of the test results"""
        rewards = [x["test_reward"] for x in outputs]
        avg_reward = sum(rewards) / len(rewards)
        tensorboard_logs = {"avg_test_reward": avg_reward}
        return {"avg_test_reward": avg_reward, "log": tensorboard_logs}

    def configure_optimizers(self) -> List[Optimizer]:
        """Initialize Adam optimizer"""
        optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
        return [optimizer]

    def _dataloader(self) -> DataLoader:
        """Initialize the Replay Buffer dataset used for retrieving experiences"""
        self.buffer = ReplayBuffer(self.replay_size)
        self.populate(self.warm_start_size)

        self.dataset = ExperienceSourceDataset(self.train_batch)
        return DataLoader(dataset=self.dataset, batch_size=self.batch_size)

    def train_dataloader(self) -> DataLoader:
        """Get train loader"""
        return self._dataloader()

    def test_dataloader(self) -> DataLoader:
        """Get test loader"""
        return DataLoader(dataset=self.dataset, batch_size=self.batch_size)
    @staticmethod
    def make_environment(env_name: str, seed: int) -> gym.Env:
        """
        Initialise gym environment

        Args:
            env_name: environment name or tag
            seed: value to seed the environment RNG for reproducibility

        Returns:
            gym environment
        """
        env = wrappers.make_environment(env_name)
        env.seed(seed)
        return env

    @staticmethod
    def add_model_specific_args(
        arg_parser: argparse.ArgumentParser,
    ) -> argparse.ArgumentParser:
        """
        Adds arguments for DQN model

        Note:
            These params are fine-tuned for the Pong env.

        Args:
            arg_parser: parent parser
        """
        arg_parser.add_argument(
            "--sync_rate",
            type=int,
            default=1000,
            help="how many frames between syncing the target network with the train network",
        )
        arg_parser.add_argument(
            "--replay_size",
            type=int,
            default=100000,
            help="capacity of the replay buffer",
        )
        arg_parser.add_argument(
            "--warm_start_size",
            type=int,
            default=10000,
            help="how many samples do we use to fill our buffer at the start of training",
        )
        arg_parser.add_argument(
            "--eps_last_frame",
            type=int,
            default=150000,
            help="what frame should epsilon stop decaying",
        )
        arg_parser.add_argument(
            "--eps_start", type=float, default=1.0, help="starting value of epsilon"
        )
        arg_parser.add_argument(
            "--eps_end", type=float, default=0.02, help="final value of epsilon"
        )
        arg_parser.add_argument(
            "--warm_start_steps",
            type=int,
            default=10000,
            help="how many random steps to take through the environment before training starts",
        )
        return arg_parser
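# A minimal end-to-end usage sketch, not part of the model above: it mirrors
# the docstring Example together with the flags registered in
# add_model_specific_args. The flag values and Trainer settings below are
# illustrative assumptions, not prescribed defaults.
if __name__ == "__main__":
    import pytorch_lightning as pl

    parser = argparse.ArgumentParser(add_help=False)
    parser = DQN.add_model_specific_args(parser)
    hparams = parser.parse_args(["--sync_rate", "1000", "--eps_last_frame", "150000"])

    model = DQN("PongNoFrameskip-v4", **vars(hparams))
    trainer = pl.Trainer(max_steps=1000)  # CPU or single GPU, per the docstring Note
    trainer.fit(model)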
class Reinforce(pl.LightningModule):

    def __init__(
        self,
        env: str,
        gamma: float = 0.99,
        lr: float = 0.01,
        batch_size: int = 8,
        n_steps: int = 10,
        avg_reward_len: int = 100,
        num_envs: int = 1,
        entropy_beta: float = 0.01,
        epoch_len: int = 1000,
        num_batch_episodes: int = 4,
        **kwargs
    ) -> None:
        """
        PyTorch Lightning implementation of `REINFORCE <https://papers.nips.cc/paper/
        1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_

        Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour

        Model implemented by:

            - `Donal Byrne <https://github.com/djbyrne>`_

        Example:
            >>> from pl_bolts.models.rl.reinforce_model import Reinforce
            ...
            >>> model = Reinforce("PongNoFrameskip-v4")

        Train::

            trainer = Trainer()
            trainer.fit(model)

        Args:
            env: gym environment tag
            gamma: discount factor
            lr: learning rate
            batch_size: size of minibatch pulled from the DataLoader
            n_steps: number of steps used by the discounted experience source
            avg_reward_len: how many episodes to take into account when calculating the avg reward
            num_envs: number of environments to run the agent in at once
            entropy_beta: entropy coefficient
            epoch_len: how many batches make up a simulated epoch
            num_batch_episodes: how many episodes to rollout for each batch of training

        Note:
            This example is based on:
            https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/02_cartpole_reinforce.py

        Note:
            Currently only supports CPU and single GPU training with `distributed_backend=dp`
        """
        super().__init__()

        # Hyperparameters
        self.lr = lr
        self.batch_size = batch_size * num_envs
        self.batches_per_epoch = self.batch_size * epoch_len
        self.entropy_beta = entropy_beta
        self.gamma = gamma
        self.n_steps = n_steps
        self.num_batch_episodes = num_batch_episodes

        self.save_hyperparameters()

        # Model components
        self.env = [gym.make(env) for _ in range(num_envs)]
        self.net = MLP(self.env[0].observation_space.shape, self.env[0].action_space.n)
        self.agent = PolicyAgent(self.net)
        self.exp_source = DiscountedExperienceSource(
            self.env, self.agent, gamma=gamma, n_steps=self.n_steps
        )

        # Tracking metrics
        self.total_steps = 0
        self.total_rewards = [0]
        self.done_episodes = 0
        self.avg_rewards = 0
        self.reward_sum = 0.0
        self.batch_episodes = 0
        self.avg_reward_len = avg_reward_len

        self.batch_states = []
        self.batch_actions = []
        self.batch_qvals = []
        self.cur_rewards = []

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Passes in a state x through the network and gets the q_values of each action as an output

        Args:
            x: environment state

        Returns:
            q values
        """
        output = self.net(x)
        return output

    def calc_qvals(self, rewards: List[float]) -> List[float]:
        """Calculate the discounted rewards of all rewards in list

        Args:
            rewards: list of rewards from latest batch

        Returns:
            list of discounted rewards
        """
        assert isinstance(rewards[0], float)

        cumul_reward = []
        sum_r = 0.0

        for r in reversed(rewards):
            sum_r = (sum_r * self.gamma) + r
            cumul_reward.append(sum_r)

        return list(reversed(cumul_reward))

    def train_batch(
        self,
    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]:
        """
        Contains the logic for generating a new batch of data to be passed to the DataLoader

        Yield:
            yields a tuple of Lists containing tensors for states, actions and rewards of the batch.
        """
""" for exp in self.exp_source.runner(self.device): self.batch_states.append(exp.state) self.batch_actions.append(exp.action) self.cur_rewards.append(exp.reward) # Check if episode is completed and update trackers if exp.done: self.batch_qvals.extend(self.calc_qvals(self.cur_rewards)) self.cur_rewards.clear() self.batch_episodes += 1 # Check if episodes have finished and use total reward new_rewards = self.exp_source.pop_total_rewards() if new_rewards: for reward in new_rewards: self.done_episodes += 1 self.total_rewards.append(reward) self.avg_rewards = float( np.mean(self.total_rewards[-self.avg_reward_len:]) ) self.total_steps += 1 if self.batch_episodes >= self.num_batch_episodes: for state, action, qval in zip( self.batch_states, self.batch_actions, self.batch_qvals ): yield state, action, qval self.batch_episodes = 0 # Simulates epochs if self.total_steps % self.batches_per_epoch == 0: break def loss(self, states, actions, scaled_rewards) -> torch.Tensor: logits = self.net(states) # policy loss log_prob = log_softmax(logits, dim=1) log_prob_actions = scaled_rewards * log_prob[range(self.batch_size), actions] loss = -log_prob_actions.mean() return loss def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], _) -> OrderedDict: """ Carries out a single step through the environment to update the replay buffer. Then calculates loss based on the minibatch recieved Args: batch: current mini batch of replay data _: batch number, not used Returns: Training loss and log metrics """ states, actions, scaled_rewards = batch loss = self.loss(states, actions, scaled_rewards) log = { "episodes": self.done_episodes, "reward": self.total_rewards[-1], "avg_reward": self.avg_rewards, } return OrderedDict( { "loss": loss, "avg_reward": self.avg_rewards, "log": log, "progress_bar": log, } ) def configure_optimizers(self) -> List[Optimizer]: """ Initialize Adam optimizer""" optimizer = optim.Adam(self.net.parameters(), lr=self.lr) return [optimizer] def _dataloader(self) -> DataLoader: """Initialize the Replay Buffer dataset used for retrieving experiences""" dataset = ExperienceSourceDataset(self.train_batch) dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size) return dataloader def train_dataloader(self) -> DataLoader: """Get train loader""" return self._dataloader() def get_device(self, batch) -> str: """Retrieve device currently being used by minibatch""" return batch[0][0][0].device.index if self.on_gpu else "cpu" @staticmethod def add_model_specific_args(arg_parser) -> argparse.ArgumentParser: """ Adds arguments for DQN model Note: these params are fine tuned for Pong env Args: arg_parser: the current argument parser to add to Returns: arg_parser with model specific cargs added """ arg_parser.add_argument( "--entropy_beta", type=float, default=0.01, help="entropy value", ) return arg_parser