def __init__(
    self,
    env: str,
    gpus: int = 0,
    eps_start: float = 1.0,
    eps_end: float = 0.02,
    eps_last_frame: int = 150000,
    sync_rate: int = 1000,
    gamma: float = 0.99,
    learning_rate: float = 1e-4,
    batch_size: int = 32,
    replay_size: int = 100000,
    warm_start_size: int = 10000,
    num_samples: int = 500,
    n_steps: int = 4,
):
    """
    PyTorch Lightning implementation of `N-Step DQN <http://incompleteideas.net/papers/sutton-88-with-erratum.pdf>`_

    Paper authors: Richard Sutton

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.n_step_dqn.model import NStepDQN
        ...
        >>> model = NStepDQN("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        gpus: number of GPUs being used
        eps_start: starting value of epsilon for the epsilon-greedy exploration
        eps_end: final value of epsilon for the epsilon-greedy exploration
        eps_last_frame: the last frame of the epsilon decay; at this frame epsilon = eps_end
        sync_rate: number of iterations between syncing the target network with the train network
        gamma: discount factor
        learning_rate: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        replay_size: total capacity of the replay buffer
        warm_start_size: number of random steps taken through the environment at the start of
            training to seed the replay buffer
        num_samples: number of samples to pull from the dataset iterator and feed to the DataLoader
        n_steps: number of steps used in the n-step Bellman update
    """
    super().__init__(
        env,
        gpus,
        eps_start,
        eps_end,
        eps_last_frame,
        sync_rate,
        gamma,
        learning_rate,
        batch_size,
        replay_size,
        warm_start_size,
        num_samples,
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # experience source that rolls the env forward n steps and returns discounted n-step transitions
    self.source = NStepExperienceSource(self.env, self.agent, device, n_steps=n_steps)
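# For reference, a minimal sketch of the discounted n-step fold that
# ``NStepExperienceSource.get_transition_info`` is expected to perform,
# matching the arithmetic in the tests below. This is a hypothetical
# standalone helper, not the library's API; gamma=0.9 mirrors the test
# fixtures, and the buffer entries are assumed to be ``Experience``
# tuples with ``reward``, ``new_state``, and ``done`` fields.
def _n_step_transition_info_sketch(n_step_buffer, gamma: float = 0.9):
    """Fold buffered experiences into one discounted transition (sketch)."""
    last = n_step_buffer[-1]
    reward, next_state, done = last.reward, last.new_state, last.done
    # walk the buffer back to front, folding each reward in with one more
    # power of gamma; (1 - done) masks the accumulated tail when the
    # rollout ended on a terminal step
    for experience in reversed(list(n_step_buffer)[:-1]):
        reward = experience.reward + gamma * reward * (1 - done)
    return reward, next_state, done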
def test_discounted_transition(self):
    """With a full 3-step buffer, the transition reward should be the
    discounted sum r1 + gamma * r2 + gamma^2 * r3 (gamma = 0.9)."""
    self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3)
    self.source.n_step_buffer.append(self.experience01)
    self.source.n_step_buffer.append(self.experience02)
    self.source.n_step_buffer.append(self.experience03)

    reward, next_state, done = self.source.get_transition_info()

    # fold the rewards back to front with gamma = 0.9
    reward_01 = self.experience02.reward + 0.9 * self.experience03.reward * (1 - done)
    reward_gt = self.experience01.reward + 0.9 * reward_01 * (1 - done)

    self.assertEqual(reward, reward_gt)
    self.assertTrue(np.array_equal(next_state, self.next_state_02))
    self.assertEqual(self.experience03.done, done)
def test_multi_step_discount(self):
    """Stepping with a partially filled buffer should return the first
    experience combined with the discounted rewards of the later steps."""
    self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=3)
    self.source.env.step = Mock(return_value=(self.next_state_02, self.reward_02, self.done_02, Mock()))
    self.source.n_step_buffer.append(self.experience01)
    self.source.n_step_buffer.append(self.experience02)

    # r1 + 0.9 * r2 + 0.9^2 * r3 = 0 + 0.9 * 1 + 0.81 * 1 = 1.71
    reward_gt = 1.71

    exp, reward, done = self.source.step()

    self.assertTrue(np.array_equal(exp[0], self.experience01.state))
    self.assertEqual(exp[1], self.experience01.action)
    self.assertEqual(exp[2], reward_gt)
    self.assertEqual(exp[3], self.experience02.done)
    self.assertTrue(np.array_equal(exp[4], self.experience02.new_state))
def setUp(self) -> None:
    self.net = Mock()
    self.agent = DummyAgent(net=self.net)
    self.env = gym.make("CartPole-v0")
    self.n_step = 2
    self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=self.n_step)

    self.state = np.zeros([32, 32])
    self.state_02 = np.ones([32, 32])
    self.next_state = np.zeros([32, 32])
    self.next_state_02 = np.ones([32, 32])
    self.action = np.zeros([1])
    self.action_02 = np.ones([1])
    self.reward = np.zeros([1])
    self.reward_02 = np.ones([1])
    self.done = np.zeros([1])
    self.done_02 = np.zeros([1])

    self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state)
    self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
    self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)