def setUp(self) -> None:
    """Build a 2-step, gamma-discounted MultiStepBuffer plus three canned experiences."""
    self.gamma = 0.9
    self.buffer = MultiStepBuffer(capacity=10, n_steps=2, gamma=self.gamma)

    # Two fixture variants: the "01" set is all-zeros, the "02" set is all-ones
    # (done flags are zero in both so no episode terminates).
    self.state = np.zeros([32, 32])
    self.state_02 = np.ones([32, 32])
    self.next_state = np.zeros([32, 32])
    self.next_state_02 = np.ones([32, 32])
    self.action = np.zeros([1])
    self.action_02 = np.ones([1])
    self.reward = np.zeros([1])
    self.reward_02 = np.ones([1])
    self.done = np.zeros([1])
    self.done_02 = np.zeros([1])

    self.experience01 = Experience(
        self.state, self.action, self.reward, self.done, self.next_state
    )
    self.experience02 = Experience(
        self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02
    )
    self.experience03 = Experience(
        self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02
    )
def setUp(self) -> None:
    """Create a PERBuffer of capacity 10 and a single random experience to feed it."""
    self.buffer = PERBuffer(10)

    # NOTE: the two rand() calls consume global RNG state in this exact order.
    self.state = np.random.rand(32, 32)
    self.next_state = np.random.rand(32, 32)

    self.action = np.ones([1])
    self.reward = np.ones([1])
    self.done = np.zeros([1])
    self.experience = Experience(
        self.state, self.action, self.reward, self.done, self.next_state
    )
def train_batch(
    self,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Contains the logic for generating a new batch of data to be passed to the DataLoader.

    Runs the environment forever: each step appends a transition to the replay
    buffer, then samples a prioritized batch and yields it one transition at a
    time together with its buffer index and importance weight.

    Returns:
        yields a Experience tuple containing the state, action, reward, done and next_state,
        followed by the sample's buffer index and priority weight.
    """
    episode_reward = 0
    episode_steps = 0

    while True:
        self.total_steps += 1
        # Agent returns a batched action; element 0 is the single env action.
        action = self.agent(self.state, self.device)

        next_state, r, is_done, _ = self.env.step(action[0])

        episode_reward += r
        episode_steps += 1

        exp = Experience(
            state=self.state,
            action=action[0],
            reward=r,
            done=is_done,
            new_state=next_state,
        )

        # Presumably decays epsilon-greedy exploration based on training
        # progress (driven by global_step, not total_steps) — TODO confirm.
        self.agent.update_epsilon(self.global_step)
        self.buffer.append(exp)
        self.state = next_state

        if is_done:
            # Episode bookkeeping: record totals and reset counters/state.
            self.done_episodes += 1
            self.total_rewards.append(episode_reward)
            self.total_episode_steps.append(episode_steps)
            # Running mean over the last `avg_reward_len` episode rewards.
            self.avg_rewards = float(
                np.mean(self.total_rewards[-self.avg_reward_len:])
            )
            self.state = self.env.reset()
            episode_steps = 0
            episode_reward = 0

        # Sample a prioritized batch on every env step, not once per episode.
        samples, indices, weights = self.buffer.sample(self.batch_size)

        states, actions, rewards, dones, new_states = samples

        for idx, _ in enumerate(dones):
            yield (
                states[idx],
                actions[idx],
                rewards[idx],
                dones[idx],
                new_states[idx],
            ), indices[idx], weights[idx]
def setUp(self) -> None:
    """Wire up a CartPole env, a dummy agent and a 2-step experience source with fixtures."""
    self.net = Mock()
    self.agent = DummyAgent(net=self.net)
    self.env = gym.make("CartPole-v0")
    self.n_step = 2
    self.source = NStepExperienceSource(self.env, self.agent, Mock(), n_steps=self.n_step)

    # Two fixture variants: the "01" set is all-zeros, the "02" set is all-ones
    # (done flags are zero in both so no episode terminates).
    self.state = np.zeros([32, 32])
    self.state_02 = np.ones([32, 32])
    self.next_state = np.zeros([32, 32])
    self.next_state_02 = np.ones([32, 32])
    self.action = np.zeros([1])
    self.action_02 = np.ones([1])
    self.reward = np.zeros([1])
    self.reward_02 = np.ones([1])
    self.done = np.zeros([1])
    self.done_02 = np.zeros([1])

    self.experience01 = Experience(
        self.state, self.action, self.reward, self.done, self.next_state
    )
    self.experience02 = Experience(
        self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02
    )
    self.experience03 = Experience(
        self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02
    )
def setUp(self) -> None:
    """Build a 2-step MultiStepBuffer (size 10) and three canned experiences."""
    self.buffer = MultiStepBuffer(buffer_size=10, n_step=2)

    # Two fixture variants: the "01" set is all-zeros, the "02" set is all-ones
    # (done flags are zero in both so no episode terminates).
    self.state = np.zeros([32, 32])
    self.state_02 = np.ones([32, 32])
    self.next_state = np.zeros([32, 32])
    self.next_state_02 = np.ones([32, 32])
    self.action = np.zeros([1])
    self.action_02 = np.ones([1])
    self.reward = np.zeros([1])
    self.reward_02 = np.ones([1])
    self.done = np.zeros([1])
    self.done_02 = np.zeros([1])

    self.experience01 = Experience(
        self.state, self.action, self.reward, self.done, self.next_state
    )
    self.experience02 = Experience(
        self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02
    )
    self.experience03 = Experience(
        self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02
    )
def setUp(self) -> None:
    """Create a mocked experience source and a ReplayBuffer pre-filled to its warm start."""
    # NOTE: the two rand() calls consume global RNG state in this exact order.
    self.state = np.random.rand(32, 32)
    self.next_state = np.random.rand(32, 32)

    self.action = np.ones([1])
    self.reward = np.ones([1])
    self.done = np.zeros([1])
    self.experience = Experience(
        self.state, self.action, self.reward, self.done, self.next_state
    )

    # Source always hands back the same (experience, reward=0, done=False) step.
    self.source = Mock()
    self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False))

    self.warm_start = 10
    self.buffer = ReplayBuffer(20)
    for _ in range(self.warm_start):
        self.buffer.append(self.experience)
def setUp(self) -> None:
    """Create a mocked experience source and a Buffer filled with one batch of experiences."""
    # NOTE: the two rand() calls consume global RNG state in this exact order.
    self.state = np.random.rand(4, 84, 84)
    self.next_state = np.random.rand(4, 84, 84)

    self.action = np.ones([1])
    self.reward = np.ones([1])
    self.done = np.zeros([1])
    self.experience = Experience(
        self.state, self.action, self.reward, self.done, self.next_state
    )

    # Source always hands back the same (experience, reward=0, done=False) step.
    self.source = Mock()
    self.source.step = Mock(return_value=(self.experience, torch.tensor(0), False))

    self.batch_size = 8
    self.buffer = Buffer(8)
    for _ in range(self.batch_size):
        self.buffer.append(self.experience)
def step(self, device: torch.device) -> Tuple[Experience, float, bool]:
    """Takes a single step through the environment.

    Args:
        device: device the agent's policy should run on

    Returns:
        the recorded ``Experience`` for this transition, plus the raw
        reward and done flag from the environment
    """
    action = self.agent(self.state, device)
    new_state, reward, done, _ = self.env.step(action)

    experience = Experience(
        state=self.state,
        action=action,
        reward=reward,
        new_state=new_state,
        done=done,
    )

    # Either carry the new observation forward or begin a fresh episode.
    self.state = self.env.reset() if done else new_state

    return experience, reward, done
def step(self, device: torch.device) -> Experience:
    """Carries out a single step in the environment.

    Args:
        device: device the agent's policy should run on

    Returns:
        the recorded ``Experience`` for this transition
    """
    action = self.agent(self.state, device)
    new_state, reward, done, _ = self.env.step(action)

    experience = Experience(
        state=self.state,
        action=action,
        reward=reward,
        new_state=new_state,
        done=done,
    )

    # Either carry the new observation forward or begin a fresh episode.
    self.state = self.env.reset() if done else new_state

    return experience
def step(self, device: torch.device) -> Tuple[Experience, float, bool]:
    """Takes an n-step in the environment.

    Performs one step, then keeps stepping until the n-step window is full,
    and condenses the window into a single multi-step transition.

    Returns:
        the aggregated n-step ``Experience`` plus the reward and done flag
        of the first single step taken
    """
    # The very first step is kept so its raw reward/done can be reported.
    first = self.single_step(device)

    # Fill the sliding window up to n transitions.
    while len(self.n_step_buffer) < self.n_steps:
        self.single_step(device)

    accumulated_reward, final_state, final_done = self.get_transition_info()
    head = self.n_step_buffer[0]

    multi_step_experience = Experience(
        head.state, head.action, accumulated_reward, final_done, final_state
    )

    return multi_step_experience, first.reward, first.done
def test_train_batch(self):
    """The dataloader should emit 3-part tensor batches and track the fixed reward as baseline."""
    state = np.random.rand(4, 84, 84)
    fixed_exp = Experience(state=state, action=0, reward=5, done=False, new_state=state)

    # Source always returns the same transition with reward 5.
    self.source = Mock()
    self.source.step = Mock(return_value=(fixed_exp, 1, False))
    self.model.source = self.source

    xp_dataloader = self.model.train_dataloader()

    for batch in xp_dataloader:
        self.assertTrue(isinstance(batch, list))
        self.assertEqual(len(batch), 3)
        self.assertEqual(len(batch[0]), self.model.batch_size)
        self.assertEqual(self.model.baseline, 5)
        # Every component of the batch must already be a tensor.
        for part in batch[:3]:
            self.assertIsInstance(part, torch.Tensor)
        break