    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = [gym.make("CartPole-v0") for _ in range(2)]
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.n_steps = 3
        self.gamma = 0.9
        self.source = DiscountedExperienceSource(
            self.env, self.agent, n_steps=self.n_steps, gamma=self.gamma
        )

        self.state = torch.ones(3)
        self.next_state = torch.zeros(3)
        self.reward = 1

        self.exp1 = Experience(
            state=self.state,
            action=1,
            reward=self.reward,
            done=False,
            new_state=self.next_state,
        )
        self.exp2 = Experience(
            state=self.next_state,
            action=1,
            reward=self.reward,
            done=False,
            new_state=self.state,
        )

        self.env1 = Mock()
        self.env1.step = Mock(
            return_value=(self.next_state, self.reward, True, self.state)
        )
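        # A minimal sketch of the n-step discounted return this setup is meant to
        # exercise (assumption: DiscountedExperienceSource accumulates
        # sum_k gamma**k * r_k over up to n_steps transitions; the helper below is
        # illustrative only, not part of the tested API):
        #
        #     def discounted_return(rewards, gamma=0.9):
        #         return sum((gamma ** k) * r for k, r in enumerate(rewards))
        #
        #     discounted_return([1, 1, 1])  # 1 + 0.9 + 0.81 = 2.71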
    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = gym.make("CartPole-v0")
        self.n_step = 2
        self.source = NStepExperienceSource(self.env,
                                            self.agent,
                                            n_steps=self.n_step)
        self.device = torch.device('cpu')

        self.state = np.zeros([32, 32])
        self.state_02 = np.ones([32, 32])
        self.next_state = np.zeros([32, 32])
        self.next_state_02 = np.ones([32, 32])
        self.action = np.zeros([1])
        self.action_02 = np.ones([1])
        self.reward = np.zeros([1])
        self.reward_02 = np.ones([1])
        self.done = np.zeros([1])
        self.done_02 = np.zeros([1])

        self.experience01 = Experience(self.state, self.action, self.reward,
                                       self.done, self.next_state)
        self.experience02 = Experience(self.state_02, self.action_02,
                                       self.reward_02, self.done_02,
                                       self.next_state_02)
        self.experience03 = Experience(self.state_02, self.action_02,
                                       self.reward_02, self.done_02,
                                       self.next_state_02)
    def setUp(self) -> None:
        self.net = Mock()
        self.agent = DummyAgent(net=self.net)
        self.env = [gym.make("CartPole-v0") for _ in range(2)]
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.source = ExperienceSource(self.env, self.agent, n_steps=1)

        self.s1 = torch.ones(3)
        self.s2 = torch.zeros(3)

        self.mock_env = Mock()
        self.mock_env.step = Mock(return_value=(self.s1, 1, False, Mock()))

        self.exp1 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2)
        self.exp2 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2)
Example #4
    def train_batch(
        self,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        """
        Contains the logic for generating a new batch of data to be passed to the DataLoader.
        This is the same function as in the standard DQN, except that we don't update epsilon, as it is always 0;
        the exploration comes from the noisy network instead.

        Returns:
            Yields an Experience tuple containing the state, action, reward, done and next_state.
        """
        episode_reward = 0
        episode_steps = 0

        while True:
            self.total_steps += 1
            action = self.agent(self.state, self.device)

            next_state, r, is_done, _ = self.env.step(action[0])

            episode_reward += r
            episode_steps += 1

            exp = Experience(state=self.state,
                             action=action[0],
                             reward=r,
                             done=is_done,
                             new_state=next_state)

            self.buffer.append(exp)
            self.state = next_state

            if is_done:
                self.done_episodes += 1
                self.total_rewards.append(episode_reward)
                self.total_episode_steps.append(episode_steps)
                self.avg_rewards = float(
                    np.mean(self.total_rewards[-self.avg_reward_len:]))
                self.state = self.env.reset()
                episode_steps = 0
                episode_reward = 0

            states, actions, rewards, dones, new_states = self.buffer.sample(
                self.batch_size)

            for idx, _ in enumerate(dones):
                yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx]

            # Simulates epochs
            if self.total_steps % self.batches_per_epoch == 0:
                break
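    # A minimal sketch of how a generator like train_batch above is typically
    # consumed (assumptions: the IterableDataset wrapper `GeneratorDataset` is
    # hypothetical, and `model`/`batch_size` stand in for attributes of the
    # surrounding module):
    #
    #     from torch.utils.data import DataLoader, IterableDataset
    #
    #     class GeneratorDataset(IterableDataset):
    #         def __init__(self, generator_fn):
    #             self.generator_fn = generator_fn
    #
    #         def __iter__(self):
    #             return iter(self.generator_fn())
    #
    #     dataloader = DataLoader(GeneratorDataset(model.train_batch),
    #                             batch_size=model.batch_size)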
    def train_batch(
            self,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Contains the logic for generating a new batch of data to be passed to the DataLoader.

        Returns:
            Yields an Experience tuple containing the state, action, reward, done and next_state.
        """
        episode_reward = 0
        episode_steps = 0

        while True:
            self.total_steps += 1
            action = self.agent(self.state, self.env, self.device)

            next_state, reward, is_done, info = self.env.step(action)

            episode_reward += reward
            episode_steps += 1

            exp = Experience(
                state=self.state,
                action=action,
                reward=reward,
                done=is_done,
                new_state=next_state,
            )

            self.agent.update_epsilon(self.global_step)
            self.buffer.append(exp)
            self.state = next_state

            if is_done:
                self.done_episodes += 1

                self.total_rewards.append(episode_reward)
                self.total_step_count.append(episode_steps)
                self.total_at_goal.append(info["at_goal"])

                self.state = self.env.reset()
                episode_steps = 0
                episode_reward = 0

            states, actions, rewards, dones, new_states = self.buffer.sample(
                self.batch_size
            )

            for i, _ in enumerate(dones):
                yield states[i], actions[i], rewards[i], dones[i], new_states[i]

            # Simulates epochs
            if self.total_steps % self.batches_per_epoch == 0:
                break
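    # A minimal sketch of the epsilon schedule assumed by update_epsilon() above
    # (assumption: linear decay from eps_start to eps_end over eps_last_frame
    # steps; the real agent's schedule may differ):
    #
    #     def update_epsilon(self, step: int) -> None:
    #         self.epsilon = max(
    #             self.eps_end,
    #             self.eps_start - step / self.eps_last_frame * (self.eps_start - self.eps_end),
    #         )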
Example #6
    def populate(self, warm_start: int) -> None:
        """Populates the buffer with initial experience."""
        if warm_start > 0:
            self.state = self.env.reset()

            for _ in range(warm_start):
                action = self.agent(self.state, self.device)
                next_state, reward, done, _ = self.env.step(action[0])
                exp = Experience(state=self.state, action=action[0], reward=reward,
                                 done=done, new_state=next_state)
                self.buffer.append(exp)
                self.state = next_state

                if done:
                    self.state = self.env.reset()
    def populate(self, warm_start: int) -> None:
        """Populates the buffer with initial experiences"""
        if warm_start > 0:
            self.state = self.env.reset()

            for _ in tqdm(range(warm_start), desc="Populate replay buffer"):
                if not self.resumed:
                    self.agent.epsilon = 1.0
                action = self.agent(self.state, self.env, self.device)
                next_state, reward, done, _ = self.env.step(action)
                exp = Experience(
                    state=self.state,
                    action=action,
                    reward=reward,
                    done=done,
                    new_state=next_state,
                )
                self.buffer.append(exp)
                self.state = next_state

                if done:
                    self.state = self.env.reset()
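    # A minimal usage sketch for populate() above (assumptions: `model` is the
    # surrounding module, `warm_start_size` is its configured warm-start
    # hyperparameter, and the replay buffer's capacity is at least that large;
    # all names are illustrative):
    #
    #     model.populate(warm_start=warm_start_size)
    #     assert len(model.buffer) == warm_start_size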