Example #1
def test_measure_RL_performance_batched_env():
    batch_size = 3
    start = [0 for i in range(batch_size)]
    target = 5
    env = EnvDataset(
        SyncVectorEnv([
            partial(DummyEnvironment,
                    start=start[i],
                    target=target,
                    max_value=target * 2) for i in range(batch_size)
        ]))
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for step, obs in enumerate(itertools.islice(env, 100)):
        print(f"step {step} obs: {obs}")
        action = np.ones(batch_size)
        reward = env.send(action)
        # print(obs, reward, done, info)
    from collections import defaultdict
    from sequoia.common.metrics import Metrics

    expected_metrics = defaultdict(Metrics)
    for i in range(101):
        for env_index in range(batch_size):
            if i and i % target == 0:
                expected_metrics[i] += EpisodeMetrics(
                    n_samples=1,
                    # FIXME: Actually understand this condition.
                    mean_episode_reward=10.0,
                    mean_episode_length=target,
                )

            # FIXME: This test is a bit too complicated, hard to follow. I'll keep the
            # batches synced-up for now.
            # if i > 0 and (i + env_index) % target == 0:
            #     expected_metrics[i] += EpisodeMetrics(
            #         n_samples=1,
            #         mean_episode_reward=sum(target - (i + env_index % target) for j in range(start[env_index], target)),
            #         mean_episode_length=target - start[env_index] - 1
            #     )

    assert env.get_online_performance() == expected_metrics
Example #2
    def get_metrics(
            self, action: Union[Actions, Any], reward: Union[Rewards, Any],
            done: Union[bool, Sequence[bool]]) -> Optional[EpisodeMetrics]:
        metrics = []

        rewards = reward.y if isinstance(reward, Rewards) else reward
        actions = action.y_pred if isinstance(action, Actions) else action
        dones: Sequence[bool]
        if not self.is_batched_env:
            rewards = [rewards]
            actions = [actions]
            assert isinstance(done, bool)
            dones = [done]
        else:
            assert isinstance(done, (np.ndarray, Tensor))
            dones = done

        for env_index, (env_is_done, env_reward) in enumerate(zip(dones, rewards)):
            if env_is_done:
                metrics.append(
                    EpisodeMetrics(
                        n_samples=1,
                        # The average reward per episode.
                        mean_episode_reward=self._current_episode_reward[env_index],
                        # The average length of each episode.
                        mean_episode_length=self._current_episode_steps[env_index],
                    ))
                self._current_episode_reward[env_index] = 0
                self._current_episode_steps[env_index] = 0

        if not metrics:
            return None

        metric = sum(metrics, Metrics())
        if wandb.run:
            log_dict = metric.to_log_dict()
            if self.wandb_prefix:
                log_dict = add_prefix(log_dict,
                                      prefix=self.wandb_prefix,
                                      sep="/")
            log_dict["steps"] = self._steps
            log_dict["episode"] = self._episodes
            wandb.log(log_dict)

        return metric
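Since get_metrics() only reads the per-environment counters, the surrounding bookkeeping may be easier to follow in isolation. The snippet below is a hypothetical, self-contained sketch (plain dicts instead of EpisodeMetrics, made-up single-env transitions), showing how per-step rewards turn into entries keyed by the global step count, which is what get_online_performance() is compared against in these tests.

# Hypothetical illustration only: not part of the wrapper. Plain dicts stand
# in for EpisodeMetrics, and the (reward, done) transitions are made up.
fake_transitions = [(1.0, False), (1.0, False), (3.0, True), (1.0, False), (2.0, True)]

online_performance = {}
episode_reward, episode_length = 0.0, 0
for step, (reward, done) in enumerate(fake_transitions, start=1):
    episode_reward += reward
    episode_length += 1
    if done:
        # Same convention as the tests: key = global step at which the episode ended.
        online_performance[step] = {
            "mean_episode_reward": episode_reward,  # 5.0 at step 3, 3.0 at step 5
            "mean_episode_length": episode_length,  # 3 at step 3, 2 at step 5
        }
        episode_reward, episode_length = 0.0, 0

print(online_performance)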
Example #3
def test_measure_RL_performance_basics():
    env = DummyEnvironment(start=0, target=5, max_value=10)

    from sequoia.settings.active.continual.continual_rl_setting import \
        ContinualRLSetting

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        obs = env.reset()
        print(f"Episode {episode}, obs: {obs}")
        done = False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)
    from itertools import accumulate

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
            all_episode_steps, accumulate(all_episode_steps),
            all_episode_rewards):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics
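As a quick sanity check on the keying convention in the test above, here is a tiny self-contained illustration (with made-up episode lengths) of how accumulate turns per-episode lengths into the cumulative step counts used as keys of expected_metrics:

# Illustration only: the episode lengths are made up.
from itertools import accumulate

episode_lengths = [5, 7, 5, 6, 5]
end_steps = list(accumulate(episode_lengths))
print(end_steps)  # [5, 12, 17, 23, 28]
# expected_metrics (and get_online_performance()) use exactly these cumulative
# step counts as dictionary keys, one entry per finished episode.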
Example #4
def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    from gym.wrappers import TimeLimit
    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
            all_episode_steps, accumulate(all_episode_steps),
            all_episode_rewards):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics
Example #5
    def get_results(self) -> TaskSequenceResults[EpisodeMetrics]:
        # TODO: Place the metrics in the right 'bin' at the end of each episode during
        # testing depending on the task at that time, rather than what's happening here,
        # where we're getting all the rewards and episode lengths at the end and then
        # sort it out into the bins based on the task schedule. ALSO: this would make it
        # easier to support monitoring batched RL environments, since these `Monitor`
        # methods (get_episode_rewards, get_episode_lengths, etc) assume the environment
        # isn't batched.
        rewards = self.get_episode_rewards()
        lengths = self.get_episode_lengths()

        task_schedule: Dict[int, Dict] = self.task_schedule
        task_steps = sorted(task_schedule.keys())
        # Remove the last entry, since it's the terminal state.
        task_steps.pop(-1)

        assert 0 in task_steps
        import bisect
        nb_tasks = len(task_steps)
        assert nb_tasks >= 1

        test_results = TaskSequenceResults([TaskResults() for _ in range(nb_tasks)])
        # TODO: Fix this, since the task id might not be related to the steps!
        for step, episode_reward, episode_length in zip(
            itertools.accumulate(lengths), rewards, lengths
        ):
            # Given the step, find the task id.
            task_id = bisect.bisect_right(task_steps, step) - 1
            
            episode_metric = EpisodeMetrics(
                n_samples=1,
                mean_episode_reward=episode_reward,
                mean_episode_length=episode_length,
            )
            
            test_results.task_results[task_id].metrics.append(episode_metric)

        return test_results
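The bisect_right indexing above can be checked in isolation. This is a small worked example with a made-up task schedule, not part of the method:

# Worked example of the step-to-task mapping (hypothetical task boundaries).
import bisect

task_steps = [0, 1000, 2000]  # sorted task boundary steps, terminal entry removed
for step in (0, 999, 1000, 2500):
    task_id = bisect.bisect_right(task_steps, step) - 1
    print(step, "->", task_id)  # 0 -> 0, 999 -> 0, 1000 -> 1, 2500 -> 2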
Example #6
    def get_episode_loss(self, env_index: int, done: bool) -> Optional[Loss]:
        # IDEA: Actually, now that I think about it, instead of detaching the
        # tensors, we could instead use the critic's 'value' estimate and get a
        # loss for that incomplete episode using the tensors in the buffer,
        # rather than detaching them!

        if not done:
            return None

        # TODO: Add something like a 'num_steps_since_update' for each env? (it
        # would actually be a num_steps_since_backward)
        # if self.num_steps_since_update?
        n_stored_steps = self.num_stored_steps(env_index)
        if n_stored_steps < 5:
            # For now, we only give back a loss at the end of the episode.
            # TODO: Test if giving back a loss at each step or every few steps
            # would work better!
            logger.warning(
                RuntimeWarning(
                    f"Returning None as the episode loss, because we only have "
                    f"{n_stored_steps} steps stored for that environment."))
            return None

        inputs: Tensor
        actions: A2CHeadOutput
        rewards: Rewards
        inputs, actions, rewards = self.stack_buffers(env_index)
        logits: Tensor = actions.logits
        action_log_probs: Tensor = actions.action_log_prob
        values: Tensor = actions.value
        assert rewards.y is not None
        episode_rewards: Tensor = rewards.y

        # Target values are calculated backward.
        # It's super important to handle done states correctly: for those
        # cases we want our target to be equal to the reward only.
        episode_length = len(episode_rewards)
        dones = torch.zeros(episode_length, dtype=torch.bool)
        dones[-1] = bool(done)

        returns = self.get_returns(episode_rewards,
                                   gamma=self.hparams.gamma).type_as(values)
        advantages = returns - values

        # Normalize advantage (not present in the original implementation)
        if self.hparams.normalize_advantages:
            advantages = normalize(advantages)

        # Create the Loss to be returned.
        loss = Loss(self.name)

        # Policy gradient loss (actor loss)
        policy_gradient_loss = -(advantages.detach() * action_log_probs).mean()
        actor_loss = Loss("actor", policy_gradient_loss)
        loss += self.hparams.actor_loss_coef * actor_loss

        # Value loss: Try to get the critic's values close to the actual return,
        # which means the advantages should be close to zero.
        value_loss_tensor = F.mse_loss(values, returns.reshape(values.shape))
        critic_loss = Loss("critic", value_loss_tensor)
        loss += self.hparams.critic_loss_coef * critic_loss

        # Entropy loss, to "favor exploration".
        entropy_loss_tensor = -actions.action_dist.entropy().mean()
        entropy_loss = Loss("entropy", entropy_loss_tensor)
        loss += self.hparams.entropy_loss_coef * entropy_loss
        if done:
            episode_rewards_array = episode_rewards.reshape([-1])
            loss.metric = EpisodeMetrics(
                n_samples=1,
                mean_episode_reward=float(episode_rewards_array.sum()),
                mean_episode_length=len(episode_rewards_array),
            )
        loss.metrics["gradient_usage"] = self.get_gradient_usage_metrics(
            env_index)
        return loss
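The "calculated backward" comment can be made concrete with a small discounted-returns helper. This is a hypothetical sketch of what self.get_returns might look like; the actual implementation is not shown here:

# Hypothetical sketch of a discounted-returns helper, consistent with the
# comment above; NOT the actual self.get_returns implementation.
import torch
from torch import Tensor


def discounted_returns(rewards: Tensor, gamma: float = 0.99) -> Tensor:
    """Accumulate rewards backward; the terminal step's return is its reward."""
    returns = torch.zeros_like(rewards)
    running = torch.zeros((), dtype=rewards.dtype)
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# Example: discounted_returns(torch.tensor([1.0, 1.0, 1.0]), gamma=0.5)
# -> tensor([1.7500, 1.5000, 1.0000])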