def test_measure_RL_performance_batched_env():
    batch_size = 3
    start = [0 for i in range(batch_size)]
    target = 5
    env = EnvDataset(
        SyncVectorEnv([
            partial(DummyEnvironment, start=start[i], target=target, max_value=target * 2)
            for i in range(batch_size)
        ])
    )
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []

    for step, obs in enumerate(itertools.islice(env, 100)):
        print(f"step {step} obs: {obs}")
        action = np.ones(batch_size)
        reward = env.send(action)
        # print(obs, reward, done, info)

    from collections import defaultdict

    from sequoia.common.metrics import Metrics

    expected_metrics = defaultdict(Metrics)
    for i in range(101):
        for env_index in range(batch_size):
            if i and i % target == 0:
                expected_metrics[i] += EpisodeMetrics(
                    n_samples=1,
                    mean_episode_reward=10.,  # ? FIXME: Actually understand this condition
                    mean_episode_length=target,
                )

            # FIXME: This test is a bit too complicated, hard to follow. I'll keep the
            # batches synced-up for now.
            # if i > 0 and (i + env_index) % target == 0:
            #     expected_metrics[i] += EpisodeMetrics(
            #         n_samples=1,
            #         mean_episode_reward=sum(target - (i + env_index % target) for j in range(start[env_index], target)),
            #         mean_episode_length=target - start[env_index] - 1,
            #     )

    assert env.get_online_performance() == expected_metrics
def get_metrics(self,
                action: Union[Actions, Any],
                reward: Union[Rewards, Any],
                done: Union[bool, Sequence[bool]]) -> Optional[EpisodeMetrics]:
    metrics = []

    rewards = reward.y if isinstance(reward, Rewards) else reward
    actions = action.y_pred if isinstance(action, Actions) else action

    dones: Sequence[bool]
    if not self.is_batched_env:
        rewards = [rewards]
        actions = [actions]
        assert isinstance(done, bool)
        dones = [done]
    else:
        assert isinstance(done, (np.ndarray, Tensor))
        dones = done

    for env_index, (env_is_done, reward) in enumerate(zip(dones, rewards)):
        if env_is_done:
            metrics.append(
                EpisodeMetrics(
                    n_samples=1,
                    # The average reward per episode.
                    mean_episode_reward=self._current_episode_reward[env_index],
                    # The average length of each episode.
                    mean_episode_length=self._current_episode_steps[env_index],
                )
            )
            self._current_episode_reward[env_index] = 0
            self._current_episode_steps[env_index] = 0

    if not metrics:
        return None

    metric = sum(metrics, Metrics())
    if wandb.run:
        log_dict = metric.to_log_dict()
        if self.wandb_prefix:
            log_dict = add_prefix(log_dict, prefix=self.wandb_prefix, sep="/")
        log_dict["steps"] = self._steps
        log_dict["episode"] = self._episodes
        wandb.log(log_dict)

    return metric
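# Illustrative sketch (not part of the wrapper above): `sum(metrics, Metrics())` assumes that
# adding two EpisodeMetrics objects yields a sample-weighted aggregate. The stand-in class
# below is hypothetical and only shows the assumed semantics of that addition.
from dataclasses import dataclass


@dataclass
class _EpisodeMetricsSketch:
    n_samples: int = 0
    mean_episode_reward: float = 0.0
    mean_episode_length: float = 0.0

    def __add__(self, other: "_EpisodeMetricsSketch") -> "_EpisodeMetricsSketch":
        # Weight each mean by the number of episodes it summarizes.
        total = self.n_samples + other.n_samples
        if total == 0:
            return _EpisodeMetricsSketch()
        return _EpisodeMetricsSketch(
            n_samples=total,
            mean_episode_reward=(self.n_samples * self.mean_episode_reward
                                 + other.n_samples * other.mean_episode_reward) / total,
            mean_episode_length=(self.n_samples * self.mean_episode_length
                                 + other.n_samples * other.mean_episode_length) / total,
        )


# Example: two single-episode metrics combine into one two-episode average.
assert (_EpisodeMetricsSketch(1, 10.0, 5) + _EpisodeMetricsSketch(1, 20.0, 7)
        == _EpisodeMetricsSketch(2, 15.0, 6.0))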
def test_measure_RL_performance_basics():
    env = DummyEnvironment(start=0, target=5, max_value=10)

    from sequoia.settings.active.continual.continual_rl_setting import ContinualRLSetting
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0

        obs = env.reset()
        print(f"Episode {episode}, obs: {obs}")
        done = False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics
def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)

    from gym.wrappers import TimeLimit

    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics
def get_results(self) -> TaskSequenceResults[EpisodeMetrics]:
    # TODO: Place the metrics in the right 'bin' at the end of each episode during
    # testing depending on the task at that time, rather than what's happening here,
    # where we get all the rewards and episode lengths at the end and then sort them
    # into bins based on the task schedule. ALSO: this would make it easier to
    # support monitoring batched RL environments, since these `Monitor` methods
    # (get_episode_rewards, get_episode_lengths, etc.) assume the environment isn't
    # batched.
    rewards = self.get_episode_rewards()
    lengths = self.get_episode_lengths()

    task_schedule: Dict[int, Dict] = self.task_schedule
    task_steps = sorted(task_schedule.keys())
    # TODO: Removing the last entry since it's the terminal state.
    task_steps.pop(-1)
    assert 0 in task_steps

    import bisect

    nb_tasks = len(task_steps)
    assert nb_tasks >= 1

    test_results = TaskSequenceResults([TaskResults() for _ in range(nb_tasks)])
    # TODO: Fix this, since the task id might not be related to the steps!
    for step, episode_reward, episode_length in zip(
        itertools.accumulate(lengths), rewards, lengths
    ):
        # Given the step, find the task id.
        task_id = bisect.bisect_right(task_steps, step) - 1
        episode_metric = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_length,
        )
        test_results.task_results[task_id].metrics.append(episode_metric)

    return test_results
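# Illustrative sketch (not part of the method above): how `bisect_right(task_steps, step) - 1`
# assigns an episode's cumulative step count to a task, assuming `task_steps` holds the step
# at which each task begins. The boundary values below are hypothetical.
import bisect

task_steps = [0, 1000, 2000]  # hypothetical start steps for tasks 0, 1 and 2
for step in (0, 999, 1000, 1500, 2500):
    task_id = bisect.bisect_right(task_steps, step) - 1
    print(f"step {step:>4} -> task {task_id}")
# step    0 -> task 0
# step  999 -> task 0
# step 1000 -> task 1
# step 1500 -> task 1
# step 2500 -> task 2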
def get_episode_loss(self, env_index: int, done: bool) -> Optional[Loss]:
    # IDEA: Actually, now that I think about it, instead of detaching the
    # tensors, we could instead use the critic's 'value' estimate and get a
    # loss for that incomplete episode using the tensors in the buffer,
    # rather than detaching them!
    if not done:
        return None

    # TODO: Add something like a 'num_steps_since_update' for each env? (it
    # would actually be a num_steps_since_backward)
    # if self.num_steps_since_update?
    n_stored_steps = self.num_stored_steps(env_index)
    if n_stored_steps < 5:
        # For now, we only give back a loss at the end of the episode.
        # TODO: Test if giving back a loss at each step or every few steps
        # would work better!
        logger.warning(
            RuntimeWarning(
                f"Returning None as the episode loss, because we only have "
                f"{n_stored_steps} steps stored for that environment."))
        return None

    inputs: Tensor
    actions: A2CHeadOutput
    rewards: Rewards
    inputs, actions, rewards = self.stack_buffers(env_index)

    logits: Tensor = actions.logits
    action_log_probs: Tensor = actions.action_log_prob
    values: Tensor = actions.value
    assert rewards.y is not None
    episode_rewards: Tensor = rewards.y

    # Target values are calculated backwards.
    # It's super important to correctly handle 'done' states: for those cases,
    # we want our target to be equal to the reward only.
    episode_length = len(episode_rewards)
    dones = torch.zeros(episode_length, dtype=torch.bool)
    dones[-1] = bool(done)

    returns = self.get_returns(episode_rewards, gamma=self.hparams.gamma).type_as(values)
    advantages = returns - values

    # Normalize the advantages (not present in the original implementation).
    if self.hparams.normalize_advantages:
        advantages = normalize(advantages)

    # Create the Loss to be returned.
    loss = Loss(self.name)

    # Policy gradient loss (actor loss).
    policy_gradient_loss = -(advantages.detach() * action_log_probs).mean()
    actor_loss = Loss("actor", policy_gradient_loss)
    loss += self.hparams.actor_loss_coef * actor_loss

    # Value loss: Try to get the critic's values close to the actual returns,
    # which means the advantages should be close to zero.
    value_loss_tensor = F.mse_loss(values, returns.reshape(values.shape))
    critic_loss = Loss("critic", value_loss_tensor)
    loss += self.hparams.critic_loss_coef * critic_loss

    # Entropy loss, to "favor exploration".
    entropy_loss_tensor = -actions.action_dist.entropy().mean()
    entropy_loss = Loss("entropy", entropy_loss_tensor)
    loss += self.hparams.entropy_loss_coef * entropy_loss

    if done:
        episode_rewards_array = episode_rewards.reshape([-1])
        loss.metric = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=float(episode_rewards_array.sum()),
            mean_episode_length=len(episode_rewards_array),
        )

    loss.metrics["gradient_usage"] = self.get_gradient_usage_metrics(env_index)
    return loss
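# Illustrative sketch (not part of the method above): `self.get_returns(episode_rewards, gamma=...)`
# is assumed to compute standard discounted returns, accumulated backwards over the episode.
# `_discounted_returns_sketch` is a hypothetical stand-in under that assumption.
import torch
from torch import Tensor


def _discounted_returns_sketch(rewards: Tensor, gamma: float) -> Tensor:
    """Return G_t = r_t + gamma * G_{t+1}, computed from the last step backwards."""
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# Example: with gamma=0.9 and rewards [1, 1, 1], the returns are [2.71, 1.9, 1.0].
# print(_discounted_returns_sketch(torch.tensor([1.0, 1.0, 1.0]), gamma=0.9))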