def obs_to_tensor( self, observation: Union[np.ndarray, Dict[str, np.ndarray]] ) -> Tuple[th.Tensor, bool]: """ Convert an input observation to a PyTorch tensor that can be fed to a model. Includes sugar-coating to handle different observations (e.g. normalizing images). :param observation: the input observation :return: The observation as PyTorch tensor and whether the observation is vectorized or not """ vectorized_env = False if isinstance(observation, dict): # need to copy the dict as the dict in VecFrameStack will become a torch tensor observation = copy.deepcopy(observation) for key, obs in observation.items(): obs_space = self.observation_space.spaces[key] if is_image_space(obs_space): obs_ = maybe_transpose(obs, obs_space) else: obs_ = np.array(obs) vectorized_env = vectorized_env or is_vectorized_observation( obs_, obs_space) # Add batch dimension if needed if key != "scene_graph": observation[key] = obs_.reshape( (-1, ) + self.observation_space[key].shape) else: observation[key] = obs_ elif is_image_space(self.observation_space): # Handle the different cases for images # as PyTorch use channel first format observation = maybe_transpose(observation, self.observation_space) else: observation = np.array(observation) if not isinstance(observation, dict): # Dict obs need to be handled separately vectorized_env = is_vectorized_observation(observation, self.observation_space) # Add batch dimension if needed observation = observation.reshape((-1, ) + self.observation_space.shape) observation = obs_as_tensor(observation, self.device) return observation, vectorized_env
def predict( self, observation: Union[np.ndarray, Dict[str, np.ndarray]], state: Optional[np.ndarray] = None, mask: Optional[np.ndarray] = None, deterministic: bool = False, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Get the policy action and state from an observation (and optional state). Includes sugar-coating to handle different observations (e.g. normalizing images). :param observation: the input observation :param state: The last states (can be None, used in recurrent policies) :param mask: The last masks (can be None, used in recurrent policies) :param deterministic: Whether or not to return deterministic actions. :return: the model's action and the next state (used in recurrent policies) """ # TODO (GH/1): add support for RNN policies # if state is None: # state = self.initial_state # if mask is None: # mask = [False for _ in range(self.n_envs)] observation, vectorized_env = self.preprocesses_obs_for_model(observation) observation = obs_as_tensor(observation, self.device) with th.no_grad(): actions = self._predict(observation, deterministic=deterministic) # Convert to numpy actions = actions.cpu().numpy() if isinstance(self.action_space, gym.spaces.Box): if self.squash_output: # Rescale to proper domain when using squashing actions = self.unscale_action(actions) else: # Actions could be on arbitrary scale, so clip the actions to avoid # out of bound error (e.g. if sampling from a Gaussian distribution) actions = np.clip(actions, self.action_space.low, self.action_space.high) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions = actions[0] return actions, state
def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. :param obs: :param actions: :return: estimated value, log likelihood of taking those actions and entropy of the action distribution. """ obs, _ = self.preprocesses_obs_for_model(obs) obs = obs_as_tensor(obs, self.device) latent_pi, latent_vf, latent_sde = self._get_latent(obs) distribution = self._get_action_dist_from_latent(latent_pi, latent_sde) log_prob = distribution.log_prob(actions) values = self.value_net(latent_vf) return values, log_prob, distribution.entropy()
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int, ) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" # Switch to eval mode (this affects batch norm / dropout) self.policy.set_training_mode(False) n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(self._last_obs, self.device) actions, values, log_probs = self.policy(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) # Handle timeout by bootstraping with value function # see GitHub issue #633 for idx, done in enumerate(dones): if ( done and infos[idx].get("terminal_observation") is not None and infos[idx].get("TimeLimit.truncated", False) ): terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0] with th.no_grad(): terminal_value = self.policy.predict_values(terminal_obs)[0] rewards[idx] += self.gamma * terminal_value rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs) self._last_obs = new_obs self._last_episode_starts = dones with th.no_grad(): # Compute value for the last timestep values = self.policy.predict_values(obs_as_tensor(new_obs, self.device)) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
def predict( self, observation: Union[np.ndarray, Dict[str, np.ndarray]], state: Optional[np.ndarray] = None, mask: Optional[np.ndarray] = None, deterministic: bool = False, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Get the policy action and state from an observation (and optional state). Includes sugar-coating to handle different observations (e.g. normalizing images). :param observation: the input observation :param state: The last states (can be None, used in recurrent policies) :param mask: The last masks (can be None, used in recurrent policies) :param deterministic: Whether or not to return deterministic actions. :return: the model's action and the next state (used in recurrent policies) """ # TODO (GH/1): add support for RNN policies # if state is None: # state = self.initial_state # if mask is None: # mask = [False for _ in range(self.n_envs)] # Switch to eval mode (this affects batch norm / dropout) self.eval() vectorized_env = False if isinstance(observation, dict): # need to copy the dict as the dict in VecFrameStack will become a torch tensor observation = copy.deepcopy(observation) for key, obs in observation.items(): obs_space = self.observation_space.spaces[key] if is_image_space(obs_space): obs_ = maybe_transpose(obs, obs_space) else: obs_ = np.array(obs) vectorized_env = vectorized_env or is_vectorized_observation(obs_, obs_space) # Add batch dimension if needed observation[key] = obs_.reshape((-1,) + self.observation_space[key].shape) elif is_image_space(self.observation_space): # Handle the different cases for images # as PyTorch use channel first format observation = maybe_transpose(observation, self.observation_space) else: observation = np.array(observation) if not isinstance(observation, dict): # Dict obs need to be handled separately vectorized_env = is_vectorized_observation(observation, self.observation_space) # Add batch dimension if needed observation = observation.reshape((-1,) + self.observation_space.shape) observation = obs_as_tensor(observation, self.device) with th.no_grad(): actions = self._predict(observation, deterministic=deterministic) # Convert to numpy actions = actions.cpu().numpy() if isinstance(self.action_space, gym.spaces.Box): if self.squash_output: # Rescale to proper domain when using squashing actions = self.unscale_action(actions) else: # Actions could be on arbitrary scale, so clip the actions to avoid # out of bound error (e.g. if sampling from a Gaussian distribution) actions = np.clip(actions, self.action_space.low, self.action_space.high) if not vectorized_env: if state is not None: raise ValueError("Error: The environment must be vectorized when using recurrent policies.") actions = actions[0] return actions, state
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int, ) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps * self.outer_steps: # here n_rollout_steps is n_steps in PPO args. Noted by Chenyin # while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(self._last_obs, self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 # (1) if at the T-th step, the env is going to reset, so we shall store the terminal states in advance # (2) if done, new_obs is the new state after resetting the env, so we need to get terminal state from infos if n_steps % n_rollout_steps == 0 or dones.any(): # if dones.any(): # second case: do not reset the env when encountering step T terminal_obs = new_obs.copy() infos_array = np.array(infos) # change list to numpy array i = 0 for done in dones: if done: terminal_obs[i] = infos_array[i][ "terminal_observation"] i += 1 with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(terminal_obs, self.device) _, terminal_values, _ = self.policy.forward( obs_tensor) # in the infinite game, V(s_T) is defined else: # when dones = [False, ..., False] terminal_values = None if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs, terminal_values) # Chenyin if n_steps % n_rollout_steps == 0: self._last_obs = env.reset() self._last_episode_starts = np.ones((env.num_envs, ), dtype=bool) else: self._last_obs = new_obs self._last_episode_starts = dones # self._last_obs = new_obs # self._last_episode_starts = dones with th.no_grad(): # Compute value for the last timestep if n_steps % n_rollout_steps == 0 or dones.any(): # if dones.any(): # obs_tensor = obs_as_tensor(terminal_obs, self.device) # _, values, _ = self.policy.forward(obs_tensor) values = terminal_values assert values is not None else: obs_tensor = obs_as_tensor(new_obs, self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values) callback.on_rollout_end() return True