def _wrap_env(env: GymEnv, verbose: int = 0, monitor_wrapper: bool = True) -> VecEnv:
    """
    Wrap environment with the appropriate wrappers if needed.
    For instance, to have a vectorized environment
    or to re-order the image channels.

    :param env: The environment to wrap.
    :param verbose: Verbosity level (a value >= 1 prints wrapping info).
    :param monitor_wrapper: Whether to wrap the env in a ``Monitor`` when possible.
    :return: The wrapped environment.
    """
    if not isinstance(env, VecEnv):
        if not is_wrapped(env, Monitor) and monitor_wrapper:
            if verbose >= 1:
                print("Wrapping the env with a `Monitor` wrapper")
            env = Monitor(env)
        if verbose >= 1:
            print("Wrapping the env in a DummyVecEnv.")
        env = DummyVecEnv([lambda: env])

    if (
        is_image_space(env.observation_space)
        and not is_vecenv_wrapped(env, VecTransposeImage)
        and not is_image_space_channels_first(env.observation_space)
    ):
        if verbose >= 1:
            print("Wrapping the env in a VecTransposeImage.")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed when using HER
    if isinstance(env.observation_space, gym.spaces.dict.Dict):
        env = ObsDictWrapper(env)

    return env
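
def _example_wrap_env_usage() -> None:
    # Minimal usage sketch of `_wrap_env` above (illustrative only): a plain gym
    # env receives a `Monitor` and a `DummyVecEnv`, while an env that is already
    # vectorized is returned unchanged. The env id "CartPole-v1" is an assumption
    # used purely for demonstration.
    import gym

    env = gym.make("CartPole-v1")
    vec_env = _wrap_env(env, verbose=1)  # wrapped with Monitor + DummyVecEnv
    assert isinstance(vec_env, VecEnv)

    # A non-image, non-dict VecEnv passes through untouched
    assert _wrap_env(vec_env) is vec_env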
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs: Number of parallel environments to create.
    :param eval_env: Whether it is an environment used for evaluation or not.
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    # env = SubprocVecEnv([make_env(env_id, i, self.seed) for i in range(n_envs)])
    # On most envs, SubprocVecEnv does not help and is quite memory hungry
    env = make_vec_env(
        env_id=self.env_id,
        n_envs=n_envs,
        seed=self.seed,
        env_kwargs=self.env_kwargs,
        monitor_dir=log_dir,
        wrapper_class=self.env_wrapper,
        vec_env_cls=self.vec_env_class,
        vec_env_kwargs=self.vec_env_kwargs,
    )

    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id):
        self._log_success_rate(env)

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel-last to channel-first convention)
    if is_image_space(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed
    if self.algo == "her":
        if self.verbose > 0:
            print("Wrapping into a ObsDictWrapper")
        env = ObsDictWrapper(env)

    return env
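
def _example_make_vec_env_usage() -> None:
    # Minimal sketch of the pipeline `create_envs` above builds, stripped of the
    # experiment-manager plumbing and using the same helpers referenced there:
    # `make_vec_env` creates the (monitored) vectorized env, then optional frame
    # stacking and channel re-ordering are applied. The env id
    # "PongNoFrameskip-v4", n_envs=2 and n_stack=4 are assumptions for
    # illustration only.
    env = make_vec_env("PongNoFrameskip-v4", n_envs=2, seed=0)
    env = VecFrameStack(env, n_stack=4)
    if is_image_space(env.observation_space):
        env = VecTransposeImage(env)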
def predict(
    self,
    observation: np.ndarray,
    state: Optional[np.ndarray] = None,
    mask: Optional[np.ndarray] = None,
    deterministic: bool = False,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Overridden to create a proper Octree batch.

    Get the policy action and state from an observation (and optional state).

    :param observation: the input observation
    :param state: The last states (can be None, used in recurrent policies)
    :param mask: The last masks (can be None, used in recurrent policies)
    :param deterministic: Whether or not to return deterministic actions.
    :return: the model's action and the next state
        (used in recurrent policies)
    """
    if isinstance(observation, dict):
        observation = ObsDictWrapper.convert_dict(observation)
    else:
        observation = np.array(observation)

    vectorized_env = is_vectorized_observation(observation, self.observation_space)

    if self._debug_write_octree:
        ocnn.write_octree(th.from_numpy(observation[-1]), 'octree.octree')

    # Make batch out of tensor (consisting of n-stacked octrees)
    octree_batch = preprocess_stacked_octree_batch(
        observation, self.device, separate_batches=self._separate_networks_for_stacks
    )

    with th.no_grad():
        actions = self._predict(octree_batch, deterministic=deterministic)
    # Convert to numpy
    actions = actions.cpu().numpy()

    if isinstance(self.action_space, gym.spaces.Box):
        if self.squash_output:
            # Rescale to proper domain when using squashing
            actions = self.unscale_action(actions)
        else:
            # Actions could be on arbitrary scale, so clip the actions to avoid
            # out of bound error (e.g. if sampling from a Gaussian distribution)
            actions = np.clip(actions, self.action_space.low, self.action_space.high)

    if not vectorized_env:
        if state is not None:
            raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
        actions = actions[0]

    return actions, state
def predict(
    self,
    observation: np.ndarray,
    state: Optional[np.ndarray] = None,
    mask: Optional[np.ndarray] = None,
    deterministic: bool = False,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Get the policy action and state from an observation (and optional state).
    Includes sugar-coating to handle different observations (e.g. normalizing images).

    :param observation: the input observation
    :param state: The last states (can be None, used in recurrent policies)
    :param mask: The last masks (can be None, used in recurrent policies)
    :param deterministic: Whether or not to return deterministic actions.
    :return: the model's action and the next state
        (used in recurrent policies)
    """
    # TODO (GH/1): add support for RNN policies
    # if state is None:
    #     state = self.initial_state
    # if mask is None:
    #     mask = [False for _ in range(self.n_envs)]
    if isinstance(observation, dict):
        observation = ObsDictWrapper.convert_dict(observation)
    else:
        observation = np.array(observation)

    # Handle the different cases for images
    # as PyTorch uses the channel-first format
    observation = maybe_transpose(observation, self.observation_space)

    vectorized_env = is_vectorized_observation(observation, self.observation_space)

    observation = observation.reshape((-1,) + self.observation_space.shape)

    observation = th.as_tensor(observation).to(self.device)
    with th.no_grad():
        actions = self._predict(observation, deterministic=deterministic)
    # Convert to numpy
    actions = actions.cpu().numpy()

    if isinstance(self.action_space, gym.spaces.Box):
        if self.squash_output:
            # Rescale to proper domain when using squashing
            actions = self.unscale_action(actions)
        else:
            # Actions could be on arbitrary scale, so clip the actions to avoid
            # out of bound error (e.g. if sampling from a Gaussian distribution)
            actions = np.clip(actions, self.action_space.low, self.action_space.high)

    if not vectorized_env:
        if state is not None:
            raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
        actions = actions[0]

    return actions, state
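
def _example_predict_usage(model, vec_env, n_steps: int = 1000) -> None:
    # Minimal sketch of the typical evaluation loop around `predict` above
    # (illustrative only): `model` is assumed to be an already trained
    # Stable-Baselines3 model and `vec_env` its vectorized environment.
    obs = vec_env.reset()
    for _ in range(n_steps):
        # Deterministic actions for evaluation; `state` is only used by recurrent policies
        action, state = model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_env.step(action)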
def _wrap_env(env: GymEnv, verbose: int = 0) -> VecEnv:
    if not isinstance(env, VecEnv):
        if verbose >= 1:
            print("Wrapping the env in a DummyVecEnv.")
        env = DummyVecEnv([lambda: env])

    if is_image_space(env.observation_space) and not is_wrapped(env, VecTransposeImage):
        if verbose >= 1:
            print("Wrapping the env in a VecTransposeImage.")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed when using HER
    if isinstance(env.observation_space, gym.spaces.dict.Dict):
        env = ObsDictWrapper(env)

    return env
def test_eval_success_logging(tmp_path):
    n_bits = 2
    env = BitFlippingEnv(n_bits=n_bits)
    eval_env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=n_bits)])
    eval_callback = EvalCallback(
        ObsDictWrapper(eval_env),
        eval_freq=250,
        log_path=tmp_path,
        warn=False,
    )
    model = HER("MlpPolicy", env, DQN, learning_starts=100, seed=0, max_episode_length=n_bits)
    model.learn(500, callback=eval_callback)
    assert len(eval_callback._is_success_buffer) > 0
    # More than 50% success rate
    assert np.mean(eval_callback._is_success_buffer) > 0.5
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    """
    Test if 'save' and 'load' saves and loads model correctly
    """
    if use_sde and model_class != SAC:
        pytest.skip("Only SAC has gSDE support")

    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    kwargs = dict(use_sde=True) if use_sde else {}

    # Create model
    model = HER(
        "MlpPolicy",
        env,
        model_class,
        n_sampled_goal=5,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        verbose=0,
        tau=0.05,
        batch_size=128,
        learning_rate=0.001,
        policy_kwargs=dict(net_arch=[64]),
        buffer_size=int(1e6),
        gamma=0.98,
        gradient_steps=1,
        train_freq=4,
        learning_starts=100,
        max_episode_length=n_bits,
        **kwargs,
    )

    model.learn(total_timesteps=300)

    env.reset()

    observations_list = []
    for _ in range(10):
        obs = env.step(env.action_space.sample())[0]
        observation = ObsDictWrapper.convert_dict(obs)
        observations_list.append(observation)
    observations = np.array(observations_list)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # Get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Test custom_objects
    # Load with custom objects
    custom_objects = dict(learning_rate=2e-5, dummy=1.0)
    model_ = HER.load(str(tmp_path / "test_save.zip"), env=env, custom_objects=custom_objects, verbose=2)
    assert model_.verbose == 2
    # Check that the custom object was taken into account
    assert model_.learning_rate == custom_objects["learning_rate"]
    # Check that only parameters that are here already are replaced
    assert not hasattr(model_, "dummy")

    model = HER.load(str(tmp_path / "test_save.zip"), env=env)

    # Check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."

    # Check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # Check if learn still works
    model.learn(total_timesteps=300)

    # Test that the change of parameters works
    model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0)
    assert model.model.learning_rate == 2.0
    assert model.verbose == 3

    # Clear file from os
    os.remove(tmp_path / "test_save.zip")
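
def _example_convert_dict() -> None:
    # Minimal sketch of what `ObsDictWrapper.convert_dict` is used for in the
    # tests above: flattening a GoalEnv dict observation by concatenating the
    # observation with the (desired) goal, as the comments elsewhere in this
    # code indicate. The array values below are made up for illustration.
    obs = {
        "observation": np.array([0.0, 1.0]),
        "achieved_goal": np.array([0.0, 0.0]),
        "desired_goal": np.array([1.0, 1.0]),
    }
    flat = ObsDictWrapper.convert_dict(obs)
    # observation (2,) + desired_goal (2,) -> flat vector of shape (4,)
    assert flat.shape == (4,)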
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs: Number of parallel environments to create.
    :param eval_env: Whether it is an environment used for evaluation or not.
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    monitor_kwargs = {}
    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id) or "parking-v0" in self.env_id:
        monitor_kwargs = dict(info_keywords=("is_success",))

    # Note: made custom to support Gazebo Runtime wrapping
    def make_env():
        def _init():
            env = self.env_wrapper(env=self.env_id, **self.env_kwargs)
            env.seed(self.seed)
            env.action_space.seed(self.seed)
            monitor_path = log_dir if log_dir is not None else None
            if monitor_path is not None:
                os.makedirs(log_dir, exist_ok=True)
            env = Monitor(env, filename=monitor_path, **monitor_kwargs)
            return env

        return _init

    if self.vec_env_class is None:
        self.vec_env_class = DummyVecEnv

    env = self.vec_env_class([make_env()], **self.vec_env_kwargs)

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel-last to channel-first convention)
    if is_image_space(env.observation_space) and not is_image_space_channels_first(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed
    if self.algo == "her":
        if self.verbose > 0:
            print("Wrapping into a ObsDictWrapper")
        env = ObsDictWrapper(env)

    return env
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    train_freq: TrainFreq,
    action_noise: Optional[ActionNoise] = None,
    learning_starts: int = 0,
    log_interval: Optional[int] = None,
) -> RolloutReturn:
    """
    Collect experiences and store them into a ``ReplayBuffer``.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param train_freq: How much experience to collect
        by doing rollouts of the current policy.
        Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)``
        or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)``
        with ``<n>`` being an integer greater than 0.
    :param action_noise: Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: Number of steps before learning for the warm-up phase.
    :param log_interval: Log data every ``log_interval`` episodes
    :return: A ``RolloutReturn`` with the mean reward and the number of collected steps and episodes.
    """
    episode_rewards, total_timesteps = [], []
    num_collected_steps, num_collected_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"
    assert train_freq.frequency > 0, "Should at least collect one step or episode."

    if self.model.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            # Concatenate observation and (desired) goal
            observation = self._last_obs
            self._last_obs = ObsDictWrapper.convert_dict(observation)

            if (
                self.model.use_sde
                and self.model.sde_sample_freq > 0
                and num_collected_steps % self.model.sde_sample_freq == 0
            ):
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            self.model._last_obs = self._last_obs
            action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Perform action
            new_obs, reward, done, infos = env.step(action)

            self.num_timesteps += 1
            self.model.num_timesteps = self.num_timesteps
            episode_timesteps += 1
            num_collected_steps += 1

            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)
            self.model.ep_info_buffer = self.ep_info_buffer
            self.model.ep_success_buffer = self.ep_success_buffer

            # == Store transition in the replay buffer and/or in the episode storage ==

            if self._vec_normalize_env is not None:
                # Store only the unnormalized version
                new_obs_ = self._vec_normalize_env.get_original_obs()
                reward_ = self._vec_normalize_env.get_original_reward()
            else:
                # Avoid changing the original ones
                self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward
                self.model._last_original_obs = self._last_original_obs

            # As the VecEnv resets automatically, new_obs is already the
            # first observation of the next episode
            if done and infos[0].get("terminal_observation") is not None:
                next_obs = infos[0]["terminal_observation"]
                # VecNormalize normalizes the terminal observation
                if self._vec_normalize_env is not None:
                    next_obs = self._vec_normalize_env.unnormalize_obs(next_obs)
            else:
                next_obs = new_obs_

            if self.online_sampling:
                self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos)
            else:
                # Concatenate observation with (desired) goal
                flattened_obs = ObsDictWrapper.convert_dict(self._last_original_obs)
                flattened_next_obs = ObsDictWrapper.convert_dict(next_obs)
                # Add to replay buffer
                self.replay_buffer.add(flattened_obs, flattened_next_obs, buffer_action, reward_, done)
                # Add current transition to episode storage
                self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos)

            self._last_obs = new_obs
            self.model._last_obs = self._last_obs

            # Save the unnormalized new observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_
                self.model._last_original_obs = self._last_original_obs

            self.model._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule.
            # For SAC/TD3, the update is done at the same time as the gradient update,
            # see https://github.com/hill-a/stable-baselines/issues/900
            self.model._on_step()

            self.episode_steps += 1

            if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
                break

        if done or self.episode_steps >= self.max_episode_length:
            if self.online_sampling:
                self.replay_buffer.store_episode()
            else:
                self._episode_storage.store_episode()
                # Sample virtual transitions and store them in the replay buffer
                self._sample_her_transitions()
                # Clear storage for current episode
                self._episode_storage.reset()

            num_collected_episodes += 1
            self._episode_num += 1
            self.model._episode_num = self._episode_num
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

            self.episode_steps = 0

    mean_reward = np.mean(episode_rewards) if num_collected_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
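
def _example_train_freq_usage() -> None:
    # Minimal sketch of the `train_freq` argument used by `collect_rollouts`
    # above (illustrative only): collection continues while
    # `should_collect_more_steps` is True for the chosen unit. The concrete
    # frequencies (4 steps, 1 episode) are assumptions for demonstration.
    collect_four_steps = TrainFreq(4, TrainFrequencyUnit.STEP)
    collect_one_episode = TrainFreq(1, TrainFrequencyUnit.EPISODE)

    # After 4 environment steps, no more steps should be collected
    assert not should_collect_more_steps(collect_four_steps, num_collected_steps=4, num_collected_episodes=0)
    # After 1 full episode, no more episodes should be collected
    assert not should_collect_more_steps(collect_one_episode, num_collected_steps=10, num_collected_episodes=1)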
def _sample_transitions(
    self,
    batch_size: Optional[int],
    maybe_vec_env: Optional[VecNormalize],
    online_sampling: bool,
    n_sampled_goal: Optional[int] = None,
) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]:
    """
    :param batch_size: Number of elements to sample (only used for online sampling)
    :param maybe_vec_env: Associated gym VecEnv to normalize the observations/rewards.
        Only valid when using online sampling
    :param online_sampling: Using online_sampling for HER or not.
    :param n_sampled_goal: Number of sampled goals for replay (offline sampling).
    :return: Samples.
    """
    # Select which episodes to use
    if online_sampling:
        assert batch_size is not None, "No batch_size specified for online sampling of HER transitions"
        # Do not sample the episode with index `self.pos` as the episode is invalid
        if self.full:
            episode_indices = (
                np.random.randint(1, self.n_episodes_stored, batch_size) + self.pos
            ) % self.n_episodes_stored
        else:
            episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size)
        # A subset of the transitions will be relabeled using the HER algorithm
        her_indices = np.arange(batch_size)[: int(self.her_ratio * batch_size)]
    else:
        assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer"
        assert n_sampled_goal is not None, "No n_sampled_goal specified for offline sampling of HER transitions"

        # Offline sampling: there is only one episode stored
        episode_length = self.episode_lengths[0]
        # We sample n_sampled_goal per timestep in the episode (only one is stored).
        episode_indices = np.tile(0, (episode_length * n_sampled_goal))
        # We only sample virtual transitions
        # as real transitions are already stored in the replay buffer
        her_indices = np.arange(len(episode_indices))

    ep_lengths = self.episode_lengths[episode_indices]

    # Special case when using the "future" goal sampling strategy:
    # we cannot sample all transitions, we have to remove the last timestep
    if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
        # Restrict the sampling domain when ep_lengths > 1,
        # otherwise filter out the indices
        her_indices = her_indices[ep_lengths[her_indices] > 1]
        ep_lengths[her_indices] -= 1

    if online_sampling:
        # Select which transitions to use
        transitions_indices = np.random.randint(ep_lengths)
    else:
        if her_indices.size == 0:
            # Episode of one timestep, not enough for using the "future" strategy:
            # no virtual transitions are created in that case
            return np.zeros(0), np.zeros(0), np.zeros(0), np.zeros(0)
        else:
            # Repeat every transition index n_sampled_goal times
            # to sample n_sampled_goal per timestep in the episode (only one is stored),
            # now with the corrected episode length when using the "future" strategy
            transitions_indices = np.tile(np.arange(ep_lengths[0]), n_sampled_goal)
            episode_indices = episode_indices[transitions_indices]
            her_indices = np.arange(len(episode_indices))

    # Get selected transitions
    transitions = {key: self.buffer[key][episode_indices, transitions_indices].copy() for key in self.buffer.keys()}

    # Sample new desired goals and relabel the transitions
    new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices)
    transitions["desired_goal"][her_indices] = new_goals

    # Convert info buffer to numpy array
    transitions["info"] = np.array(
        [
            self.info_buffer[episode_idx][transition_idx]
            for episode_idx, transition_idx in zip(episode_indices, transitions_indices)
        ]
    )

    # Vectorized computation of the new reward
    transitions["reward"][her_indices, 0] = self.env.env_method(
        "compute_reward",
        # The new state depends on the previous state and action
        # s_{t+1} = f(s_t, a_t),
        # so the next_achieved_goal depends also on the previous state and action.
        # Because we are in a GoalEnv:
        # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal),
        # therefore we have to use "next_achieved_goal" and not "achieved_goal"
        transitions["next_achieved_goal"][her_indices, 0],
        # Here we use the new desired goal
        transitions["desired_goal"][her_indices, 0],
        transitions["info"][her_indices, 0],
    )

    # Concatenate observation with (desired) goal
    observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env))
    # HACK to make normalize obs work with the next observation
    transitions["observation"] = transitions["next_obs"]
    next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env))

    if online_sampling:
        data = (
            observations[:, 0],
            transitions["action"],
            next_observations[:, 0],
            transitions["done"],
            self._normalize_reward(transitions["reward"], maybe_vec_env),
        )

        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
    else:
        return observations, next_observations, transitions["action"], transitions["reward"]
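
def _example_her_ratio() -> None:
    # Minimal sketch of how `self.her_ratio` (used in `_sample_transitions`
    # above for online sampling) relates to `n_sampled_goal`, assuming the
    # common convention her_ratio = 1 - 1 / (n_sampled_goal + 1); the concrete
    # numbers below are for illustration only.
    n_sampled_goal = 4
    her_ratio = 1 - 1.0 / (n_sampled_goal + 1)
    assert abs(her_ratio - 0.8) < 1e-8

    # With a batch of 256, the first int(0.8 * 256) = 204 sampled transitions
    # would be relabeled with new (virtual) goals
    batch_size = 256
    n_relabeled = int(her_ratio * batch_size)
    assert n_relabeled == 204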