def collect_rollouts(self, env: VecEnv,
                     callback: BaseCallback,
                     rollout_buffer: GraphRolloutBuffer,
                     n_rollout_steps: int) -> bool:
    """
    Collect rollouts using the current policy and fill a `RolloutBuffer`.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts
    :param n_rollout_steps: (int) Number of experiences to collect per environment
    :return: (bool) True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        with th.no_grad():
            # Convert to pytorch tensor
            # EDIT: obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            obs_tensor = to_batch(obs_to_graph(self._last_obs))
            actions, values, log_probs = self.policy.forward(obs_tensor)
        # TODO: May need to edit to support nodes.
        actions = [action.cpu().numpy() for action in actions]

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = [
                np.clip(action, self.action_space.low, self.action_space.high)
                for action in actions
            ]

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1
        self.num_timesteps += env.num_envs

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs)
        self._last_obs = new_obs

    rollout_buffer.compute_returns_and_advantage(values, dones=dones)

    callback.on_rollout_end()

    return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int = 256) -> bool: assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 self.num_timesteps += env.num_envs if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs) self._last_obs = new_obs rollout_buffer.compute_returns_and_advantage(values, dones=dones) callback.on_rollout_end() return True
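# A minimal sketch of the generalized advantage estimation that
# `rollout_buffer.compute_returns_and_advantage` typically performs for this kind of
# on-policy buffer. It is an illustration only, not the buffer's actual code; the
# array shapes (n_steps, n_envs) and the gamma / gae_lambda defaults are assumptions.
import numpy as np

def compute_gae(rewards, values, dones, last_values, gamma=0.99, gae_lambda=0.95):
    # rewards, values, dones: arrays of shape (n_steps, n_envs);
    # dones[t] == 1 means the episode ended after step t.
    # last_values: value estimate for the observation after the final step, shape (n_envs,).
    n_steps = len(rewards)
    advantages = np.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(n_steps)):
        next_values = last_values if t == n_steps - 1 else values[t + 1]
        next_non_terminal = 1.0 - dones[t]
        # TD error, with the bootstrap term masked out at episode boundaries
        delta = rewards[t] + gamma * next_values * next_non_terminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values
    return advantages, returns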
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: TrajRolloutBuffer, n_rollout_steps: int) -> bool: """ Collect rollouts using the current policy and fill a `RolloutBuffer`. :param env: (VecEnv) The training environment :param callback: (BaseCallback) Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts :param n_steps: (int) Number of experiences to collect per environment :return: (bool) True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() # while n_steps < n_rollout_steps: while not rollout_buffer.full: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_ctx_tensor = th.as_tensor(self._last_obs).to( self.device) # (num_agents,) + (obs_dim,) actions, values, log_probs = self.policy.forward( obs_ctx_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) # action_dict = zip(enumerate(clipped_actions)) new_obs, rewards, dones, infos = env.step( clipped_actions) # env step takes np.array, returns np.array? # TODO: figure out where to put this reset: maybe in an env wrapper like they did if dones[self.env.num_agents] == 1: env.reset() if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 # self.num_timesteps += env.num_envs self.num_timesteps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) _obs = self._last_obs[..., 0:self.env.obs_size] _ctx = self._last_obs[..., self.env.obs_size:] # TODO: need to fix in the case of new number of agents, since range(len(last_obs)) will be incorrect for i in range(len(self._last_obs)): rollout_buffer.add(agent_id=i, context=_ctx[i], done=self._last_dones[i], obs=_obs[i], action=actions[i], reward=rewards[i], value=values[i], log_prob=log_probs[i]) # for i, state_ctx_pair in enumerate(zip(*_obs, _ctx)): # print(state_ctx_pair) # rollout_buffer.add(agent_id=i, # context=state_ctx_pair[-1], # done=self._last_dones[i], # obs=state_ctx_pair[0:-1], # action=actions[i], # reward=rewards[i], # value=values[i], # log_prob=log_probs[i]) # TODO: needs modification! # rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones rollout_buffer.compute_returns_and_advantage(values) callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int = 256) -> bool: assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 self.num_timesteps += env.num_envs if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs) self._last_obs = new_obs rollout_buffer.compute_returns_and_advantage(values, dones=dones) # # MSA debugging learning # try: # import copy # c_rb = copy.copy (rollout_buffer) # self.rollout_buffer_hist.append(c_rb) # except: # pass # if len(self.rollout_buffer_hist) == 25: # import matplotlib.pyplot as plt # n_envs = 4 # V = np.empty((0,n_envs), float) # A = np.empty((0,n_envs), float) # R = np.empty((0,n_envs), float) # lp = np.empty((0,n_envs), float) # r = np.empty((0,n_envs), float) # a = np.empty((0,n_envs, actions.shape[1]), float) # S = np.empty((0,n_envs, new_obs.shape[1]), float) # for rb in self.rollout_buffer_hist: # V = np.append (V, rb.values, axis=0) # A = np.append (A, rb.advantages, axis=0) # R = np.append (R, rb.returns, axis=0) # lp = np.append (lp, rb.log_probs, axis=0) # r = np.append (r, rb.rewards, axis=0) # a = np.append (a, rb.actions, axis=0) # S = np.append (S, rb.observations, axis=0) # plt.plot (V) # plt.title ('Values') # dir_no = "2" # filename = "RL_detailed_plots/"+ dir_no + "/V.png" # plt.savefig(filename) # plt.close () # # plt.plot (A) # plt.title ('Advantages') # filename = "RL_detailed_plots/"+ dir_no + "/A.png" # plt.savefig(filename) # plt.close () # # plt.plot (R) # plt.title ('Returns') # filename = "RL_detailed_plots/"+ dir_no + "/R.png" # plt.savefig(filename) # plt.close () # # plt.plot (lp) # plt.title ('Log Probs') # filename = "RL_detailed_plots/"+ dir_no + "/lp.png" # plt.savefig(filename) # plt.close () # # plt.plot (r) # plt.title ('rewards') # filename = "RL_detailed_plots/"+ dir_no + "/rew.png" # plt.savefig(filename) # plt.close () # # try: # fig, axes = plt.subplots (nrows=actions.shape[1], ncols=1, figsize=(8, actions.shape[1])) # for i in range (actions.shape[1]): # axes[i].plot (a[:, :, i]) # plt.suptitle ('Actions', y=1) # filename = "RL_detailed_plots/" + dir_no + "/act.png" # plt.savefig (filename) # plt.close() # except: # plt.plot (a[:, :, 0]) # plt.title ('Actions') # filename = "RL_detailed_plots/" + dir_no + "/act.png" # plt.savefig (filename) # plt.close() # # fig, axes = plt.subplots (nrows= new_obs.shape[1], ncols=1, figsize=(8, 2*new_obs.shape[1])) # for i in 
range ( new_obs.shape[1]): # axes[i].plot (S[:, :, i]) # axes[i].plot (S[:, :, i]) # plt.suptitle ('States', y=1) # filename = "RL_detailed_plots/" + dir_no + "/S.png" # plt.savefig (filename) # plt.close() callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool: """ Collect rollouts using the current policy and fill a `RolloutBuffer`. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: self.policy.set_robot_id( self.policy.all_robot_ids ) # reset robot id before collecting rollouts if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones with th.no_grad(): # Compute value for the last timestep obs_tensor = th.as_tensor(new_obs).to(self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
        policy (Callable, BasePolicy, or BaseAlgorithm): A function mapping
            observation to action, a stable_baselines3 policy, or an algorithm
            trained on the gym environment.
        venv: The vectorized environments to interact with.
        sample_until: A function determining the termination condition.
            It takes a sequence of trajectories and returns a bool.
            Most users will want to use one of `min_episodes` or `min_timesteps`.
        deterministic_policy: If True, asks policy to deterministically return action.
            Note the trajectories might still be non-deterministic if the environment
            has non-determinism!
        rng: used for shuffling trajectories.

    Returns:
        Sequence of trajectories, satisfying `sample_until`. Additional trajectories
        may be collected to avoid biasing the process towards short episodes; the
        user should truncate if required.
    """
    if isinstance(policy, BaseAlgorithm):
        policy.set_env(venv)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    active = np.ones(venv.num_envs, dtype=bool)
    while np.any(active):
        if isinstance(policy, Callable):
            acts = policy(obs)
        else:
            acts, _ = policy.predict(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any
            # environments where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callees end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1, ) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps, ) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps, )
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
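# Hypothetical usage sketch for `generate_trajectories` above. The names `venv`
# (a VecEnv) and `algo` (a trained stable-baselines3 algorithm) are assumed to be
# defined elsewhere; the termination predicate only follows the documented contract
# (takes a sequence of trajectories, returns a bool).
def at_least_n_episodes(n: int):
    def sample_until(trajectories) -> bool:
        return len(trajectories) >= n
    return sample_until

# trajs = generate_trajectories(algo, venv,
#                               sample_until=at_least_n_episodes(10),
#                               deterministic_policy=True)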
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int,
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    # Switch to eval mode (this affects batch norm / dropout)
    self.policy.set_training_mode(False)

    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor or to TensorDict
            obs_tensor = obs_as_tensor(self._last_obs, self.device)
            actions, values, log_probs = self.policy(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)

        # Handle timeout by bootstrapping with value function
        # see GitHub issue #633
        for idx, done in enumerate(dones):
            if (
                done
                and infos[idx].get("terminal_observation") is not None
                and infos[idx].get("TimeLimit.truncated", False)
            ):
                terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
                with th.no_grad():
                    terminal_value = self.policy.predict_values(terminal_obs)[0]
                rewards[idx] += self.gamma * terminal_value

        rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs)
        self._last_obs = new_obs
        self._last_episode_starts = dones

    with th.no_grad():
        # Compute value for the last timestep
        values = self.policy.predict_values(obs_as_tensor(new_obs, self.device))

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()

    return True
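# Standalone illustration of the time-limit bootstrap applied above: when an episode
# is truncated by a TimeLimit wrapper rather than genuinely terminating, the missing
# future return is approximated by gamma * V(terminal_obs) and folded into the reward.
# The names here are placeholders for this sketch, not attributes of the algorithm class.
def bootstrap_truncated_reward(reward: float, terminal_value: float, gamma: float,
                               truncated: bool) -> float:
    # Only truncated episodes receive the correction; true terminal states keep the raw reward.
    return reward + gamma * terminal_value if truncated else reward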
def collect_rollouts(self, env: VecEnv,
                     callback: BaseCallback,
                     n_episodes: int = 1,
                     n_steps: int = -1,
                     action_noise: Optional[ActionNoise] = None,
                     learning_starts: int = 0,
                     replay_buffer: Optional[ReplayBuffer] = None,
                     log_interval: Optional[int] = None) -> RolloutReturn:
    """
    Collect experiences and store them into a ReplayBuffer.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param n_episodes: (int) Number of episodes to use to collect rollout data.
        You can also specify ``n_steps`` instead.
    :param n_steps: (int) Number of steps to use to collect rollout data.
        You can also specify ``n_episodes`` instead.
    :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: (int) Number of steps before learning for the warm-up phase.
    :param replay_buffer: (ReplayBuffer)
    :param log_interval: (int) Log data every ``log_interval`` episodes
    :return: (RolloutReturn)
    """
    episode_rewards, total_timesteps = [], []
    total_steps, total_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while total_steps < n_steps or total_episodes < n_episodes:
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            action, buffer_action, is_random_action = self._sample_action(learning_starts, action_noise)
            # print("action/buffer action/israndomaction ", action, buffer_action, is_random_action)
            # action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Rescale and perform action
            new_obs, reward, done, infos = env.step(action)
            # print("Reward: ", reward)
            # print("Observation:", new_obs)
            self.period_counter += 1
            self.current_observation = new_obs

            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)

            # Store data in replay buffer
            if replay_buffer is not None:
                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs()
                    reward_ = self._vec_normalize_env.get_original_reward()
                else:
                    # Avoid changing the original ones
                    self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done)

            old_observation = self._last_original_obs
            self._last_obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_

            self.num_timesteps += 1
            episode_timesteps += 1
            total_steps += 1

            self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule
            # For SAC/TD3, the update is done as the same time as the gradient update
            # see https://github.com/hill-a/stable-baselines/issues/900
            self._on_step()

            if 0 < n_steps <= total_steps:
                break

            # Compute the target Q values
            target_st1 = self.q_net_target(th.tensor(new_obs))
            # Follow greedy policy: use the one with the highest value
            target_st1, _ = target_st1.max(dim=1)
            # Avoid potential broadcast issue
            target_st1 = float(target_st1.reshape(-1, 1))

            # Compute the target Q values
            target_st = self.q_net_target(th.tensor(old_observation))
            # Follow greedy policy: use the one with the highest value
            target_st, _ = target_st.max(dim=1)
            # Avoid potential broadcast issue -> IS THIS NECESSARY?
            target_st = float(target_st.reshape(-1, 1))

            # TODO: Which of the three indices of target_st do we take? -> the maximum
            # TODO 2: according to the paper, rho is computed first, but here we do it one step delayed
            # buffer_action.astype(int)[0]
            if is_random_action == 0:
                decayed_alpha = self.exp_decay_alpha()
                self.rho = (1 - decayed_alpha) * self.rho + decayed_alpha * (reward_ + target_st1 - target_st)

            # Fixed observation for debugging purposes
            # TODO: GENERATE RANDOM NUMBERS OR COME UP WITH BETTER NUMBERS
            obs2 = th.tensor([[10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               6., 1., 0., 0., 3., 0., 8., 0., 0., 0., 3., 2.,
                               14., 2., 0., 0., 4., 2., 17., 1., 0., 0., 3., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               6., 1., 0., 0., 3., 0., 8., 0., 0., 0., 3., 2.,
                               14., 2., 0., 0., 4., 2., 17., 1., 0., 0., 3., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               6., 1., 0., 0., 3., 0., 8., 0., 0., 0., 3., 2.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.,
                               10., 0., 0., 0., 5., 0., 7., 1., 0., 0., 5., 1.]])
            fix_observation = self.q_net._predict(obs2)[1][0]

            with open('../' + 'q_values_learned_results.csv', mode='a') as results_CSV:
                results_writer = csv.writer(results_CSV, delimiter='\t', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                results_writer.writerow(
                    [self.period_counter, float(fix_observation[0]),
                     float(fix_observation[1]), float(fix_observation[2])])

            # Write reward to CSV file after each period
            with open('../' + 'rewards_per_period.csv', mode='a') as rewards_per_period_CSV:
                results_writer = csv.writer(rewards_per_period_CSV, delimiter='\t', quotechar='"',
                                            quoting=csv.QUOTE_MINIMAL)
                results_writer.writerow([self.period_counter, float(reward_), float(self.rho)])

        if done:
            total_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

    mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
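# Isolated sketch of the average-reward update used above: `rho` tracks an
# exponentially decayed estimate of r + max_a Q(s', a) - max_a Q(s, a), and is updated
# only on greedy (non-random) actions. The argument names are placeholders for the
# tensors computed in the snippet.
def update_rho(rho: float, alpha: float, reward: float,
               q_next_max: float, q_curr_max: float) -> float:
    # rho <- (1 - alpha) * rho + alpha * (r + max_a Q(s', a) - max_a Q(s, a))
    return (1.0 - alpha) * rho + alpha * (reward + q_next_max - q_curr_max)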
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, n_episodes: int = 1, n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, replay_buffer: Optional[ReplayBuffer] = None, log_interval: Optional[int] = None, ) -> RolloutReturn: """ Collect experiences and store them into a ``ReplayBuffer``. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param n_episodes: Number of episodes to use to collect rollout data You can also specify a ``n_steps`` instead :param n_steps: Number of steps to use to collect rollout data You can also specify a ``n_episodes`` instead. :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. :param replay_buffer: :param log_interval: Log data every ``log_interval`` episodes :return: """ episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" if self.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True while total_steps < n_steps or total_episodes < n_episodes: done = False episode_reward, episode_timesteps = 0.0, 0 while not done: if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy action, buffer_action = self._sample_action( learning_starts, action_noise) # Rescale and perform action new_obs, reward, done, infos = env.step(action) self.num_timesteps += 1 episode_timesteps += 1 total_steps += 1 # Give access to local variables callback.update_locals(locals()) # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) # Store data in replay buffer if replay_buffer is not None: # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs() reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) self._last_obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self._update_current_progress_remaining( self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 self._on_step() if 0 < n_steps <= total_steps: break if done: total_episodes += 1 self._episode_num += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Log training infos if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, train_freq: TrainFreq, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, log_interval: Optional[int] = None, ) -> RolloutReturn: """ Collect experiences and store them into a ReplayBuffer. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param train_freq: How much experience to collect by doing rollouts of current policy. Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)`` or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)`` with ``<n>`` being an integer greater than 0. :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. :param log_interval: Log data every ``log_interval`` episodes :return: """ episode_rewards, total_timesteps = [], [] num_collected_steps, num_collected_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" assert train_freq.frequency > 0, "Should at least collect one step or episode." if self.model.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): done = False episode_reward, episode_timesteps = 0.0, 0 while not done: # concatenate observation and (desired) goal observation = self._last_obs self._last_obs = ObsDictWrapper.convert_dict(observation) if (self.model.use_sde and self.model.sde_sample_freq > 0 and num_collected_steps % self.model.sde_sample_freq == 0): # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy self.model._last_obs = self._last_obs action, buffer_action = self._sample_action( learning_starts, action_noise) # Perform action new_obs, reward, done, infos = env.step(action) self.num_timesteps += 1 self.model.num_timesteps = self.num_timesteps episode_timesteps += 1 num_collected_steps += 1 # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) self.model.ep_info_buffer = self.ep_info_buffer self.model.ep_success_buffer = self.ep_success_buffer # == Store transition in the replay buffer and/or in the episode storage == if self._vec_normalize_env is not None: # Store only the unnormalized version new_obs_ = self._vec_normalize_env.get_original_obs() reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones self._last_original_obs, new_obs_, reward_ = observation, new_obs, reward self.model._last_original_obs = self._last_original_obs # As the VecEnv resets automatically, new_obs is already the # first observation of the next episode if done and infos[0].get("terminal_observation") is not None: next_obs = infos[0]["terminal_observation"] # VecNormalize normalizes the terminal observation if self._vec_normalize_env is not None: next_obs = self._vec_normalize_env.unnormalize_obs( next_obs) else: next_obs = new_obs_ if self.online_sampling: self.replay_buffer.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) else: # concatenate observation with (desired) goal flattened_obs = ObsDictWrapper.convert_dict( self._last_original_obs) flattened_next_obs = ObsDictWrapper.convert_dict(next_obs) # add to replay buffer self.replay_buffer.add(flattened_obs, flattened_next_obs, buffer_action, reward_, done) # add current transition to episode storage self._episode_storage.add(self._last_original_obs, next_obs, buffer_action, reward_, done, infos) self._last_obs = new_obs self.model._last_obs = self._last_obs # Save the unnormalized new observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self.model._last_original_obs = self._last_original_obs self.model._update_current_progress_remaining( self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 self.model._on_step() self.episode_steps += 1 if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): break if done or self.episode_steps >= self.max_episode_length: if self.online_sampling: self.replay_buffer.store_episode() else: self._episode_storage.store_episode() # sample virtual transitions and store them in replay buffer self._sample_her_transitions() # clear storage for current episode self._episode_storage.reset() num_collected_episodes += 1 self._episode_num += 1 self.model._episode_num = self._episode_num episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Log training infos if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() self.episode_steps = 0 mean_reward = np.mean( episode_rewards) if num_collected_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, train_freq: TrainFreq, buffer: TrajectoryBuffer, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, log_interval: Optional[int] = None, ) -> RolloutReturn: """ Collect experiences and store them into a ``TrajectoryBuffer``. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param train_freq: How much experience to collect by doing rollouts of current policy. Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)`` or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)`` with ``<n>`` being an integer greater than 0. :param action_noise: Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param learning_starts: Number of steps before learning for the warm-up phase. :param trajectory_buffer: :param log_interval: Log data every ``log_interval`` episodes :return: """ episode_rewards, total_timesteps = [], [] num_collected_steps, num_collected_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" # assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment" assert train_freq.frequency > 0, "Should at least collect one step or episode." if self.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True self.rollout_buffer.reset() done = np.array([False for i in range(self.n_envs)]) episode_reward, episode_timesteps = [0.0 for i in range(self.n_envs)], [0 for i in range(self.n_envs)] if train_freq.unit == TrainFrequencyUnit.STEP: self.trajectories = [Trajectory(self.device) for i in range(self.n_envs)] while True: ms = [0] get_ms(ms) if self.use_sde and self.sde_sample_freq > 0 and num_collected_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy with th.no_grad(): # action, buffer_action = self._sample_action(learning_starts, action_noise, use_behav=False) # log_probs = self.policy.get_action_log_probs(th.tensor(np.array(self._last_obs)).to(self.device), th.tensor(np.array([action])).T.to(self.device), use_behav=False) action, buffer_action = self._sample_action(learning_starts, action_noise, use_behav=True) log_probs = self.policy.get_action_log_probs(th.tensor(np.array(self._last_obs)).to(self.device), th.tensor(np.array([action])).T.to(self.device), use_behav=True) prob = th.exp(log_probs) prob = (1 - self.exploration_rate) * prob + (self.exploration_rate) * (1.0 / self.action_space.n) prob = prob.cpu().numpy() if (prob > 1).any(): print("prob > 1!!! => Code in offpac.py") print(prob) print(th.tensor(log_probs)) exit() new_obs, reward, done, infos = env.step(action) with th.no_grad(): if self.use_v_net: latent_pi, latent_vf, latent_sde = self.policy._get_latent(th.tensor(self._last_obs).to(self.device)) values = self.value_net(latent_vf).detach() else: values = self.policy.compute_value(th.tensor(self._last_obs).to(self.device), use_target_v=False).detach() # self.rollout_buffer.add(self._last_obs, action.reshape(-1, 1), reward, self._last_episode_starts, values, log_probs.flatten()) self.num_timesteps += env.num_envs num_collected_steps += env.num_envs # Give access to local variables callback.update_locals(locals()) # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) for i in range(len(self.trajectories)): # trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], new_obs[i], done[i], prob[i])) if done[i]: if infos[i]['terminal_observation'].dtype == np.float64: self.trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], infos[i]['terminal_observation'].astype(np.float32), done[i], prob[i])) else: self.trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], infos[i]['terminal_observation'], done[i], prob[i])) else: self.trajectories[i].add(Transition(self._last_obs[i], action[i], reward[i], new_obs[i], done[i], prob[i])) self._last_obs = new_obs self._last_episode_starts = done self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps) # For DQN, check if the target network should be updated # and update the exploration schedule # For SAC/TD3, the update is done as the same time as the gradient update # see https://github.com/hill-a/stable-baselines/issues/900 self._on_step() ''' if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes): # even if the episdoe is not finished, we store the trajectory because no more steps can be performed for traj_i, traj in enumerate(trajectories): self._store_transition(buffer, traj) total_timesteps.append(len(traj)) trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 break ''' # store transition of finished episode, but if not more steps can be collected, treat any trajectory as an episode if done.any(): num_collected_episodes += np.sum(done) self._episode_num += np.sum(done) if log_interval is not None and self._episode_num % log_interval == 0: self._dump_logs() if train_freq.unit == TrainFrequencyUnit.STEP: ending = not should_collect_more_steps(train_freq, num_collected_steps//self.n_envs, num_collected_episodes//self.n_envs) # if ending, save all trajectories, otherwise only save done episode if ending: for traj_i, traj in enumerate(self.trajectories): self._store_transition(buffer, traj) # total_timesteps.append(len(traj)) # is this line affecting anything???? self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 break else: if done.any(): traj_indexes = [i for i in np.arange(len(self.trajectories))[done]] for traj_i in traj_indexes: self._store_transition(buffer, self.trajectories[traj_i]) # total_timesteps.append(len(traj)) # is this line affecting anything???? self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 elif train_freq.unit == TrainFrequencyUnit.EPISODE: ending = not should_collect_more_steps(train_freq, num_collected_steps//self.n_envs, num_collected_episodes//self.n_envs) if done.any(): # if ending, save all trajectories even if not finished # if not ending: traj_indexes = [i for i in np.arange(len(self.trajectories))[done]] for traj_i in traj_indexes: self._store_transition(buffer, self.trajectories[traj_i]) # total_timesteps.append(len(traj)) # is this line affecting anything???? 
self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 ''' else: _trajectories = trajectories for traj_i, traj in enumerate(_trajectories): self._store_transition(buffer, traj) total_timesteps.append(len(traj)) # is this line affecting anything???? self.trajectories[traj_i] = Trajectory(self.device) episode_rewards.append(episode_reward[traj_i]) episode_reward[traj_i] = 0.0 ''' if ending: break else: print(train_freq.unit) raise Exception("Weird train_freq.unit...") exit(-1) if done.any(): if action_noise is not None: action_noise.reset() with th.no_grad(): obs_tensor = th.as_tensor(new_obs).squeeze(1).to(self.device) if self.use_v_net: latent_pi, latent_vf, latent_sde = self.policy._get_latent(obs_tensor) values = self.value_net(latent_vf).detach() else: values = self.policy.compute_value(obs_tensor, use_target_v=False) self.rollout_buffer.compute_returns_and_advantage(last_values=values, dones=done) mean_reward = np.mean(episode_rewards) if num_collected_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
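# Small sketch of the behaviour-policy probability stored with each transition in the
# trajectory collector above: the policy's probability is mixed with uniform
# exploration, matching prob = (1 - eps) * pi(a|s) + eps / |A|. Tensor shapes and the
# function name are assumptions for illustration only.
import torch as th

def behaviour_probability(log_prob: th.Tensor, exploration_rate: float, n_actions: int) -> th.Tensor:
    pi = th.exp(log_prob)
    return (1.0 - exploration_rate) * pi + exploration_rate / n_actions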
def collect_rollouts( self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int, ) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps * self.outer_steps: # here n_rollout_steps is n_steps in PPO args. Noted by Chenyin # while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(self._last_obs, self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 # (1) if at the T-th step, the env is going to reset, so we shall store the terminal states in advance # (2) if done, new_obs is the new state after resetting the env, so we need to get terminal state from infos if n_steps % n_rollout_steps == 0 or dones.any(): # if dones.any(): # second case: do not reset the env when encountering step T terminal_obs = new_obs.copy() infos_array = np.array(infos) # change list to numpy array i = 0 for done in dones: if done: terminal_obs[i] = infos_array[i][ "terminal_observation"] i += 1 with th.no_grad(): # Convert to pytorch tensor or to TensorDict obs_tensor = obs_as_tensor(terminal_obs, self.device) _, terminal_values, _ = self.policy.forward( obs_tensor) # in the infinite game, V(s_T) is defined else: # when dones = [False, ..., False] terminal_values = None if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs, terminal_values) # Chenyin if n_steps % n_rollout_steps == 0: self._last_obs = env.reset() self._last_episode_starts = np.ones((env.num_envs, ), dtype=bool) else: self._last_obs = new_obs self._last_episode_starts = dones # self._last_obs = new_obs # self._last_episode_starts = dones with th.no_grad(): # Compute value for the last timestep if n_steps % n_rollout_steps == 0 or dones.any(): # if dones.any(): # obs_tensor = obs_as_tensor(terminal_obs, self.device) # _, values, _ = self.policy.forward(obs_tensor) values = 
terminal_values assert values is not None else: obs_tensor = obs_as_tensor(new_obs, self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values) callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: CustomizedRolloutBuffer, n_rollout_steps: int) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() """ Sida """ short_hidden_states, long_hidden_states = None, None dones = None while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) """ Sida: get memory before passing forward, assuming there's only one rnn module for now. """ if self.policy.features_extractor.num_parallel_rnns: short_hidden_states = self.policy.features_extractor.cx_rollout.cpu( ).numpy() long_hidden_states = self.policy.features_extractor.hx_rollout.cpu( ).numpy() if dones is not None: dones = th.as_tensor(dones).to(self.device) actions, values, log_probs = self.policy.forward( obs_tensor, new_start=dones) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) """ Sida: add memory to rollout buffer """ rollout_buffer.add(short_hidden_states, long_hidden_states, self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones with th.no_grad(): # Compute value for the last timestep obs_tensor = th.as_tensor(new_obs).to(self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
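# Hedged sketch of how recurrent state is commonly masked at episode boundaries, in
# the spirit of the `new_start=dones` handling above. This is an assumption about what
# the feature extractor does internally, not its actual code; the (n_envs, hidden_dim)
# shapes and the helper name are placeholders.
import numpy as np
import torch as th

def mask_hidden_on_reset(hx: th.Tensor, cx: th.Tensor, dones: np.ndarray):
    # Zero the long- and short-term states for environments that just finished an episode.
    mask = th.as_tensor(1.0 - dones, dtype=hx.dtype, device=hx.device).unsqueeze(-1)
    return hx * mask, cx * mask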
def collect_rollouts(self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool: """ Collect experiences using the current policy and fill a ``RolloutBuffer``. The term rollout here refers to the model-free notion and should not be used with the concept of rollout used in model-based RL or planning. :param env: The training environment :param callback: Callback that will be called at each step (and at the beginning and end of the rollout) :param rollout_buffer: Buffer to fill with rollouts :param n_steps: Number of experiences to collect per environment :return: True if function returned with at least `n_rollout_steps` collected, False if callback terminated rollout prematurely. """ assert self._last_obs is not None, "No previous observation was provided" n_steps = 0 rollout_buffer.reset() # Sample new weights for the state dependent exploration if self.use_sde: self.policy.reset_noise(env.num_envs) callback.on_rollout_start() while n_steps < n_rollout_steps: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.policy.reset_noise(env.num_envs) with th.no_grad(): # Convert to pytorch tensor obs_tensor = th.as_tensor(self._last_obs).to(self.device) actions, values, log_probs = self.policy.forward(obs_tensor) actions = actions.cpu().numpy() # Rescale and perform action clipped_actions = actions # Clip the actions to avoid out of bound error if isinstance(self.action_space, gym.spaces.Box): clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high) new_obs, rewards, dones, infos = env.step(clipped_actions) if dones[0]: for info in infos: goal_diff = info['l_score'] - info['r_score'] print( f"Rewards: {goal_diff} | Score: [{info['l_score']} : {info['r_score']}]" ) self.scores.append(goal_diff) avg_score = sum(self.scores) / len(self.scores) print(f"Average Reward: {avg_score}") print("") if avg_score > self.best_score: self.best_score = avg_score self.save_best_model = True if self.log_handler is not None: self.log_handler.log({"Average Reward": avg_score}) self.num_timesteps += env.num_envs # Give access to local variables callback.update_locals(locals()) if callback.on_step() is False: return False self._update_info_buffer(infos) n_steps += 1 if isinstance(self.action_space, gym.spaces.Discrete): # Reshape in case of discrete action actions = actions.reshape(-1, 1) rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs) self._last_obs = new_obs self._last_dones = dones with th.no_grad(): # Compute value for the last timestep obs_tensor = th.as_tensor(new_obs).to(self.device) _, values, _ = self.policy.forward(obs_tensor) rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones) callback.on_rollout_end() return True
def collect_rollouts(self, env: VecEnv, # Type hint as string to avoid circular import callback: 'BaseCallback', n_episodes: int = 1, n_steps: int = -1, action_noise: Optional[ActionNoise] = None, learning_starts: int = 0, replay_buffer: Optional[ReplayBuffer] = None, log_interval: Optional[int] = None) -> RolloutReturn: """ Collect rollout using the current policy (and possibly fill the replay buffer) :param env: (VecEnv) The training environment :param n_episodes: (int) Number of episodes to use to collect rollout data You can also specify a ``n_steps`` instead :param n_steps: (int) Number of steps to use to collect rollout data You can also specify a ``n_episodes`` instead. :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration Required for deterministic policy (e.g. TD3). This can also be used in addition to the stochastic policy for SAC. :param callback: (BaseCallback) Callback that will be called at each step (and at the beginning and end of the rollout) :param learning_starts: (int) Number of steps before learning for the warm-up phase. :param replay_buffer: (ReplayBuffer) :param log_interval: (int) Log data every ``log_interval`` episodes :return: (RolloutReturn) """ episode_rewards, total_timesteps = [], [] total_steps, total_episodes = 0, 0 assert isinstance(env, VecEnv), "You must pass a VecEnv" assert env.num_envs == 1, "OffPolicyRLModel only support single environment" if self.use_sde: self.actor.reset_noise() callback.on_rollout_start() continue_training = True while total_steps < n_steps or total_episodes < n_episodes: done = False episode_reward, episode_timesteps = 0.0, 0 while not done: if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0: # Sample a new noise matrix self.actor.reset_noise() # Select action randomly or according to policy if self.num_timesteps < learning_starts and not (self.use_sde and self.use_sde_at_warmup): # Warmup phase unscaled_action = np.array([self.action_space.sample()]) else: # Note: we assume that the policy uses tanh to scale the action # We use non-deterministic action in the case of SAC, for TD3, it does not matter unscaled_action, _ = self.predict(self._last_obs, deterministic=False) # Rescale the action from [low, high] to [-1, 1] if isinstance(self.action_space, gym.spaces.Box): scaled_action = self.policy.scale_action(unscaled_action) # Add noise to the action (improve exploration) if action_noise is not None: # NOTE: in the original implementation of TD3, the noise was applied to the unscaled action # Update(October 2019): Not anymore scaled_action = np.clip(scaled_action + action_noise(), -1, 1) # We store the scaled action in the buffer buffer_action = scaled_action action = self.policy.unscale_action(scaled_action) else: # Discrete case, no need to normalize or clip buffer_action = unscaled_action action = buffer_action # Rescale and perform action new_obs, reward, done, infos = env.step(action) # Only stop training if return value is False, not when it is None. 
if callback.on_step() is False: return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False) episode_reward += reward # Retrieve reward and episode length if using Monitor wrapper self._update_info_buffer(infos, done) # Store data in replay buffer if replay_buffer is not None: # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs() reward_ = self._vec_normalize_env.get_original_reward() else: # Avoid changing the original ones self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward replay_buffer.add(self._last_original_obs, new_obs_, buffer_action, reward_, done) self._last_obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: self._last_original_obs = new_obs_ self.num_timesteps += 1 episode_timesteps += 1 total_steps += 1 if 0 < n_steps <= total_steps: break if done: total_episodes += 1 self._episode_num += 1 episode_rewards.append(episode_reward) total_timesteps.append(episode_timesteps) if action_noise is not None: action_noise.reset() # Display training infos if self.verbose >= 1 and log_interval is not None and self._episode_num % log_interval == 0: fps = int(self.num_timesteps / (time.time() - self.start_time)) self.logger.logkv("episodes", self._episode_num) if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0: self.logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer])) self.logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer])) self.logger.logkv("fps", fps) self.logger.logkv('time_elapsed', int(time.time() - self.start_time)) self.logger.logkv("total timesteps", self.num_timesteps) if self.use_sde: self.logger.logkv("std", (self.actor.get_std()).mean().item()) if len(self.ep_success_buffer) > 0: self.logger.logkv('success rate', self.safe_mean(self.ep_success_buffer)) self.logger.dumpkvs() mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0 callback.on_rollout_end() return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)
def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should
    not be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # Tag on the other agent's action
        submit_actions = clipped_actions
        if self.bridge and self.bridge.other(self.is_protagonist):
            other_actions = self.bridge.other(self.is_protagonist).predict(obs_tensor.cpu().numpy())[0]
            if len(other_actions.shape) < len(clipped_actions.shape):
                # `predict` returns a numpy array, so add the missing action axis with numpy
                # (the original torch-style `.unsqueeze(dim=1)` would fail here)
                other_actions = np.expand_dims(other_actions, axis=1)
            submit_actions = np.concatenate(
                [other_actions, clipped_actions] if self.is_protagonist else [clipped_actions, other_actions],
                axis=1)
        elif self.adv_action_space:
            submit_actions = np.concatenate(
                (np.array([np.full(self.adv_action_space.shape, np.nan)]), clipped_actions), axis=1)

        new_obs, rewards, dones, infos = env.step(submit_actions)
        if not self.is_protagonist:
            rewards = -rewards

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()

    return True
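# ----------------------------------------------------------------------------
# Minimal, self-contained illustration (values are made up) of the two-agent
# action stacking used above: the other agent's actions and this policy's
# clipped actions are concatenated along axis=1, adversary columns first when
# this policy is the protagonist, and NaN columns stand in for a missing
# adversary.
import numpy as np

protagonist_actions = np.array([[0.2, -0.1]])  # (n_envs, act_dim) from this policy
adversary_actions = np.array([0.5])            # (n_envs,) from the other agent's predict()

# The other agent's actions may come back without the action axis, so add it first
if adversary_actions.ndim < protagonist_actions.ndim:
    adversary_actions = np.expand_dims(adversary_actions, axis=1)

print(np.concatenate([adversary_actions, protagonist_actions], axis=1))
# [[ 0.5  0.2 -0.1]]

# Without a live adversary, NaNs mark its columns
placeholder = np.array([np.full((1,), np.nan)])
print(np.concatenate((placeholder, protagonist_actions), axis=1))
# [[ nan  0.2 -0.1]]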
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    train_freq: TrainFreq,
    replay_buffer: ReplayBuffer,
    action_noise: Optional[ActionNoise] = None,
    learning_starts: int = 0,
    log_interval: Optional[int] = None,
) -> RolloutReturn:
    """
    Collect experiences and store them into a ``ReplayBuffer``.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param train_freq: How much experience to collect
        by doing rollouts of current policy.
        Either ``TrainFreq(<n>, TrainFrequencyUnit.STEP)``
        or ``TrainFreq(<n>, TrainFrequencyUnit.EPISODE)``
        with ``<n>`` being an integer greater than 0.
    :param action_noise: Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: Number of steps before learning for the warm-up phase.
    :param replay_buffer:
    :param log_interval: Log data every ``log_interval`` episodes
    :return:
    """
    episode_rewards, total_timesteps = [], []
    num_collected_steps, num_collected_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    assert env.num_envs == 1, "OffPolicyAlgorithm only supports a single environment"
    assert train_freq.frequency > 0, "Should at least collect one step or episode."

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and num_collected_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Rescale and perform action
            new_obs, reward, done, infos = env.step(action)

            self.num_timesteps += 1
            episode_timesteps += 1
            num_collected_steps += 1

            # Give access to local variables
            callback.update_locals(locals())
            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, num_collected_steps, num_collected_episodes, continue_training=False)

            episode_reward += reward

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos, done)

            # Store data in replay buffer (normalized action and unnormalized observation)
            self._store_transition(replay_buffer, buffer_action, new_obs, reward, done, infos)

            self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule.
            # For SAC/TD3, the update is done at the same time as the gradient update,
            # see https://github.com/hill-a/stable-baselines/issues/900
            self._on_step()

            if not should_collect_more_steps(train_freq, num_collected_steps, num_collected_episodes):
                break

        if done:
            num_collected_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

    mean_reward = np.mean(episode_rewards) if num_collected_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, num_collected_steps, num_collected_episodes, continue_training)
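# ----------------------------------------------------------------------------
# Rough sketch (an assumption for illustration, not the library source) of the
# ``should_collect_more_steps`` predicate used above: with a step-based
# ``TrainFreq`` the rollout runs until ``frequency`` steps are collected, with
# an episode-based one until ``frequency`` episodes have finished.
from stable_baselines3.common.type_aliases import TrainFreq, TrainFrequencyUnit

def should_collect_more_steps_sketch(train_freq: TrainFreq,
                                     num_collected_steps: int,
                                     num_collected_episodes: int) -> bool:
    if train_freq.unit == TrainFrequencyUnit.STEP:
        return num_collected_steps < train_freq.frequency
    return num_collected_episodes < train_freq.frequency

# e.g. collecting exactly one full episode per rollout before each training phase:
# collect_rollouts(env, callback, TrainFreq(1, TrainFrequencyUnit.EPISODE), replay_buffer)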
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should
    not be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    # debug ===============================================================
    # Note: `mode` is assumed to be a module-level debug flag defined elsewhere in this file.
    if mode == 'debug':
        print(["OPA.collect_rollouts started, let's roll!"])

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        # notes ===========================================================
        # use last observation to generate action (with log probs) and value
        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts loop',
                   'n_rollout_steps:', n_rollout_steps, 'n_steps:', n_steps])
            print(['OPA.collect_rollouts loop eval',
                   'last_obs:', self._last_obs, 'actions', actions,
                   'values', values, 'log_probs', log_probs])

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # notes ===========================================================
        # use clipped_actions to interact with env
        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts loop save',
                   'last_obs:', self._last_obs, 'actions', actions,
                   'values', values, 'log_probs', log_probs,
                   'rewards', rewards, 'last_dones', self._last_dones])

        # notes ===========================================================
        # 6 things to save in buffer: last_obs, actions, rewards, last_dones, values, log_probs
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    # debug ===============================================================
    if mode == 'debug':
        print(['OPA.collect_rollouts last',
               'new_obs:', new_obs, 'values:', values, 'dones:', dones])
        print(['OPA.collect_rollouts finished, ready to compute_returns'])

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()

    return True
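# ----------------------------------------------------------------------------
# Simplified, self-contained sketch (assumptions: numpy inputs of shape
# (buffer_size, n_envs)) of what ``compute_returns_and_advantage(last_values,
# dones)`` produces at the end of the on-policy rollouts above: GAE(lambda)
# advantages bootstrapped with the value of the final observation, and
# returns = advantages + values.
import numpy as np

def compute_gae_sketch(rewards, values, dones, last_values, last_dones,
                       gamma=0.99, gae_lambda=0.95):
    buffer_size = rewards.shape[0]
    advantages = np.zeros_like(rewards)
    last_gae = 0.0
    for step in reversed(range(buffer_size)):
        if step == buffer_size - 1:
            # Bootstrap with the value of the observation after the last stored step
            next_non_terminal = 1.0 - last_dones
            next_values = last_values
        else:
            next_non_terminal = 1.0 - dones[step + 1]
            next_values = values[step + 1]
        # TD error, then the GAE recursion
        delta = rewards[step] + gamma * next_values * next_non_terminal - values[step]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[step] = last_gae
    returns = advantages + values
    return advantages, returns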