def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int = 256) -> bool:
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1
        self.num_timesteps += env.num_envs

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs)
        self._last_obs = new_obs

    rollout_buffer.compute_returns_and_advantage(values, dones=dones)

    callback.on_rollout_end()
    return True
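
# Every variant in this file ends by calling rollout_buffer.compute_returns_and_advantage.
# For reference, a minimal standalone sketch of the GAE(lambda) computation that call
# performs (hypothetical helper, not part of any class here; 1-D numpy arrays over a
# single env, where dones[t] means the episode ended at step t and last_values is the
# value estimate for the observation following the final stored step):
import numpy as np

def gae_sketch(rewards, values, dones, last_values, gamma=0.99, lam=0.95):
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    n = len(rewards)
    advantages = np.zeros(n, dtype=np.float64)
    gae = 0.0
    for t in reversed(range(n)):
        next_non_terminal = 1.0 - float(dones[t])
        next_value = last_values if t == n - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
        gae = delta + gamma * lam * next_non_terminal * gae
        advantages[t] = gae
    returns = advantages + values  # TD(lambda) return estimates
    return advantages, returns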

def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool:
    """
    Collect rollouts using the current policy and fill a `RolloutBuffer`.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts
    :param n_rollout_steps: (int) Number of experiences to collect per environment
    :return: (bool) True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1
        self.num_timesteps += env.num_envs

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
        self._last_obs = new_obs
        self._last_dones = dones

    rollout_buffer.compute_returns_and_advantage(values, dones=dones)

    callback.on_rollout_end()
    return True
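
# The variant above differs from the first one in a single detail: the buffer row for
# step t stores self._last_dones (the dones emitted at the end of step t-1, i.e. whether
# t began a new episode) rather than the dones produced by step t itself. A minimal
# sketch of the bookkeeping with made-up values for one env:
import numpy as np

dones_per_step = [np.array([False]), np.array([True]), np.array([False])]
last_dones = np.array([False])  # nothing ended before the first stored step
for step, dones in enumerate(dones_per_step):
    # the row for this step records whether it started a new episode ...
    print(f"step {step}: stored episode-start flag {last_dones}, step dones {dones}")
    last_dones = dones  # ... and the current dones describe the *next* row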

def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int = 256) -> bool:
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1
        self.num_timesteps += env.num_envs

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs)
        self._last_obs = new_obs

    rollout_buffer.compute_returns_and_advantage(values, dones=dones)

    # # MSA debugging learning
    # try:
    #     import copy
    #     c_rb = copy.copy(rollout_buffer)
    #     self.rollout_buffer_hist.append(c_rb)
    # except:
    #     pass
    # if len(self.rollout_buffer_hist) == 25:
    #     import matplotlib.pyplot as plt
    #     n_envs = 4
    #     V = np.empty((0, n_envs), float)
    #     A = np.empty((0, n_envs), float)
    #     R = np.empty((0, n_envs), float)
    #     lp = np.empty((0, n_envs), float)
    #     r = np.empty((0, n_envs), float)
    #     a = np.empty((0, n_envs, actions.shape[1]), float)
    #     S = np.empty((0, n_envs, new_obs.shape[1]), float)
    #     for rb in self.rollout_buffer_hist:
    #         V = np.append(V, rb.values, axis=0)
    #         A = np.append(A, rb.advantages, axis=0)
    #         R = np.append(R, rb.returns, axis=0)
    #         lp = np.append(lp, rb.log_probs, axis=0)
    #         r = np.append(r, rb.rewards, axis=0)
    #         a = np.append(a, rb.actions, axis=0)
    #         S = np.append(S, rb.observations, axis=0)
    #     plt.plot(V)
    #     plt.title('Values')
    #     dir_no = "2"
    #     filename = "RL_detailed_plots/" + dir_no + "/V.png"
    #     plt.savefig(filename)
    #     plt.close()
    #
    #     plt.plot(A)
    #     plt.title('Advantages')
    #     filename = "RL_detailed_plots/" + dir_no + "/A.png"
    #     plt.savefig(filename)
    #     plt.close()
    #
    #     plt.plot(R)
    #     plt.title('Returns')
    #     filename = "RL_detailed_plots/" + dir_no + "/R.png"
    #     plt.savefig(filename)
    #     plt.close()
    #
    #     plt.plot(lp)
    #     plt.title('Log Probs')
    #     filename = "RL_detailed_plots/" + dir_no + "/lp.png"
    #     plt.savefig(filename)
    #     plt.close()
    #
    #     plt.plot(r)
    #     plt.title('rewards')
    #     filename = "RL_detailed_plots/" + dir_no + "/rew.png"
    #     plt.savefig(filename)
    #     plt.close()
    #
    #     try:
    #         fig, axes = plt.subplots(nrows=actions.shape[1], ncols=1, figsize=(8, actions.shape[1]))
    #         for i in range(actions.shape[1]):
    #             axes[i].plot(a[:, :, i])
    #         plt.suptitle('Actions', y=1)
    #         filename = "RL_detailed_plots/" + dir_no + "/act.png"
    #         plt.savefig(filename)
    #         plt.close()
    #     except:
    #         plt.plot(a[:, :, 0])
    #         plt.title('Actions')
    #         filename = "RL_detailed_plots/" + dir_no + "/act.png"
    #         plt.savefig(filename)
    #         plt.close()
    #
    #     fig, axes = plt.subplots(nrows=new_obs.shape[1], ncols=1, figsize=(8, 2 * new_obs.shape[1]))
    #     for i in range(new_obs.shape[1]):
    #         axes[i].plot(S[:, :, i])
    #     plt.suptitle('States', y=1)
    #     filename = "RL_detailed_plots/" + dir_no + "/S.png"
    #     plt.savefig(filename)
    #     plt.close()

    callback.on_rollout_end()
    return True

def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int,
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    # Switch to eval mode (this affects batch norm / dropout)
    self.policy.set_training_mode(False)

    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor or to TensorDict
            obs_tensor = obs_as_tensor(self._last_obs, self.device)
            actions, values, log_probs = self.policy(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)

        # Handle timeout by bootstrapping with value function
        # see GitHub issue #633
        for idx, done in enumerate(dones):
            if (
                done
                and infos[idx].get("terminal_observation") is not None
                and infos[idx].get("TimeLimit.truncated", False)
            ):
                terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
                with th.no_grad():
                    terminal_value = self.policy.predict_values(terminal_obs)[0]
                rewards[idx] += self.gamma * terminal_value

        rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs)
        self._last_obs = new_obs
        self._last_episode_starts = dones

    with th.no_grad():
        # Compute value for the last timestep
        values = self.policy.predict_values(obs_as_tensor(new_obs, self.device))

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()
    return True
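
# A hedged standalone sketch of the timeout handling above: when an episode ends only
# because of a TimeLimit truncation, the reward is augmented with the discounted value
# of the terminal observation, so the advantage estimator does not treat the cutoff as
# a true terminal state. `value_fn` is a hypothetical stand-in for
# self.policy.predict_values applied to an already-converted observation.
import numpy as np

def bootstrap_truncated_rewards(rewards, dones, infos, value_fn, gamma=0.99):
    rewards = np.array(rewards, dtype=np.float64, copy=True)
    for idx, done in enumerate(dones):
        if (
            done
            and infos[idx].get("terminal_observation") is not None
            and infos[idx].get("TimeLimit.truncated", False)
        ):
            rewards[idx] += gamma * value_fn(infos[idx]["terminal_observation"])
    return rewards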

def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # Tag on the other agent's action
        submit_actions = clipped_actions
        if self.bridge and self.bridge.other(self.is_protagonist):
            other_actions = self.bridge.other(self.is_protagonist).predict(
                obs_tensor.cpu().numpy())[0]
            if len(other_actions.shape) < len(clipped_actions.shape):
                # `predict` returns a numpy array, so add the missing axis with
                # numpy indexing (the original called the torch-only `unsqueeze`)
                other_actions = other_actions[:, None]
            submit_actions = np.concatenate(
                [other_actions, clipped_actions]
                if self.is_protagonist else [clipped_actions, other_actions],
                axis=1)
        elif self.adv_action_space:
            submit_actions = np.concatenate(
                (np.array([np.full(self.adv_action_space.shape, np.nan)]), clipped_actions),
                axis=1)

        new_obs, rewards, dones, infos = env.step(submit_actions)
        if not self.is_protagonist:
            rewards = -rewards

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()
    return True
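
# The `self.bridge` object used above is not defined in this file. A minimal
# hypothetical sketch of the protocol the code assumes (written only to make the
# bridge.other(is_protagonist) calls concrete): the bridge links a protagonist model
# to an adversary model and hands each one its opponent.
class Bridge:
    def __init__(self, protagonist=None, adversary=None):
        self.protagonist = protagonist
        self.adversary = adversary

    def other(self, is_protagonist: bool):
        # The protagonist asks for the adversary and vice versa; returns None
        # until the opposing agent has been linked.
        return self.adversary if is_protagonist else self.protagonist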

def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int,
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    # here n_rollout_steps is n_steps in PPO args. Noted by Chenyin
    while n_steps < n_rollout_steps * self.outer_steps:
        # while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor or to TensorDict
            obs_tensor = obs_as_tensor(self._last_obs, self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        # (1) if at the T-th step, the env is going to reset, so we shall store the terminal states in advance
        # (2) if done, new_obs is the new state after resetting the env, so we need to get terminal state from infos
        if n_steps % n_rollout_steps == 0 or dones.any():
            # if dones.any():  # second case: do not reset the env when encountering step T
            terminal_obs = new_obs.copy()
            infos_array = np.array(infos)  # change list to numpy array
            for i, done in enumerate(dones):
                if done:
                    terminal_obs[i] = infos_array[i]["terminal_observation"]
            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(terminal_obs, self.device)
                # in the infinite game, V(s_T) is defined
                _, terminal_values, _ = self.policy.forward(obs_tensor)
        else:
            # when dones = [False, ..., False]
            terminal_values = None

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts,
                           values, log_probs, terminal_values)  # Chenyin

        if n_steps % n_rollout_steps == 0:
            self._last_obs = env.reset()
            self._last_episode_starts = np.ones((env.num_envs,), dtype=bool)
        else:
            self._last_obs = new_obs
            self._last_episode_starts = dones
        # self._last_obs = new_obs
        # self._last_episode_starts = dones

    with th.no_grad():
        # Compute value for the last timestep
        if n_steps % n_rollout_steps == 0 or dones.any():
            # if dones.any():
            # obs_tensor = obs_as_tensor(terminal_obs, self.device)
            # _, values, _ = self.policy.forward(obs_tensor)
            values = terminal_values
            assert values is not None
        else:
            obs_tensor = obs_as_tensor(new_obs, self.device)
            _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values)

    callback.on_rollout_end()
    return True
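
# The variant above passes an extra `terminal_values` argument to rollout_buffer.add,
# so it assumes a RolloutBuffer modified accordingly (not shown in this file). A
# heavily hedged sketch of the GAE step such a buffer could use in this
# infinite-horizon setting: when step t ended an episode, bootstrap with V of the
# terminal observation instead of zero, while still cutting the GAE recursion at the
# episode boundary. All names below are hypothetical; 1-D arrays over a single env.
import numpy as np

def gae_with_terminal_bootstrap(rewards, values, terminal_values, dones,
                                last_values, gamma=0.99, lam=0.95):
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    n = len(rewards)
    advantages = np.zeros(n, dtype=np.float64)
    gae = 0.0
    for t in reversed(range(n)):
        if t == n - 1:
            next_value, next_non_terminal = last_values, 1.0
        elif dones[t]:
            # episode ended at t: bootstrap with V(s_T), do not carry gae across
            next_value, next_non_terminal = terminal_values[t], 0.0
        else:
            next_value, next_non_terminal = values[t + 1], 1.0
        delta = rewards[t] + gamma * next_value - values[t]
        gae = delta + gamma * lam * next_non_terminal * gae
        advantages[t] = gae
    return advantages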

def collect_rollouts(
    self,
    opponent_model,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int,
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param opponent_model: Model whose policy generates the opponent's actions
    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()

    # Initialize observation of the OPPONENT MODEL
    opponent_model._last_obs = self._last_obs  # MIGHT NEED TO CHANGE THIS

    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    volley_env = gym.make("SlimeVolley-v0")  # WORKS FOR NOW, MIGHT NEED BETTER WAYS TO DO THIS

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # OPPONENT MODEL
        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor_op = th.as_tensor(opponent_model._last_obs).to(self.device)
            actions_op, values_op, log_probs_op = opponent_model.policy.forward(obs_tensor_op)
        actions_op = actions_op.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # OPPONENT MODEL
        # Rescale and perform action
        clipped_actions_op = actions_op
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions_op = np.clip(actions_op, self.action_space.low, self.action_space.high)

        # Step the two-player env with both actions
        # Originally: new_obs, rewards, dones, infos = env.step(clipped_actions)
        action_n = np.array([clipped_actions[0], clipped_actions_op[0]])
        new_obs_n, rewards_n, dones_n, info_n = volley_env.step(action_n)

        # Split the joint transition into this agent's view (single env assumed)
        new_obs = np.array([new_obs_n[0]])
        rewards = np.array([rewards_n[0]])
        dones = np.array([dones_n[0]])
        infos = np.array([info_n])

        # OPPONENT MODEL
        new_obs_op = np.array([new_obs_n[1]])
        # new_obs_op is already batched; the original wrapped it in a second
        # np.array([...]), producing a (1, 1, obs_dim) observation
        opponent_model._last_obs = new_obs_op

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()
    return True
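
# Hedged usage sketch for the opponent-model variant above. `SelfPlayPPO` is a
# hypothetical name for whatever algorithm class defines this method (the real class
# name is not shown here); slimevolleygym is assumed installed, which registers
# "SlimeVolley-v0" with gym on import:
#
#   import slimevolleygym  # noqa: F401
#   agent = SelfPlayPPO("MlpPolicy", "SlimeVolley-v0")
#   opponent = SelfPlayPPO("MlpPolicy", "SlimeVolley-v0")
#   continue_training = agent.collect_rollouts(
#       opponent, agent.env, callback, agent.rollout_buffer, n_rollout_steps=2048
#   )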

def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: RolloutBuffer, n_rollout_steps: int) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        if dones[0]:
            for info in infos:
                goal_diff = info['l_score'] - info['r_score']
                print(f"Rewards: {goal_diff} | Score: [{info['l_score']} : {info['r_score']}]")
                self.scores.append(goal_diff)
            avg_score = sum(self.scores) / len(self.scores)
            print(f"Average Reward: {avg_score}")
            print("")
            if avg_score > self.best_score:
                self.best_score = avg_score
                self.save_best_model = True
            if self.log_handler is not None:
                self.log_handler.log({"Average Reward": avg_score})

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()
    return True
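
# `self.log_handler` above only needs a log(dict) method (the original presumably
# wraps something like wandb). A minimal hypothetical stand-in that just prints:
class PrintLogHandler:
    def log(self, metrics: dict) -> None:
        for name, value in metrics.items():
            print(f"{name}: {value}")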

def collect_rollouts(
    self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should not
    be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least `n_rollout_steps`
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    # debug ===============================================================
    # NOTE: `mode` is read as a module-level flag; it is not defined in this method
    if mode == 'debug':
        print(["OPA.collect_rollouts started, let's roll!"])

    while n_steps < n_rollout_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        # notes ===========================================================
        # use last observation to generate action (with log probs) and value
        with th.no_grad():
            # Convert to pytorch tensor
            obs_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts loop', 'n_rollout_steps:', n_rollout_steps,
                   'n_steps:', n_steps])
            print(['OPA.collect_rollouts loop eval', 'last_obs:', self._last_obs,
                   'actions', actions, 'values', values, 'log_probs', log_probs])

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        # notes ===========================================================
        # use clipped_actions to interact with env
        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)

        # debug ===========================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts loop save', 'last_obs:', self._last_obs,
                   'actions', actions, 'values', values, 'log_probs', log_probs,
                   'rewards', rewards, 'last_dones', self._last_dones])

        # notes ===========================================================
        # 6 things to save in buffer: last_obs, actions, rewards, last_dones, values, log_probs
        self._last_obs = new_obs
        self._last_dones = dones

    with th.no_grad():
        # Compute value for the last timestep
        obs_tensor = th.as_tensor(new_obs).to(self.device)
        _, values, _ = self.policy.forward(obs_tensor)

    # debug ===============================================================
    if mode == 'debug':
        print(['OPA.collect_rollouts last', 'new_obs:', new_obs, 'values:', values, 'dones:', dones])
        print(['OPA.collect_rollouts finished, ready to compute_returns'])

    rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

    callback.on_rollout_end()
    return True
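
# All variants in this file stop collecting early when callback.on_step() returns
# False. A minimal custom callback sketch using the standard stable-baselines3 API
# (hypothetical example, not taken from this file) that aborts collection after a
# fixed number of timesteps:
from stable_baselines3.common.callbacks import BaseCallback

class StopAfterSteps(BaseCallback):
    def __init__(self, max_timesteps: int, verbose: int = 0):
        super().__init__(verbose)
        self.max_timesteps = max_timesteps

    def _on_step(self) -> bool:
        # Returning False here makes collect_rollouts return False as well,
        # which ends the rollout (and training) early.
        return self.num_timesteps < self.max_timesteps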