def env_step(env, action):
    # action should be of size: batch x 1
    action = ptu.get_numpy(action.squeeze(dim=-1))
    next_obs, reward, done, info = env.step(action)
    # move to torch
    next_obs = ptu.from_numpy(next_obs).view(-1, next_obs.shape[0])
    reward = ptu.FloatTensor([reward]).view(-1, 1)
    done = ptu.from_numpy(np.array(done, dtype=int)).view(-1, 1)
    return next_obs, reward, done, info
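# Hedged usage sketch (not from the source): a minimal stand-in environment illustrates the
# tensor-shape contract of env_step -- a (batch x 1) torch action goes in, and torch tensors
# of shape (1, obs_dim), (1, 1), (1, 1) come back. Assumes numpy (np) and the repo's `ptu`
# helpers (from_numpy / FloatTensor) are in scope, and the classic 4-tuple gym step() API.
# The class and underscore-prefixed names are hypothetical, for illustration only.
class _DummyEnv:
    """Toy env with a 2-dim observation and the classic gym step() signature."""
    def reset(self):
        return np.zeros(2)

    def step(self, action):
        return np.ones(2), 1.0, False, {}

_env = _DummyEnv()
_obs = ptu.from_numpy(_env.reset()).view(1, -1)      # (1, obs_dim)
_action = ptu.FloatTensor([[0.]])                    # (batch=1, 1)
_next_obs, _reward, _done, _info = env_step(_env, _action)
assert _next_obs.shape == (1, 2) and _reward.shape == (1, 1) and _done.shape == (1, 1)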
def collect_rollouts_per_task(task_idx, agent, policy_storage, env, num_rollouts):
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(env.reset(task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False

        while not done_rollout:
            action, _, _, _ = agent.act(obs=obs)  # SAC
            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = env.unwrapped.is_goal_state() if "is_goal_state" in dir(env.unwrapped) else False
            rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
            policy_storage.add_sample(
                task=0,  # task_idx
                observation=ptu.get_numpy(obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=rew_to_buffer,
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()
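# Hedged sketch (not from the source): `policy_storage` is not defined in this section. A
# minimal hypothetical stand-in with the same add_sample signature, showing what is stored
# per transition; the real repo presumably uses a fixed-size multi-task replay buffer rather
# than plain Python lists.
class SimpleMultiTaskBuffer:
    def __init__(self):
        self._transitions = {}  # task id -> list of (obs, action, reward, terminal, next_obs)

    def add_sample(self, task, observation, action, reward, terminal, next_observation):
        self._transitions.setdefault(task, []).append(
            (observation, action, reward, terminal, next_observation))

    def num_samples(self, task):
        return len(self._transitions.get(task, []))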
def evaluate(self, tasks):
    num_episodes = self.args.max_rollouts_per_task
    num_steps_per_episode = self.env.unwrapped._max_episode_steps

    returns_per_episode = np.zeros((len(tasks), num_episodes))
    success_rate = np.zeros(len(tasks))

    if self.args.policy == 'dqn':
        values = np.zeros((len(tasks), self.args.max_trajectory_len))
    else:
        obs_size = self.env.unwrapped.observation_space.shape[0]
        observations = np.zeros((len(tasks), self.args.max_trajectory_len + 1, obs_size))
        log_probs = np.zeros((len(tasks), self.args.max_trajectory_len))

    for task_idx, task in enumerate(tasks):
        obs = ptu.from_numpy(self.env.reset(task))
        obs = obs.reshape(-1, obs.shape[-1])
        step = 0

        if self.args.policy == 'sac':
            observations[task_idx, step, :] = ptu.get_numpy(obs[0, :obs_size])

        for episode_idx in range(num_episodes):
            running_reward = 0.
            for step_idx in range(num_steps_per_episode):
                if self.args.policy == 'dqn':
                    action, value = self.agent.act(obs=obs, deterministic=True)
                else:
                    action, _, _, log_prob = self.agent.act(
                        obs=obs,
                        deterministic=self.args.eval_deterministic,
                        return_log_prob=True)

                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                running_reward += reward.item()

                if self.args.policy == 'dqn':
                    values[task_idx, step] = value.item()
                else:
                    observations[task_idx, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                    log_probs[task_idx, step] = ptu.get_numpy(log_prob[0])

                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                    success_rate[task_idx] = 1.

                # set: obs <- next_obs
                obs = next_obs.clone()
                step += 1

            returns_per_episode[task_idx, episode_idx] = running_reward

    if self.args.policy == 'dqn':
        return returns_per_episode, success_rate, values
    else:
        return returns_per_episode, success_rate, log_probs, observations
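# Hedged usage note (not from the source): the arrays returned by evaluate() are typically
# reduced to scalar logging metrics; the variable names below are illustrative only.
#
#   returns_per_episode, success_rate, log_probs, observations = learner.evaluate(tasks)
#   avg_return_final_episode = returns_per_episode[:, -1].mean()   # mean over eval tasks
#   avg_success_rate = success_rate.mean()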
def collect_rollouts(self, num_rollouts, random_actions=False):
    '''
    :param num_rollouts:
    :param random_actions: whether to sample actions from the policy or uniformly from the action space
    :return:
    '''
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(self.env.reset(self.task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False

        while not done_rollout:
            if random_actions:
                if self.args.policy == 'dqn':
                    action = ptu.FloatTensor([[[self.env.action_space.sample()]]]).long()  # Sample random action
                else:
                    action = ptu.FloatTensor([self.env.action_space.sample()])  # Sample random action
            else:
                if self.args.policy == 'dqn':
                    action, _ = self.agent.act(obs=obs)  # DQN
                else:
                    action, _, _, _ = self.agent.act(obs=obs)  # SAC

            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = self.env.unwrapped.is_goal_state() if "is_goal_state" in dir(self.env.unwrapped) else False
            if self.args.dense_train_sparse_test:
                rew_to_buffer = {rew_type: rew for rew_type, rew in info.items()
                                 if rew_type.startswith('reward')}
            else:
                rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
            self.policy_storage.add_sample(
                task=self.task_idx,
                observation=ptu.get_numpy(obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=rew_to_buffer,
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()

            # update statistics
            self._n_env_steps_total += 1
            if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                # count successes
                self._successes_in_buffer += 1
        self._n_rollouts_total += 1
def evaluate(self):
    num_episodes = self.args.max_rollouts_per_task
    num_steps_per_episode = self.env.unwrapped._max_episode_steps
    num_tasks = self.args.num_eval_tasks
    obs_size = self.env.unwrapped.observation_space.shape[0]

    returns_per_episode = np.zeros((num_tasks, num_episodes))
    success_rate = np.zeros(num_tasks)

    rewards = np.zeros((num_tasks, self.args.trajectory_len))
    reward_preds = np.zeros((num_tasks, self.args.trajectory_len))
    observations = np.zeros((num_tasks, self.args.trajectory_len + 1, obs_size))
    if self.args.policy == 'sac':
        log_probs = np.zeros((num_tasks, self.args.trajectory_len))

    # This part is very specific for the Semi-Circle env
    # if self.args.env_name == 'PointRobotSparse-v0':
    #     reward_belief = np.zeros((num_tasks, self.args.trajectory_len))
    #
    #     low_x, high_x, low_y, high_y = -2., 2., -1., 2.
    #     resolution = 0.1
    #     grid_x = np.arange(low_x, high_x + resolution, resolution)
    #     grid_y = np.arange(low_y, high_y + resolution, resolution)
    #     centers_x = (grid_x[:-1] + grid_x[1:]) / 2
    #     centers_y = (grid_y[:-1] + grid_y[1:]) / 2
    #     yv, xv = np.meshgrid(centers_y, centers_x, sparse=False, indexing='ij')
    #     centers = np.vstack([xv.ravel(), yv.ravel()]).T
    #     n_grid_points = centers.shape[0]
    #     reward_belief_discretized = np.zeros((num_tasks, self.args.trajectory_len, centers.shape[0]))

    for task_loop_i, task in enumerate(self.env.unwrapped.get_all_eval_task_idx()):
        obs = ptu.from_numpy(self.env.reset(task))
        obs = obs.reshape(-1, obs.shape[-1])
        step = 0

        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(batch_size=1)

        observations[task_loop_i, step, :] = ptu.get_numpy(obs[0, :obs_size])

        for episode_idx in range(num_episodes):
            running_reward = 0.
            for step_idx in range(num_steps_per_episode):
                # add distribution parameters to observation - policy is conditioned on posterior
                augmented_obs = self.get_augmented_obs(obs, task_mean, task_logvar)
                if self.args.policy == 'dqn':
                    action, value = self.agent.act(obs=augmented_obs, deterministic=True)
                else:
                    action, _, _, log_prob = self.agent.act(
                        obs=augmented_obs,
                        deterministic=self.args.eval_deterministic,
                        return_log_prob=True)

                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                running_reward += reward.item()
                # done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

                # update encoding
                task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                    obs=next_obs,
                    action=action,
                    reward=reward,
                    done=done,
                    hidden_state=hidden_state)

                rewards[task_loop_i, step] = reward.item()
                reward_preds[task_loop_i, step] = ptu.get_numpy(
                    self.vae.reward_decoder(task_sample, next_obs, obs, action)[0, 0])

                # This part is very specific for the Semi-Circle env
                # if self.args.env_name == 'PointRobotSparse-v0':
                #     reward_belief[task, step] = ptu.get_numpy(
                #         self.vae.compute_belief_reward(task_mean, task_logvar, obs, next_obs, action)[0])
                #
                #     reward_belief_discretized[task, step, :] = ptu.get_numpy(
                #         self.vae.compute_belief_reward(task_mean.repeat(n_grid_points, 1),
                #                                        task_logvar.repeat(n_grid_points, 1),
                #                                        None,
                #                                        torch.cat((ptu.FloatTensor(centers),
                #                                                   ptu.zeros(centers.shape[0], 1)), dim=-1).unsqueeze(0),
                #                                        None)[:, 0])

                observations[task_loop_i, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                if self.args.policy != 'dqn':
                    log_probs[task_loop_i, step] = ptu.get_numpy(log_prob[0])

                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                    success_rate[task_loop_i] = 1.

                # set: obs <- next_obs
                obs = next_obs.clone()
                step += 1

            returns_per_episode[task_loop_i, episode_idx] = running_reward

    if self.args.policy == 'dqn':
        return returns_per_episode, success_rate, observations, rewards, reward_preds
    # This part is very specific for the Semi-Circle env
    # elif self.args.env_name == 'PointRobotSparse-v0':
    #     return returns_per_episode, success_rate, log_probs, observations, \
    #            rewards, reward_preds, reward_belief, reward_belief_discretized, centers
    else:
        return returns_per_episode, success_rate, log_probs, observations, rewards, reward_preds
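# Hedged sketch (not from the source): get_augmented_obs and update_encoding are used by the
# evaluation and rollout loops in this section but are not shown here. The versions below are
# hypothetical stand-ins (hence the _sketch suffix), assuming the policy is conditioned on the
# posterior by concatenating the raw observation with the task mean and log-variance, and that
# the recurrent VAE encoder takes (action, state, reward, hidden_state) positionally, as in
# rollout_policy further below.
def get_augmented_obs_sketch(obs, task_mean=None, task_logvar=None):
    # obs: (1, obs_dim); task_mean / task_logvar: (1, latent_dim) or None
    augmented_obs = obs.clone()
    if task_mean is not None and task_logvar is not None:
        augmented_obs = torch.cat((augmented_obs, task_mean, task_logvar), dim=-1)
    return augmented_obs


def update_encoding_sketch(vae, obs, action, reward, done, hidden_state):
    # feed the newest transition through the recurrent encoder to refresh the posterior;
    # `done` is accepted for signature parity with the calls above but unused in this sketch
    with torch.no_grad():
        task_sample, task_mean, task_logvar, hidden_state = vae.encoder(
            action, obs, reward.reshape((1, 1)), hidden_state, return_prior=False)
    return task_sample, task_mean, task_logvar, hidden_state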
def rollout_policy(env, learner):
    is_vae_exist = "vae" in dir(learner)

    observations = []
    actions = []
    rewards = []
    values = []
    if is_vae_exist:
        latent_samples = []
        latent_means = []
        latent_logvars = []

    obs = ptu.from_numpy(env.reset())
    obs = obs.reshape(-1, obs.shape[-1])
    observations.append(obs)
    done_rollout = False

    if is_vae_exist:
        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder.prior(batch_size=1)
        # store
        latent_samples.append(ptu.get_numpy(task_sample[0, 0]))
        latent_means.append(ptu.get_numpy(task_mean[0, 0]))
        latent_logvars.append(ptu.get_numpy(task_logvar[0, 0]))

    while not done_rollout:
        if is_vae_exist:
            # add distribution parameters to observation - policy is conditioned on posterior
            augmented_obs = learner.get_augmented_obs(obs=obs, task_mu=task_mean, task_std=task_logvar)
            with torch.no_grad():
                action, value = learner.agent.act(obs=augmented_obs, deterministic=True)
        else:
            action, _, _, _ = learner.agent.act(obs=obs)
            value = None  # the SAC-style agent does not return a value estimate here

        # observe reward and next obs
        next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))

        # store
        observations.append(next_obs)
        actions.append(action)
        values.append(value)
        rewards.append(reward.item())

        done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

        if is_vae_exist:
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder(
                action, next_obs, reward.reshape((1, 1)), hidden_state, return_prior=False)
            # values.append(value.item())
            latent_samples.append(ptu.get_numpy(task_sample[0]))
            latent_means.append(ptu.get_numpy(task_mean[0]))
            latent_logvars.append(ptu.get_numpy(task_logvar[0]))

        # set: obs <- next_obs
        obs = next_obs.clone()

    if is_vae_exist:
        return observations, actions, rewards, values, \
               latent_samples, latent_means, latent_logvars
    else:
        return observations, actions, rewards, values
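# Hedged usage note (not from the source): rollout_policy returns per-step Python lists; the
# latent statistics can be stacked for inspection, e.g. to see how the posterior mean evolves
# over a single rollout (construction of `env` and `learner` omitted).
#
#   observations, actions, rewards, values, latent_samples, latent_means, latent_logvars = \
#       rollout_policy(env, learner)
#   latent_means = np.stack(latent_means)   # (T + 1, latent_dim): prior plus one update per step
#   total_return = sum(rewards)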
def collect_rollouts(self, num_rollouts, random_actions=False):
    '''
    :param num_rollouts:
    :param random_actions: whether to sample actions from the policy or uniformly from the action space
    :return:
    '''
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(self.env.reset(self.task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False
        # self.policy_storage.reset_running_episode(self.task_idx)

        # if self.args.fixed_latent_params:
        #     assert 2 ** self.args.task_embedding_size >= self.args.num_tasks
        #     task_mean = ptu.FloatTensor(utl.vertices(self.args.task_embedding_size)[self.task_idx])
        #     task_logvar = -2. * ptu.ones_like(task_logvar)   # arbitrary negative enough number

        # add distribution parameters to observation - policy is conditioned on posterior
        augmented_obs = self.get_augmented_obs(obs=obs)

        while not done_rollout:
            if random_actions:
                if self.args.policy == 'dqn':
                    action = ptu.FloatTensor([[self.env.action_space.sample()]]).type(torch.long)  # Sample random action
                else:
                    action = ptu.FloatTensor([self.env.action_space.sample()])
            else:
                if self.args.policy == 'dqn':
                    action, _ = self.agent.act(obs=augmented_obs)  # DQN
                else:
                    action, _, _, _ = self.agent.act(obs=augmented_obs)  # SAC

            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # get augmented next obs
            augmented_next_obs = self.get_augmented_obs(obs=next_obs)

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = self.env.unwrapped.is_goal_state() if "is_goal_state" in dir(self.env.unwrapped) else False
            self.policy_storage.add_sample(
                task=self.task_idx,
                observation=ptu.get_numpy(augmented_obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=ptu.get_numpy(reward.squeeze(dim=0)),
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(augmented_next_obs.squeeze(dim=0)))
            if not random_actions:
                self.current_experience_storage.add_sample(
                    task=self.task_idx,
                    observation=ptu.get_numpy(augmented_obs.squeeze(dim=0)),
                    action=ptu.get_numpy(action.squeeze(dim=0)),
                    reward=ptu.get_numpy(reward.squeeze(dim=0)),
                    terminal=np.array([term], dtype=float),
                    next_observation=ptu.get_numpy(augmented_next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()
            augmented_obs = augmented_next_obs.clone()

            # update statistics
            self._n_env_steps_total += 1
            if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                # count successes
                self._successes_in_buffer += 1
        self._n_rollouts_total += 1
def collect_rollouts(self):
    self.training_mode(False)

    num_episodes = self.args.max_rollouts_per_task
    num_steps_per_episode = self.env.unwrapped._max_episode_steps
    num_tasks = self.args.num_eval_tasks
    obs_size = self.env.unwrapped.observation_space.shape[0]

    returns_per_episode = np.zeros((num_tasks, num_episodes))
    success_rate = np.zeros(num_tasks)

    rewards = np.zeros((num_tasks, self.args.trajectory_len))
    observations = np.zeros((num_tasks, self.args.trajectory_len + 1, obs_size))
    actions = np.zeros((num_tasks, self.args.trajectory_len, self.args.action_dim))
    log_probs = np.zeros((num_tasks, self.args.trajectory_len))

    for task in self.env.unwrapped.get_all_task_idx():
        obs = ptu.from_numpy(self.env.reset(task))
        obs = obs.reshape(-1, obs.shape[-1])
        step = 0

        # get prior parameters
        task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(batch_size=1)

        observations[task, step, :] = ptu.get_numpy(obs[0, :obs_size])

        for episode_idx in range(num_episodes):
            running_reward = 0.
            for step_idx in range(num_steps_per_episode):
                # add distribution parameters to observation - policy is conditioned on posterior
                augmented_obs = self.get_augmented_obs(obs, task_mean, task_logvar)
                action, _, _, log_prob = self.agent.act(
                    obs=augmented_obs,
                    deterministic=self.args.eval_deterministic,
                    return_log_prob=True)

                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                running_reward += reward.item()

                # update encoding
                task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                    obs=next_obs,
                    action=action,
                    reward=reward,
                    done=done,
                    hidden_state=hidden_state)

                rewards[task, step] = reward.item()
                # reward_preds[task, step] = ptu.get_numpy(
                #     self.vae.reward_decoder(task_sample, next_obs, obs, action)[0, 0])

                observations[task, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                actions[task, step, :] = ptu.get_numpy(action[0, :])
                log_probs[task, step] = ptu.get_numpy(log_prob[0])

                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                    success_rate[task] = 1.

                # set: obs <- next_obs
                obs = next_obs.clone()
                step += 1

            returns_per_episode[task, episode_idx] = running_reward

    return returns_per_episode, success_rate, log_probs, observations, rewards, actions
def torch_ify(np_array_or_other):
    if isinstance(np_array_or_other, np.ndarray):
        return ptu.from_numpy(np_array_or_other)
    else:
        return np_array_or_other
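# Hedged usage sketch (not from the source): torch_ify converts numpy arrays via
# ptu.from_numpy and passes anything else (e.g. an existing torch tensor) through unchanged.
# Assumes numpy (np) and torch are in scope; underscore-prefixed names are illustrative only.
_arr_as_tensor = torch_ify(np.ones(3))
assert torch.is_tensor(_arr_as_tensor)
assert torch_ify(_arr_as_tensor) is _arr_as_tensor  # non-numpy inputs are returned as-is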
def set_param_values_np(self, param_values):
    torch_dict = OrderedDict()
    for key, tensor in param_values.items():
        torch_dict[key] = ptu.from_numpy(tensor)
    self.load_state_dict(torch_dict)
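# Hedged companion sketch (not from the source): the inverse of set_param_values_np,
# exporting the module's parameters as numpy arrays. This helper is hypothetical; the real
# repo may already provide an equivalent.
def get_param_values_np(self):
    np_dict = OrderedDict()
    for key, tensor in self.state_dict().items():
        np_dict[key] = ptu.get_numpy(tensor)
    return np_dict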