示例#1
0
def env_step(env, action):
    # action should be of size: batch x 1
    action = ptu.get_numpy(action.squeeze(dim=-1))
    next_obs, reward, done, info = env.step(action)
    # move to torch
    next_obs = ptu.from_numpy(next_obs).view(-1, next_obs.shape[0])
    reward = ptu.FloatTensor([reward]).view(-1, 1)
    done = ptu.from_numpy(np.array(done, dtype=int)).view(-1, 1)

    return next_obs, reward, done, info
示例#2
0
def collect_rollouts_per_task(task_idx, agent, policy_storage, env,
                              num_rollouts):
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(env.reset(task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False

        while not done_rollout:
            action, _, _, _ = agent.act(obs=obs)  # SAC
            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(
                env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = env.unwrapped.is_goal_state() if "is_goal_state" in dir(
                env.unwrapped) else False
            rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
            policy_storage.add_sample(
                task=0,  #task_idx,
                observation=ptu.get_numpy(obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=rew_to_buffer,
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()
示例#3
0
文件: learner.py 项目: vitchyr/BOReL
    def evaluate(self, tasks):
        num_episodes = self.args.max_rollouts_per_task
        num_steps_per_episode = self.env.unwrapped._max_episode_steps

        returns_per_episode = np.zeros((len(tasks), num_episodes))
        success_rate = np.zeros(len(tasks))

        if self.args.policy == 'dqn':
            values = np.zeros((len(tasks), self.args.max_trajectory_len))
        else:
            obs_size = self.env.unwrapped.observation_space.shape[0]
            observations = np.zeros((len(tasks), self.args.max_trajectory_len + 1, obs_size))
            log_probs = np.zeros((len(tasks), self.args.max_trajectory_len))

        for task_idx, task in enumerate(tasks):

            obs = ptu.from_numpy(self.env.reset(task))
            obs = obs.reshape(-1, obs.shape[-1])
            step = 0

            if self.args.policy == 'sac':
                observations[task_idx, step, :] = ptu.get_numpy(obs[0, :obs_size])

            for episode_idx in range(num_episodes):
                running_reward = 0.
                for step_idx in range(num_steps_per_episode):
                    # add distribution parameters to observation - policy is conditioned on posterior
                    if self.args.policy == 'dqn':
                        action, value = self.agent.act(obs=obs, deterministic=True)
                    else:
                        action, _, _, log_prob = self.agent.act(obs=obs,
                                                                deterministic=self.args.eval_deterministic,
                                                                return_log_prob=True)
                    # observe reward and next obs
                    next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                    running_reward += reward.item()
                    if self.args.policy == 'dqn':
                        values[task_idx, step] = value.item()
                    else:
                        observations[task_idx, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                        log_probs[task_idx, step] = ptu.get_numpy(log_prob[0])

                    if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                        success_rate[task_idx] = 1.
                    # set: obs <- next_obs
                    obs = next_obs.clone()
                    step += 1

                returns_per_episode[task_idx, episode_idx] = running_reward

        if self.args.policy == 'dqn':
            return returns_per_episode, success_rate, values
        else:
            return returns_per_episode, success_rate, log_probs, observations
示例#4
0
文件: learner.py 项目: vitchyr/BOReL
    def collect_rollouts(self, num_rollouts, random_actions=False):
        '''

        :param num_rollouts:
        :param random_actions: whether to use policy to sample actions, or randomly sample action space
        :return:
        '''

        for rollout in range(num_rollouts):
            obs = ptu.from_numpy(self.env.reset(self.task_idx))
            obs = obs.reshape(-1, obs.shape[-1])
            done_rollout = False

            while not done_rollout:
                if random_actions:
                    if self.args.policy == 'dqn':
                        action = ptu.FloatTensor([[[self.env.action_space.sample()]]]).long()   # Sample random action
                    else:
                        action = ptu.FloatTensor([self.env.action_space.sample()])  # Sample random action
                else:
                    if self.args.policy == 'dqn':
                        action, _ = self.agent.act(obs=obs)   # DQN
                    else:
                        action, _, _, _ = self.agent.act(obs=obs)   # SAC
                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

                # add data to policy buffer - (s+, a, r, s'+, term)
                term = self.env.unwrapped.is_goal_state() if "is_goal_state" in dir(self.env.unwrapped) else False
                if self.args.dense_train_sparse_test:
                    rew_to_buffer = {rew_type: rew for rew_type, rew in info.items()
                                     if rew_type.startswith('reward')}
                else:
                    rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
                self.policy_storage.add_sample(task=self.task_idx,
                                               observation=ptu.get_numpy(obs.squeeze(dim=0)),
                                               action=ptu.get_numpy(action.squeeze(dim=0)),
                                               reward=rew_to_buffer,
                                               terminal=np.array([term], dtype=float),
                                               next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

                # set: obs <- next_obs
                obs = next_obs.clone()

                # update statistics
                self._n_env_steps_total += 1
                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():  # count successes
                    self._successes_in_buffer += 1
            self._n_rollouts_total += 1
示例#5
0
    def evaluate(self):
        num_episodes = self.args.max_rollouts_per_task
        num_steps_per_episode = self.env.unwrapped._max_episode_steps
        num_tasks = self.args.num_eval_tasks
        obs_size = self.env.unwrapped.observation_space.shape[0]

        returns_per_episode = np.zeros((num_tasks, num_episodes))
        success_rate = np.zeros(num_tasks)

        rewards = np.zeros((num_tasks, self.args.trajectory_len))
        reward_preds = np.zeros((num_tasks, self.args.trajectory_len))
        observations = np.zeros(
            (num_tasks, self.args.trajectory_len + 1, obs_size))
        if self.args.policy == 'sac':
            log_probs = np.zeros((num_tasks, self.args.trajectory_len))

        # This part is very specific for the Semi-Circle env
        # if self.args.env_name == 'PointRobotSparse-v0':
        #     reward_belief = np.zeros((num_tasks, self.args.trajectory_len))
        #
        #     low_x, high_x, low_y, high_y = -2., 2., -1., 2.
        #     resolution = 0.1
        #     grid_x = np.arange(low_x, high_x + resolution, resolution)
        #     grid_y = np.arange(low_y, high_y + resolution, resolution)
        #     centers_x = (grid_x[:-1] + grid_x[1:]) / 2
        #     centers_y = (grid_y[:-1] + grid_y[1:]) / 2
        #     yv, xv = np.meshgrid(centers_y, centers_x, sparse=False, indexing='ij')
        #     centers = np.vstack([xv.ravel(), yv.ravel()]).T
        #     n_grid_points = centers.shape[0]
        #     reward_belief_discretized = np.zeros((num_tasks, self.args.trajectory_len, centers.shape[0]))

        for task_loop_i, task in enumerate(
                self.env.unwrapped.get_all_eval_task_idx()):
            obs = ptu.from_numpy(self.env.reset(task))
            obs = obs.reshape(-1, obs.shape[-1])
            step = 0

            # get prior parameters
            with torch.no_grad():
                task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(
                    batch_size=1)

            observations[task_loop_i,
                         step, :] = ptu.get_numpy(obs[0, :obs_size])

            for episode_idx in range(num_episodes):
                running_reward = 0.
                for step_idx in range(num_steps_per_episode):
                    # add distribution parameters to observation - policy is conditioned on posterior
                    augmented_obs = self.get_augmented_obs(
                        obs, task_mean, task_logvar)
                    if self.args.policy == 'dqn':
                        action, value = self.agent.act(obs=augmented_obs,
                                                       deterministic=True)
                    else:
                        action, _, _, log_prob = self.agent.act(
                            obs=augmented_obs,
                            deterministic=self.args.eval_deterministic,
                            return_log_prob=True)

                    # observe reward and next obs
                    next_obs, reward, done, info = utl.env_step(
                        self.env, action.squeeze(dim=0))
                    running_reward += reward.item()
                    # done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True
                    # update encoding
                    task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                        obs=next_obs,
                        action=action,
                        reward=reward,
                        done=done,
                        hidden_state=hidden_state)
                    rewards[task_loop_i, step] = reward.item()
                    reward_preds[task_loop_i, step] = ptu.get_numpy(
                        self.vae.reward_decoder(task_sample, next_obs, obs,
                                                action)[0, 0])

                    # This part is very specific for the Semi-Circle env
                    # if self.args.env_name == 'PointRobotSparse-v0':
                    #     reward_belief[task, step] = ptu.get_numpy(
                    #         self.vae.compute_belief_reward(task_mean, task_logvar, obs, next_obs, action)[0])
                    #
                    #     reward_belief_discretized[task, step, :] = ptu.get_numpy(
                    #         self.vae.compute_belief_reward(task_mean.repeat(n_grid_points, 1),
                    #                                        task_logvar.repeat(n_grid_points, 1),
                    #                                        None,
                    #                                        torch.cat((ptu.FloatTensor(centers),
                    #                                                   ptu.zeros(centers.shape[0], 1)), dim=-1).unsqueeze(0),
                    #                                        None)[:, 0])

                    observations[task_loop_i, step + 1, :] = ptu.get_numpy(
                        next_obs[0, :obs_size])
                    if self.args.policy != 'dqn':
                        log_probs[task_loop_i,
                                  step] = ptu.get_numpy(log_prob[0])

                    if "is_goal_state" in dir(
                            self.env.unwrapped
                    ) and self.env.unwrapped.is_goal_state():
                        success_rate[task_loop_i] = 1.
                    # set: obs <- next_obs
                    obs = next_obs.clone()
                    step += 1

                returns_per_episode[task_loop_i, episode_idx] = running_reward

        if self.args.policy == 'dqn':
            return returns_per_episode, success_rate, observations, rewards, reward_preds
        # This part is very specific for the Semi-Circle env
        # elif self.args.env_name == 'PointRobotSparse-v0':
        #     return returns_per_episode, success_rate, log_probs, observations, \
        #            rewards, reward_preds, reward_belief, reward_belief_discretized, centers
        else:
            return returns_per_episode, success_rate, log_probs, observations, rewards, reward_preds
示例#6
0
def rollout_policy(env, learner):
    is_vae_exist = "vae" in dir(learner)

    observations = []
    actions = []
    rewards = []
    values = []
    if is_vae_exist:
        latent_samples = []
        latent_means = []
        latent_logvars = []

    obs = ptu.from_numpy(env.reset())
    obs = obs.reshape(-1, obs.shape[-1])
    observations.append(obs)
    done_rollout = False
    if is_vae_exist:
        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder.prior(
                batch_size=1)
        # store
        latent_samples.append(ptu.get_numpy(task_sample[0, 0]))
        latent_means.append(ptu.get_numpy(task_mean[0, 0]))
        latent_logvars.append(ptu.get_numpy(task_logvar[0, 0]))

    while not done_rollout:
        if is_vae_exist:
            # add distribution parameters to observation - policy is conditioned on posterior
            augmented_obs = learner.get_augmented_obs(obs=obs,
                                                      task_mu=task_mean,
                                                      task_std=task_logvar)
            with torch.no_grad():
                action, value = learner.agent.act(obs=augmented_obs,
                                                  deterministic=True)
        else:
            action, _, _, _ = learner.agent.act(obs=obs)

        # observe reward and next obs
        next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
        # store
        observations.append(next_obs)
        actions.append(action)
        values.append(value)
        rewards.append(reward.item())
        done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

        if is_vae_exist:
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder(
                action,
                next_obs,
                reward.reshape((1, 1)),
                hidden_state,
                return_prior=False)

            # values.append(value.item())
            latent_samples.append(ptu.get_numpy(task_sample[0]))
            latent_means.append(ptu.get_numpy(task_mean[0]))
            latent_logvars.append(ptu.get_numpy(task_logvar[0]))
        # set: obs <- next_obs
        obs = next_obs.clone()
    if is_vae_exist:
        return observations, actions, rewards, values, \
               latent_samples, latent_means, latent_logvars
    else:
        return observations, actions, rewards, values
示例#7
0
    def collect_rollouts(self, num_rollouts, random_actions=False):
        '''

        :param num_rollouts:
        :param random_actions: whether to use policy to sample actions, or randomly sample action space
        :return:
        '''

        for rollout in range(num_rollouts):
            obs = ptu.from_numpy(self.env.reset(self.task_idx))
            obs = obs.reshape(-1, obs.shape[-1])
            done_rollout = False
            # self.policy_storage.reset_running_episode(self.task_idx)

            # if self.args.fixed_latent_params:
            #     assert 2 ** self.args.task_embedding_size >= self.args.num_tasks
            #     task_mean = ptu.FloatTensor(utl.vertices(self.args.task_embedding_size)[self.task_idx])
            #     task_logvar = -2. * ptu.ones_like(task_logvar)   # arbitrary negative enough number
            # add distribution parameters to observation - policy is conditioned on posterior
            augmented_obs = self.get_augmented_obs(obs=obs)

            while not done_rollout:
                if random_actions:
                    if self.args.policy == 'dqn':
                        action = ptu.FloatTensor([[
                            self.env.action_space.sample()
                        ]]).type(torch.long)  # Sample random action
                    else:
                        action = ptu.FloatTensor(
                            [self.env.action_space.sample()])
                else:
                    if self.args.policy == 'dqn':
                        action, _ = self.agent.act(obs=augmented_obs)  # DQN
                    else:
                        action, _, _, _ = self.agent.act(
                            obs=augmented_obs)  # SAC
                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(
                    self.env, action.squeeze(dim=0))
                done_rollout = False if ptu.get_numpy(
                    done[0][0]) == 0. else True

                # get augmented next obs
                augmented_next_obs = self.get_augmented_obs(obs=next_obs)

                # add data to policy buffer - (s+, a, r, s'+, term)
                term = self.env.unwrapped.is_goal_state(
                ) if "is_goal_state" in dir(self.env.unwrapped) else False
                self.policy_storage.add_sample(
                    task=self.task_idx,
                    observation=ptu.get_numpy(augmented_obs.squeeze(dim=0)),
                    action=ptu.get_numpy(action.squeeze(dim=0)),
                    reward=ptu.get_numpy(reward.squeeze(dim=0)),
                    terminal=np.array([term], dtype=float),
                    next_observation=ptu.get_numpy(
                        augmented_next_obs.squeeze(dim=0)))
                if not random_actions:
                    self.current_experience_storage.add_sample(
                        task=self.task_idx,
                        observation=ptu.get_numpy(
                            augmented_obs.squeeze(dim=0)),
                        action=ptu.get_numpy(action.squeeze(dim=0)),
                        reward=ptu.get_numpy(reward.squeeze(dim=0)),
                        terminal=np.array([term], dtype=float),
                        next_observation=ptu.get_numpy(
                            augmented_next_obs.squeeze(dim=0)))

                # set: obs <- next_obs
                obs = next_obs.clone()
                augmented_obs = augmented_next_obs.clone()

                # update statistics
                self._n_env_steps_total += 1
                if "is_goal_state" in dir(
                        self.env.unwrapped
                ) and self.env.unwrapped.is_goal_state():  # count successes
                    self._successes_in_buffer += 1
            self._n_rollouts_total += 1
示例#8
0
    def collect_rollouts(self):
        self.training_mode(False)
        num_episodes = self.args.max_rollouts_per_task
        num_steps_per_episode = self.env.unwrapped._max_episode_steps
        num_tasks = self.args.num_eval_tasks
        obs_size = self.env.unwrapped.observation_space.shape[0]

        returns_per_episode = np.zeros((num_tasks, num_episodes))
        success_rate = np.zeros(num_tasks)

        rewards = np.zeros((num_tasks, self.args.trajectory_len))
        observations = np.zeros(
            (num_tasks, self.args.trajectory_len + 1, obs_size))
        actions = np.zeros(
            (num_tasks, self.args.trajectory_len, self.args.action_dim))

        log_probs = np.zeros((num_tasks, self.args.trajectory_len))

        for task in self.env.unwrapped.get_all_task_idx():
            obs = ptu.from_numpy(self.env.reset(task))
            obs = obs.reshape(-1, obs.shape[-1])
            step = 0

            # get prior parameters
            task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(
                batch_size=1)

            observations[task, step, :] = ptu.get_numpy(obs[0, :obs_size])

            for episode_idx in range(num_episodes):
                running_reward = 0.
                for step_idx in range(num_steps_per_episode):
                    # add distribution parameters to observation - policy is conditioned on posterior
                    augmented_obs = self.get_augmented_obs(
                        obs, task_mean, task_logvar)
                    action, _, _, log_prob = self.agent.act(
                        obs=augmented_obs,
                        deterministic=self.args.eval_deterministic,
                        return_log_prob=True)

                    # observe reward and next obs
                    next_obs, reward, done, info = utl.env_step(
                        self.env, action.squeeze(dim=0))
                    running_reward += reward.item()

                    # update encoding
                    task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                        obs=next_obs,
                        action=action,
                        reward=reward,
                        done=done,
                        hidden_state=hidden_state)
                    rewards[task, step] = reward.item()
                    #reward_preds[task, step] = ptu.get_numpy(
                    #    self.vae.reward_decoder(task_sample, next_obs, obs, action)[0, 0])

                    observations[task, step + 1, :] = ptu.get_numpy(
                        next_obs[0, :obs_size])
                    actions[task, step, :] = ptu.get_numpy(action[0, :])
                    log_probs[task, step] = ptu.get_numpy(log_prob[0])

                    if "is_goal_state" in dir(
                            self.env.unwrapped
                    ) and self.env.unwrapped.is_goal_state():
                        success_rate[task] = 1.
                    # set: obs <- next_obs
                    obs = next_obs.clone()
                    step += 1

                returns_per_episode[task, episode_idx] = running_reward

        return returns_per_episode, success_rate, log_probs, observations, rewards, actions
示例#9
0
def torch_ify(np_array_or_other):
    if isinstance(np_array_or_other, np.ndarray):
        return ptu.from_numpy(np_array_or_other)
    else:
        return np_array_or_other
示例#10
0
 def set_param_values_np(self, param_values):
     torch_dict = OrderedDict()
     for key, tensor in param_values.items():
         torch_dict[key] = ptu.from_numpy(tensor)
     self.load_state_dict(torch_dict)