Example #1
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False,
            deterministic=False):
    """ Generate a sample from a policy.

    Args:
        deterministic (bool): Boolean variable indicating whether a
        stochastic or deterministic action should be taken during the
        rollout. This is False (stochastic actions) by default.

    Returns:

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return None

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
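
The dict returned above stacks the per-step lists along the first axis. As a usage note, here is a minimal, self-contained sketch of consuming such a path dict to compute a discounted return; it assumes only numpy and a path whose 'rewards' entry is a 1-D array (which is what stacking scalar rewards produces):

import numpy as np

def discounted_return(path, discount=0.99):
    # path['rewards'] is the stacked 1-D reward array from the rollout above.
    rewards = np.asarray(path['rewards'], dtype=np.float64)
    discounts = discount ** np.arange(len(rewards))
    return float(np.sum(discounts * rewards))

# Hand-built path of the same shape, purely for illustration:
print(discounted_return({'rewards': np.array([1.0, 1.0, 1.0])}, discount=0.5))  # 1.75
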
Example #2
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # a = agent_info["mean"]
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        # if d:
        #     break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return None

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
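
This variant comments out both the deterministic branch and the early break on done, so every path runs for the full max_path_length. Both variants rely on the same duck-typed agent interface: reset() plus get_action(obs) returning an action and an info dict, with a 'mean' entry for the deterministic case. A toy agent satisfying that contract, purely as an illustration (not part of the project above):

import numpy as np

class ToyGaussianAgent:
    """Illustrative agent exposing the interface rollout() expects."""

    def __init__(self, action_dim, seed=0):
        self._rng = np.random.default_rng(seed)
        self._action_dim = action_dim

    def reset(self):
        pass  # no recurrent state to clear in this toy example

    def get_action(self, observation):
        mean = np.zeros(self._action_dim)
        action = mean + 0.1 * self._rng.standard_normal(self._action_dim)
        # rollout(..., deterministic=True) reads agent_info['mean'] instead
        # of using the sampled action.
        return action, dict(mean=mean)
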
Example #3
    def _process_trajectory(self, result):
        """Collect trajectory from ray object store.

        Converts that trajectory to garage friendly format.

        Args:
            - result: ray object id of ready to be collected trajectory.
        """
        trajectory = ray.get(result)
        ready_worker_id = trajectory[0]
        self._active_worker_ids.remove(ready_worker_id)
        self._idle_worker_ids.append(ready_worker_id)
        trajectory = dict(observations=np.asarray(trajectory[1]),
                          actions=np.asarray(trajectory[2]),
                          rewards=tensor_utils.stack_tensor_list(
                              trajectory[3]),
                          agent_infos=trajectory[4],
                          env_infos=trajectory[5])
        num_returned_samples = len(trajectory['observations'])
        return trajectory, num_returned_samples
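
For context, `result` is a Ray object ref whose value is a tuple (worker_id, observations, actions, rewards, agent_infos, env_infos). A hedged sketch of the driver loop such a method typically sits in, assuming Ray is installed; `self._active_workers` and `collect` are hypothetical names, not taken from the example:

import ray

def collect(self, num_samples):
    # Hypothetical driver loop: block until at least one worker's rollout is
    # ready, then hand each ready object ref to _process_trajectory.
    paths, collected = [], 0
    while collected < num_samples:
        ready, self._active_workers = ray.wait(self._active_workers,
                                               num_returns=1)
        for ref in ready:
            trajectory, n_samples = self._process_trajectory(ref)
            paths.append(trajectory)
            collected += n_samples
    return paths
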
Example #4
    def obtain_samples(self, itr):
        """
        Collect samples for the given iteration number.

        :param itr: Iteration number.
        :return: A list of paths.
        """
        paths = []
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        n_samples = 0
        batch_samples = self.vec_env.num_envs * self.algo.max_path_length

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()

        while n_samples < batch_samples:
            policy.reset(dones)
            if self.algo.input_include_goal:
                obs = [obs["observation"] for obs in obses]
                d_g = [obs["desired_goal"] for obs in obses]
                a_g = [obs["achieved_goal"] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    input_obses, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    input_obses)

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transition(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs["observation"] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs["achieved_goal"] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transition(
                    observation=obses,
                    action=actions,
                    reward=rewards * self.algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                    )
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)

                if done:
                    paths.append(
                        dict(rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]["rewards"]),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["env_infos"])))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

                    if self.algo.es:
                        self.algo.es.reset()
            obses = next_obses

        return paths
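
When input_include_goal is set, the vectorized env returns GoalEnv-style dict observations, and the policy input is the concatenation of 'observation' and 'desired_goal'. A tiny self-contained illustration of that concatenation (the shapes are made up):

import numpy as np

# Illustrative GoalEnv-style observations from two vectorized envs.
obses = [
    dict(observation=np.array([0.1, 0.2]),
         desired_goal=np.array([1.0]),
         achieved_goal=np.array([0.0])),
    dict(observation=np.array([0.3, 0.4]),
         desired_goal=np.array([2.0]),
         achieved_goal=np.array([0.1])),
]
obs = [o['observation'] for o in obses]
d_g = [o['desired_goal'] for o in obses]
input_obses = np.concatenate((obs, d_g), axis=-1)
print(input_obses.shape)  # (2, 3): per-env observation plus desired goal
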
Example #5
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Obtain samples."""
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]['observations']),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]['actions']),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
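
When whole_paths is False, the result is trimmed so the total number of timesteps does not exceed batch_size. The project's truncate_paths is not shown here; the following is an illustrative reimplementation under that assumption, handling both array-valued and dict-valued (env_infos/agent_infos) entries:

def truncate_paths_sketch(paths, max_samples):
    # Keep whole paths while the budget allows, then clip the last one.
    out, total = [], 0
    for path in paths:
        n = len(path['rewards'])
        if total + n <= max_samples:
            out.append(path)
            total += n
        else:
            keep = max_samples - total
            if keep > 0:
                clipped = {}
                for key, value in path.items():
                    if isinstance(value, dict):
                        clipped[key] = {k: v[:keep] for k, v in value.items()}
                    else:
                        clipped[key] = value[:keep]
                out.append(clipped)
            break
    return out
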
Example #6
    def obtain_samples(self, itr, batch_size):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.

        Returns:
            list: A list of paths.
        """
        paths = []
        if not self.no_reset or self._last_obses is None:
            obses = self.vec_env.reset()
        else:
            obses = self._last_obses
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()

        while n_samples < batch_size:
            policy.reset(dones)
            if self.algo.input_include_goal:
                obs = [obs['observation'] for obs in obses]
                d_g = [obs['desired_goal'] for obs in obses]
                a_g = [obs['achieved_goal'] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, input_obses, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    input_obses)

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transitions(
                    observation=obses,
                    action=actions,
                    reward=rewards * self.algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]['rewards']),
                            dones=tensor_utils.stack_tensor_list(
                                running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]['success_count']))
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
            obses = next_obses

        return paths
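
The paths produced here carry running_length, undiscounted_return and success_count, with partial-path statistics carried over between batches. A small sketch of summarizing them for logging; only the keys built above are used:

import numpy as np

def summarize_paths(paths):
    # Aggregate the bookkeeping fields attached to each path above.
    returns = [p['undiscounted_return'] for p in paths]
    lengths = [p['running_length'] for p in paths]
    successes = [p['success_count'] > 0 for p in paths]
    return dict(mean_return=float(np.mean(returns)),
                mean_length=float(np.mean(lengths)),
                success_rate=float(np.mean(successes)))
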
Example #7
def rollout(env,
            agent,
            *,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            deterministic=False):
    """Sample a single rollout of the agent in the environment.

    Args:
        agent(Policy): Agent used to select actions.
        env(gym.Env): Environment to perform actions in.
        max_path_length(int): If the rollout reaches this many timesteps, it is
            terminated.
        animated(bool): If true, render the environment after each step.
        speedup(float): Factor by which to decrease the wait time between
            rendered steps. Only relevant if animated is True.
        deterministic (bool): If true, use the mean action returned by the
            stochastic policy instead of sampling from the returned action
            distribution.

    Returns:
        dict: Dictionary with the following keys:
            observations(np.array): Flattened array of observations.
            actions(np.array): Flattened array of actions.
            rewards(np.array): Array of rewards of shape (timesteps,).
            agent_infos(dict[str, np.array]): Dictionary of stacked
                `agent_info`s.
            env_infos(dict[str, np.array]): Dictionary of stacked
                `env_info`s.

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
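
A hedged usage sketch for deterministic evaluation with this rollout; env and policy here stand for whatever objects the surrounding project constructs, assumed only to satisfy the interfaces rollout() actually calls (reset/step/render, space.flatten, and policy.reset/get_action with a 'mean' entry in agent_info):

def evaluate_mean_policy(env, policy, n_eval_paths=10, max_path_length=200):
    # Average undiscounted return when acting with the distribution mean.
    returns = []
    for _ in range(n_eval_paths):
        path = rollout(env, policy,
                       max_path_length=max_path_length,
                       deterministic=True)
        returns.append(path['rewards'].sum())
    return sum(returns) / len(returns)
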
Example #8
    def obtain_samples_for_evaluation(self, num_paths=20):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.

        Returns:
            list: A list of paths.

        """
        paths = []

        policy = self.algo.policy

        for i in range(num_paths):
            obses = self.evaluate_env.reset()
            #print(obses)

            dones = np.asarray([True] * self.evaluate_env.num_envs)
            running_paths = [None] * self.evaluate_env.num_envs
            policy.reset(dones)
            end_of_path = False

            for j in range(500):
                input_obses = obses
                obs_normalized = tensor_utils.normalize_pixel_batch(
                    self.env_spec, input_obses)
                obses = obs_normalized

                actions = self.algo.policy.get_actions(obs_normalized)
                if len(actions) > 1:
                    actions = actions[0]
                agent_infos = None

                next_obses, rewards, dones, env_infos = self.evaluate_env.step(
                    actions)
                original_next_obses = next_obses
                next_obses = tensor_utils.normalize_pixel_batch(
                    self.env_spec, next_obses)

                env_infos = tensor_utils.split_tensor_dict_list(env_infos)

                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self.evaluate_env.num_envs)
                    ]
                if env_infos is None:
                    env_infos = [
                        dict() for _ in range(self.evaluate_env.num_envs)
                    ]

                for idx, reward, env_info, done in zip(itertools.count(),
                                                       rewards, env_infos,
                                                       dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            rewards=[],
                            env_infos=[],
                            dones=[],
                            undiscounted_return=0,
                            # running_length: Length of path up to now
                            # Note that running_length is not len(rewards)
                            # Because a path may not be complete in one batch
                            running_length=0,
                            success_count=0)

                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['dones'].append(done)
                    running_paths[idx]['running_length'] += 1
                    running_paths[idx]['undiscounted_return'] += reward
                    running_paths[idx]['success_count'] += env_info.get(
                        'is_success') or 0

                    if done or j == 499:
                        paths.append(
                            dict(rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]['rewards']),
                                 dones=tensor_utils.stack_tensor_list(
                                     running_paths[idx]['dones']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 running_length=running_paths[idx]
                                 ['running_length'],
                                 undiscounted_return=running_paths[idx]
                                 ['undiscounted_return'],
                                 success_count=running_paths[idx]
                                 ['success_count']))
                        running_paths[idx] = None

                        end_of_path = True
                if end_of_path:
                    break
                obses = original_next_obses
        #print(paths)
        return paths
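
tensor_utils.normalize_pixel_batch is not shown in these examples; it presumably rescales uint8 image observations to floats in [0, 1]. An illustrative stand-in under that assumption (the real helper also receives the env spec):

import numpy as np

def normalize_pixels_sketch(observations):
    # Illustrative stand-in: scale uint8 pixel observations into [0, 1].
    obs = np.asarray(observations)
    if obs.dtype == np.uint8:
        return obs.astype(np.float32) / 255.0
    return obs
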
Example #9
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, next_obses,
                    env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        next_observations=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["next_observations"].append(
                    next_observation)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]["observations"]),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]["actions"]),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]["rewards"]),
                             next_observation=tensor_utils.stack_tensor_list(
                                 running_paths[idx]["next_observations"]),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["env_infos"]),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["agent_infos"])))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
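
Unlike the earlier variants, these paths also keep next_observation arrays, so they can be flattened into transition tuples for off-policy use. A short illustrative helper over the keys built above:

import numpy as np

def paths_to_transitions(paths):
    # Merge paths into flat (observation, action, reward, next_observation)
    # arrays; key names match the path dicts built above.
    observations = np.concatenate([p['observations'] for p in paths])
    actions = np.concatenate([p['actions'] for p in paths])
    rewards = np.concatenate([p['rewards'] for p in paths])
    next_observations = np.concatenate([p['next_observation'] for p in paths])
    return observations, actions, rewards, next_observations
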
Example #10
def skill_rollout(env,
                  agent,
                  max_path_length=np.inf,
                  skill_stopping_func=None,
                  reset_start_rollout=True,
                  keep_rendered_rgbs=False,
                  animated=False,
                  speedup=1
                  ):
    """
    Perform one rollout in given environment.
    Code adopted from https://github.com/florensacc/snn4hrl
    :param env: AsaEnv environment to run in
    :param agent: Policy to sample actions from
    :param max_path_length: force terminate the rollout after this many steps
    :param skill_stopping_func: function ({actions, observations} -> bool) that indicates that skill execution is done
    :param reset_start_rollout: whether to reset the env when calling this function
    :param keep_rendered_rgbs: whether to keep a list of all rgb_arrays (for future video making)
    :param animated: whether to render env after each step
    :param speedup: speedup factor for animation
    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    terminated = []
    rendered_rgbs = []
    if reset_start_rollout:
        o = env.reset()
    else:
        o = AsaEnv.get_current_obs_wrapped(env)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    if keep_rendered_rgbs:  # will return a new entry to the path dict with all rendered images
        rendered_rgbs.append(env.render(mode='rgb_array'))
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        # natural termination
        if d:
            terminated.append(1)
            break
        terminated.append(0)
        # skill decides to terminate: build the partial path so far and ask
        # the stopping function whether the skill is done
        path_dict = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),  # here it concatenates all lower-level paths!
        )
        if skill_stopping_func and skill_stopping_func(path_dict):
            break

        o = next_o
        if keep_rendered_rgbs:  # will return a new entry to the path dict with all rendered images
            rendered_rgbs.append(env.render(mode='rgb_array'))
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    # This is disabled: if this is an inner rollout, closing here would also close the outer rollout's renderer!
    # if animated:
    #     env.render(close=True)

    path_dict = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),  # here it concatenates all lower-level paths!
        # terminated indicates whether the rollout ended with a done signal or simply hit the step limit: important
        # when BOTH happen at the same time, so we can still tell a done occurred (for hierarchized envs)
        terminated=tensor_utils.stack_tensor_list(terminated),
    )
    if keep_rendered_rgbs:
        path_dict['rendered_rgbs'] = tensor_utils.stack_tensor_list(rendered_rgbs)

    return path_dict
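
A skill_stopping_func receives the partial path dict built so far (stacked observations and actions) and returns True to end the skill early. An illustrative stopping function; treating observation index 0 as a meaningful coordinate is just an assumption for the example:

def stop_when_coordinate_exceeds(threshold=1.0, index=0):
    # Returns a skill_stopping_func usable with skill_rollout above.
    def stopping_func(path):
        observations = path['observations']
        return len(observations) > 0 and observations[-1][index] > threshold
    return stopping_func

# Usage sketch:
# path = skill_rollout(env, skill_policy, max_path_length=50,
#                      skill_stopping_func=stop_when_coordinate_exceeds(0.5))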