Example #1
    def get_path(self, params):
        """
        Sample a single path in the environment accoring to the policy parameters
        that are passed.
        returns:
            A path consists of a dictionary of observations, actions, etc.
        """

        policy = self.policy
        env = self.env
        env_spec = self.env.spec

        obs = env.reset()
        policy.reset()

        policy.set_param_values(params, trainable=True)

        path = dict(observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[])

        for t in range(self.max_path_length):

            action, agent_info = policy.get_action(obs)
            if self.deterministic:
                action = agent_info['mean']

            next_obs, reward, done, env_info = self.env.step(action)

            if agent_info is None:
                agent_info = dict()
            if env_info is None:
                env_info = dict()
            path['observations'].append(obs)
            path['actions'].append(action)
            path['rewards'].append(reward)
            path['env_infos'].append(env_info)
            path['agent_infos'].append(agent_info)

            if done:
                break

            obs = next_obs

        path['observations'] = env_spec.observation_space.flatten_n(
            path['observations'])
        path['actions'] = env_spec.action_space.flatten_n(path['actions'])
        path['rewards'] = tensor_utils.stack_tensor_list(path['rewards'])
        path['env_infos'] = tensor_utils.stack_tensor_dict_list(
            path['env_infos'])
        path['agent_infos'] = tensor_utils.stack_tensor_dict_list(
            path['agent_infos'])
        path['policy_params'] = policy.get_param_values(trainable=True)

        return path
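The per-step lists collected above are turned into arrays before the path is returned. Below is a minimal numpy sketch, assuming stack_tensor_list behaves like stacking a list into an array and stack_tensor_dict_list stacks each key of a list of info dicts (nested dicts are ignored here); the *_sketch helpers are illustrative, not garage's implementations.

# Minimal numpy sketch (not garage's actual implementation) of the stacking
# step above: per-timestep lists become arrays, and a list of per-timestep
# info dicts becomes a dict of stacked arrays.
import numpy as np

def stack_tensor_list_sketch(tensor_list):
    # [r_0, r_1, ..., r_{T-1}] -> array of shape (T, ...)
    return np.asarray(tensor_list)

def stack_tensor_dict_list_sketch(dict_list):
    # [{'mean': m_0}, {'mean': m_1}, ...] -> {'mean': array([m_0, m_1, ...])}
    keys = dict_list[0].keys() if dict_list else []
    return {k: np.asarray([d[k] for d in dict_list]) for k in keys}

rewards = [1.0, 0.5, 0.0]
agent_infos = [{'mean': np.zeros(2)}, {'mean': np.ones(2)}, {'mean': np.ones(2)}]
print(stack_tensor_list_sketch(rewards).shape)                    # (3,)
print(stack_tensor_dict_list_sketch(agent_infos)['mean'].shape)   # (3, 2)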
Example #2
    def step(self, action_n):
        results = singleton_pool.run_each(
            worker_run_step,
            [(action_n, self.scope) for _ in self._alloc_env_ids],
        )
        results = [x for x in results if x is not None]
        ids, obs, rewards, dones, env_infos = list(zip(*results))
        ids = np.concatenate(ids)
        obs = self.observation_space.unflatten_n(np.concatenate(obs))
        rewards = np.concatenate(rewards)
        dones = np.concatenate(dones)
        env_infos = tensor_utils.split_tensor_dict_list(
            tensor_utils.concat_tensor_dict_list(env_infos))
        if env_infos is None:
            env_infos = [dict() for _ in range(self.num_envs)]

        items = list(zip(ids, obs, rewards, dones, env_infos))
        items = sorted(items, key=lambda x: x[0])

        ids, obs, rewards, dones, env_infos = list(zip(*items))

        obs = list(obs)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

        self.ts += 1
        dones[self.ts >= self.max_path_length] = True

        reset_obs = self._run_reset(dones)
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = reset_obs[i]
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
            list(env_infos))
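The concat/split round trip on env_infos above merges the per-worker dict-of-arrays results and then splits them back into one info dict per environment so they can be re-sorted by env id. A hedged numpy sketch of that assumed behavior (the *_sketch helpers are illustrative, not the garage tensor_utils functions):

import numpy as np

def concat_tensor_dict_list_sketch(dict_list):
    # list of dict-of-arrays -> single dict with arrays concatenated per key
    return {k: np.concatenate([d[k] for d in dict_list]) for k in dict_list[0]}

def split_tensor_dict_list_sketch(tensor_dict):
    # dict-of-arrays -> one dict per row, e.g. one info dict per environment
    keys = list(tensor_dict.keys())
    n = len(tensor_dict[keys[0]])
    return [{k: tensor_dict[k][i] for k in keys} for i in range(n)]

worker_infos = [{'success': np.array([0, 1])}, {'success': np.array([1])}]
merged = concat_tensor_dict_list_sketch(worker_infos)   # {'success': array([0, 1, 1])}
per_env = split_tensor_dict_list_sketch(merged)         # one dict per env, ready to re-sort by id
print(merged, per_env)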
Example #3
    def step(self, action_n):
        self.ts += 1

        ast_action_n = action_n
        os = [
            np.reshape(env.get_observation(), env.observation_space.shape)
            for env in self.envs
        ]
        action_n, action_info_n = self.sut.get_actions(os)
        if "mean" in action_info_n:
            action_n = action_info_n["mean"]
        elif "prob" in action_info_n:
            action_n = np.argmax(action_info_n["prob"], axis=1)
        if self.sut.recurrent:
            self.sut.prev_actions = self.sut.action_space.flatten_n(action_n)
        # action = self.env.action_space.sample()
        results = [
            env.ast_step(action, ast_action)
            for (action, ast_action,
                 env) in zip(action_n, ast_action_n, self.envs)
        ]
        if self.open_loop:
            obs = [self._init_state for env in self.envs]
        else:
            obs = [
                np.reshape(ob, env.ast_observation_space.shape)
                for (ob, env) in zip(list(zip(*results))[0], self.envs)
            ]

        obs = np.asarray(obs)
        dones = list(zip(*results))[2]
        dones = np.asarray(dones)
        if self.max_path_length is not None:
            dones[self.ts >= self.max_path_length] = True

        infos = [env.ast_get_reward_info() for env in self.envs]
        for (i, info) in enumerate(infos):
            info['is_terminal'] = dones[i]
        rewards = [
            self.reward_function.give_reward(action=action, info=info)
            for (action, info) in zip(ast_action_n, infos)
        ]
        env_infos = infos

        rewards = np.asarray(rewards)

        for (i, done) in enumerate(dones):
            if done:
                if self._fixed_init_state:
                    obs[i] = self.envs[i].ast_reset(self._init_state)[0]
                else:
                    obs[i] = self.envs[i].ast_reset(
                        self.observation_space.sample())[0]
                self.ts[i] = 0
        self.sut.reset(dones)
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
            env_infos)
Example #4
    def step(self, action_n):
        all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
        obs, rewards, dones, env_infos = list(
            map(list, list(zip(*all_results))))
        dones = np.asarray(dones)
        rewards = np.asarray(rewards)
        self.ts += 1
        if self.max_path_length is not None:
            dones[self.ts >= self.max_path_length] = True
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = self.envs[i].reset()
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
            env_infos)
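The time-limit bookkeeping in this step is the same pattern used by the other vectorized samplers: per-env step counters force done once max_path_length is reached, and counters are reset for environments that get reset. A standalone numpy illustration (all values made up):

import numpy as np

max_path_length = 3
ts = np.array([2, 0, 2])                 # per-env step counts before this step
dones = np.array([False, False, True])   # dones reported by the envs

ts += 1
dones[ts >= max_path_length] = True      # env 0 hits the horizon, env 2 was already done
ts[dones] = 0                            # reset counters for envs that will be reset
print(dones, ts)                         # [ True False  True] [0 1 0]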
Example #5
def worker_run_step(G, action_n, scope):
    assert hasattr(G, 'parallel_vec_envs')
    assert scope in G.parallel_vec_envs
    env_template = G.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in G.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if len(step_results) == 0:
        return None
    obs, rewards, dones, env_infos = list(map(list, list(zip(*step_results))))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos
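flatten_n above converts a batch of observations into flat row vectors before they are shipped back to the parent process. A hedged sketch of what it is assumed to do for a Box-like space (a Discrete space would one-hot encode instead); this is an illustration, not the gym/garage implementation:

import numpy as np

def flatten_n_box_sketch(obs_batch):
    # (n, *obs_shape) -> (n, prod(obs_shape))
    obs_batch = np.asarray(obs_batch)
    return obs_batch.reshape(obs_batch.shape[0], -1)

obs = [np.zeros((2, 3)), np.ones((2, 3))]
print(flatten_n_box_sketch(obs).shape)   # (2, 6)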
Example #6
    def process_samples(self, itr, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []
        total_steps = 0

        max_path_length = self.max_path_length

        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
            discount=self.discount)

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos'],
                    dones=path['dones']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            total_steps += len(path['rewards'])
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        lengths = np.asarray([v.sum() for v in valids])

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        self.episode_reward_mean.extend(undiscounted_returns)

        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            lengths=lengths,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
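The advantage computation above is standard GAE: one-step TD errors (deltas) are discounted-cumulative-summed with factor discount * gae_lambda. A self-contained numpy sketch, assuming discount_cumsum(x, d) returns sum_k d**k * x[t+k] for every t (values are made up):

import numpy as np

def discount_cumsum_sketch(x, discount):
    # out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

rewards = np.array([1.0, 0.0, 1.0])
baselines = np.array([0.5, 0.4, 0.6])
discount, gae_lambda = 0.99, 0.95

path_baselines = np.append(baselines, 0)   # treat V(s_T) as 0
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)
returns = discount_cumsum_sketch(rewards, discount)
print(deltas, advantages, returns)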
Example #7
File: _functions.py  Project: geyang/garage
def paths_to_tensors(paths, max_episode_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_episode_length (int): Maximum length of a single episode.
        baseline_predictions (numpy.ndarray): Predicted value of the GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with key
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = discount_cumsum(deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = discount_cumsum(path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_episode_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_episode_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_episode_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_episode_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_episode_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_episode_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data
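pad_tensor_n above is assumed to zero-pad each variable-length per-path array to max_episode_length and stack the results into a single batch tensor. An illustrative re-implementation in numpy (not the garage code):

import numpy as np

def pad_tensor_sketch(x, max_len):
    # zero-pad a single (T, ...) array to (max_len, ...)
    x = np.asarray(x)
    padded = np.zeros((max_len,) + x.shape[1:], dtype=x.dtype)
    padded[:len(x)] = x
    return padded

def pad_tensor_n_sketch(xs, max_len):
    # pad every path and stack into a (num_paths, max_len, ...) batch
    return np.stack([pad_tensor_sketch(x, max_len) for x in xs])

rewards = [np.array([1.0, 1.0]), np.array([0.5, 0.5, 0.5])]
print(pad_tensor_n_sketch(rewards, 4))
# [[1.  1.  0.  0. ]
#  [0.5 0.5 0.5 0. ]]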
Example #8
    def paths_to_tensors(self, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * tasks: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * trajectories: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * latent_infos: (dict)
                * env_infos: (dict)
                * trajectory_infos: (dict)
                * paths: (list[dict])

        """
        max_path_length = self.max_path_length

        def _extract_latent_infos(infos):
            """Extract and pack latent infos from dict.

            Args:
                infos (dict): A dict that contains latent infos with key
                    prefixed by 'latent_'.

            Returns:
                dict: A dict of latent infos.

            """
            latent_infos = dict()
            for k, v in infos.items():
                if k.startswith('latent_'):
                    latent_infos[k[7:]] = v
            return latent_infos

        for path in paths:
            path['actions'] = (self._env_spec.action_space.flatten_n(
                path['actions']))
            path['tasks'] = self.policy.task_space.flatten_n(
                path['env_infos']['task_onehot'])
            path['latents'] = path['agent_infos']['latent']
            path['latent_infos'] = _extract_latent_infos(path['agent_infos'])

            # - Calculate a forward-looking sliding window.
            # - If step_space has shape (n, d), then trajs will have shape
            #   (n, window, d)
            # - The length of the sliding window is determined by the
            #   trajectory inference spec. We smear the last few elements to
            #   preserve the time dimension.
            # - Only the observation is used for a single step.
            #   Alternatively, stacked [observation, action] vectors can be
            #   used in harder tasks.
            obs = pad_tensor(path['observations'], max_path_length)
            obs_flat = self._env_spec.observation_space.flatten_n(obs)
            steps = obs_flat
            window = self._inference.spec.input_space.shape[0]
            traj = np_tensor_utils.sliding_window(steps, window, smear=True)
            traj_flat = self._inference.spec.input_space.flatten_n(traj)
            path['trajectories'] = traj_flat

            _, traj_info = self._inference.get_latents(traj_flat)
            path['trajectory_infos'] = traj_info

        all_path_baselines = [self._baseline.predict(path) for path in paths]

        tasks = [path['tasks'] for path in paths]
        tasks = pad_tensor_n(tasks, max_path_length)

        trajectories = np.stack([path['trajectories'] for path in paths])

        latents = [path['latents'] for path in paths]
        latents = pad_tensor_n(latents, max_path_length)

        latent_infos = [path['latent_infos'] for path in paths]
        latent_infos = stack_tensor_dict_list(
            [pad_tensor_dict(p, max_path_length) for p in latent_infos])

        trajectory_infos = [path['trajectory_infos'] for path in paths]
        trajectory_infos = stack_tensor_dict_list(
            [pad_tensor_dict(p, max_path_length) for p in trajectory_infos])

        samples_data = paths_to_tensors(paths, max_path_length,
                                        all_path_baselines, self._discount,
                                        self._gae_lambda)
        samples_data['tasks'] = tasks
        samples_data['latents'] = latents
        samples_data['latent_infos'] = latent_infos
        samples_data['trajectories'] = trajectories
        samples_data['trajectory_infos'] = trajectory_infos

        return samples_data
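The forward-looking sliding window with "smearing" described in the comments above can be illustrated directly: for input of shape (n, d) it yields shape (n, window, d), repeating the last row near the end of the episode so the time dimension is preserved. This sketch is illustrative, not garage's sliding_window:

import numpy as np

def sliding_window_sketch(x, window):
    # (n, ...) -> (n, window, ...), smearing the last element past the end
    n = x.shape[0]
    out = np.empty((n, window) + x.shape[1:], dtype=x.dtype)
    for t in range(n):
        idx = np.minimum(np.arange(t, t + window), n - 1)
        out[t] = x[idx]
    return out

steps = np.arange(5, dtype=float).reshape(5, 1)   # (n, d) with n=5, d=1
trajs = sliding_window_sketch(steps, window=3)    # (5, 3, 1)
print(trajs[-1].ravel())                          # [4. 4. 4.] -> last row smeared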
Example #9
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        max_path_length = self.algo.max_path_length

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["deltas"] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            returns.append(path["returns"])

        # make all paths the same length
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        advantages = [path["advantages"] for path in paths]
        advantages = tensor_utils.pad_tensor_n(advantages, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = (np.mean(
            [path["returns"][0] for path in paths]))

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
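The entropy average above is masked by valids, which is 1 for real timesteps and 0 for padding, so dividing by np.sum(valids) gives the mean entropy over real steps only. A small numpy illustration with made-up numbers:

import numpy as np

entropy = np.array([[0.5, 0.7, 0.0],    # per-timestep policy entropy (padded)
                    [0.6, 0.0, 0.0]])
valids = np.array([[1.0, 1.0, 0.0],     # path 1 has 2 real steps
                   [1.0, 0.0, 0.0]])    # path 2 has 1 real step

ent = np.sum(entropy * valids) / np.sum(valids)
print(ent)                # (0.5 + 0.7 + 0.6) / 3 = 0.6
perplexity = np.exp(ent)  # corresponds to the 'Perplexity' record above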
Example #10
File: batch_polopt.py  Project: wyjw/garage
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = special.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = special.discount_cumsum(path['rewards'],
                                                      self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = (np.mean(
            [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        tabular.record('Iteration', itr)
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self.episode_reward_mean))
        tabular.record('NumTrajs', len(paths))
        tabular.record('Entropy', ent)
        tabular.record('Perplexity', np.exp(ent))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))

        return samples_data
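Two of the statistics logged above are easy to confuse: AverageDiscountedReturn is the mean of returns[0] (the discounted sum of rewards from t=0), while AverageReturn uses the plain undiscounted sum of rewards. A small illustration with made-up values:

import numpy as np

discount = 0.9
rewards = np.array([1.0, 1.0, 1.0])
returns = np.array([sum(discount**k * r for k, r in enumerate(rewards[t:]))
                    for t in range(len(rewards))])
print(returns[0])        # 1 + 0.9 + 0.81 = 2.71  (discounted return of the path)
print(rewards.sum())     # 3.0                    (undiscounted return of the path)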
Example #11
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Parameters
        ----------
        itr : int
            The iteration number.
        paths : list[dict]
            The collected paths from the sampler.

        Returns
        -------
        samples_data : dict
            Processed sample data with same trajectory length (padded with 0)
        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        # average_discounted_return = (np.mean(
        #     [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
        #              valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data
Example #12
def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):

    observations = []
    tasks = []
    tasks_gt = []
    latents = []
    latent_infos = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []

    # Resets
    o = env.reset()
    agent.reset()

    # Sample embedding network
    # NOTE: it is important to do this _once per rollout_, not once per
    # timestep, since we need correlated noise.
    t = env.active_task_one_hot
    task_gt = env.active_task_one_hot_gt
    z, latent_info = agent.get_latent(t)

    if animated:
        env.render()

    path_length = 0
    while path_length < max_path_length:
        #a, agent_info = agent.get_action(np.concatenate((t, o)))
        a, agent_info = agent.get_action_from_latent(z, o)
        # latent_info = agent_info["latent_info"]
        next_o, r, d, env_info = env.step(a)
        observations.append(agent.observation_space.flatten(o))
        tasks.append(t)
        tasks_gt.append(task_gt)
        # z = latent_info["mean"]
        latents.append(agent.latent_space.flatten(z))
        latent_infos.append(latent_info)
        rewards.append(r)
        actions.append(agent.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        tasks=tensor_utils.stack_tensor_list(tasks),
        tasks_gt=tensor_utils.stack_tensor_list(tasks_gt),
        latents=tensor_utils.stack_tensor_list(latents),
        latent_infos=tensor_utils.stack_tensor_dict_list(latent_infos),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
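The NOTE in rollout stresses that the latent z is sampled once per rollout rather than once per timestep, so the exploration noise is correlated across the whole path. A purely illustrative numpy comparison (this is not the embedding network):

import numpy as np

rng = np.random.default_rng(0)
T, latent_dim = 5, 2

z_per_rollout = np.repeat(rng.normal(size=(1, latent_dim)), T, axis=0)  # one draw, held fixed
z_per_step = rng.normal(size=(T, latent_dim))                           # fresh draw every step

print(np.ptp(z_per_rollout, axis=0))  # [0. 0.] -> constant over the path (correlated noise)
print(np.ptp(z_per_step, axis=0))     # nonzero -> varies every step (decorrelated noise)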
Example #13
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        max_path_length = self.algo.max_path_length
        action_space = self.algo.env.action_space
        observation_space = self.algo.env.observation_space

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["deltas"] = deltas

        # calculate trajectory tensors (TODO: probably can do this in TF)
        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            returns.append(path["returns"])

            # Calculate trajectory samples
            #
            # Pad and flatten action and observation traces
            act = tensor_utils.pad_tensor(path['actions'], max_path_length)
            obs = tensor_utils.pad_tensor(path['observations'],
                                          max_path_length)
            act_flat = action_space.flatten_n(act)
            obs_flat = observation_space.flatten_n(obs)
            # Create a time series of stacked [act, obs] vectors
            #XXX now the inference network only looks at obs vectors
            #act_obs = np.concatenate([act_flat, obs_flat], axis=1)  # TODO reactivate for harder envs?
            act_obs = obs_flat
            # act_obs = act_flat
            # Calculate a forward-looking sliding window of the stacked vectors
            #
            # If act_obs has shape (n, d), then trajs will have shape
            # (n, window, d)
            #
            # The length of the sliding window is determined by the trajectory
            # inference spec. We smear the last few elements to preserve the
            # time dimension.
            window = self.algo.inference.input_space.shape[0]
            trajs = sliding_window(act_obs, window, 1, smear=True)
            trajs_flat = self.algo.inference.input_space.flatten_n(trajs)
            path['trajectories'] = trajs_flat

            # trajectory infos
            _, traj_infos = self.algo.inference.get_latents(trajs)
            path['trajectory_infos'] = traj_infos

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        #DEBUG CPU vars ######################
        cpu_adv = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        cpu_deltas = tensor_utils.concat_tensor_list(
            [path["deltas"] for path in paths])
        cpu_act = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        cpu_obs = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        cpu_agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            cpu_adv = utils.center_advantages(cpu_adv)

        if self.algo.positive_adv:
            cpu_adv = utils.shift_advantages_to_positive(cpu_adv)
        #####################################

        # make all paths the same length
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        tasks = [path["tasks"] for path in paths]
        tasks = tensor_utils.pad_tensor_n(tasks, max_path_length)

        tasks_gt = [path['tasks_gt'] for path in paths]
        tasks_gt = tensor_utils.pad_tensor_n(tasks_gt, max_path_length)

        latents = [path['latents'] for path in paths]
        latents = tensor_utils.pad_tensor_n(latents, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        trajectories = tensor_utils.stack_tensor_list(
            [path["trajectories"] for path in paths])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        latent_infos = [path["latent_infos"] for path in paths]
        latent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in latent_infos
        ])

        trajectory_infos = [path["trajectory_infos"] for path in paths]
        trajectory_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in trajectory_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            tasks=tasks,
            latents=latents,
            trajectories=trajectories,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            latent_infos=latent_infos,
            trajectory_infos=trajectory_infos,
            env_infos=env_infos,
            paths=paths,
            cpu_adv=cpu_adv,  #DEBUG
            cpu_deltas=cpu_deltas,  #DEBUG
            cpu_obs=cpu_obs,  #DEBUG
            cpu_act=cpu_act,  #DEBUG
            cpu_agent_infos=cpu_agent_infos,  # DEBUG
        )

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
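center_adv and positive_adv above post-process the concatenated advantages with utils.center_advantages and utils.shift_advantages_to_positive. Hedged sketches of what those helpers are assumed to do (standardize to zero mean / unit std, and shift so the minimum is just above zero); illustrative re-implementations, not the library functions:

import numpy as np

def center_advantages_sketch(adv):
    # standardize advantages to roughly zero mean and unit variance
    return (adv - np.mean(adv)) / (np.std(adv) + 1e-8)

def shift_advantages_to_positive_sketch(adv):
    # shift so every advantage is strictly positive
    return adv - np.min(adv) + 1e-8

adv = np.array([-1.0, 0.0, 2.0])
print(center_advantages_sketch(adv))
print(shift_advantages_to_positive_sketch(adv))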
Example #14
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.
        (same as in batch_polopt, without entropy and tabular recording)

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with key
                * observations: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * env_infos: (dict)
                * paths: (list[dict])
                * average_return: (numpy.float64)

        """
        baselines = []
        returns = []

        max_path_length = self.max_path_length

        if self.flatten_input:
            paths = [
                dict(
                    observations=(self.env_spec.observation_space.flatten_n(
                        path['observations'])),
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]
        else:
            paths = [
                dict(
                    observations=path['observations'],
                    actions=(
                        self.env_spec.action_space.flatten_n(  # noqa: E126
                            path['actions'])),
                    rewards=path['rewards'],
                    env_infos=path['env_infos'],
                    agent_infos=path['agent_infos']) for path in paths
            ]

        if hasattr(self.baseline, 'predict_n'):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path['advantages'] = np_tensor_utils.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path['deltas'] = deltas

        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path['returns'] = np_tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
            returns.append(path['returns'])

        # make all paths the same length
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        # average_discounted_return = (np.mean(
        #     [path['returns'][0] for path in paths]))

        undiscounted_returns = [sum(path['rewards']) for path in paths]
        self.episode_reward_mean.extend(undiscounted_returns)

        # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
        #              valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            average_return=np.mean(undiscounted_returns),
        )

        return samples_data