Example #1
 def flatten(self):
     rtn = dict()
     rtn['observations'] = stack_tensor_list(self['observations'])
     rtn['actions'] = stack_tensor_list(self['actions'])
     rtn['rewards'] = stack_tensor_list(self['rewards'])
     # for each key, val in agent_infos and env_infos, stack the values and lift them to the top level of the returned dict
     for k, v in stack_tensor_dict_list(self['agent_infos']).items():
         rtn[k] = v
     for k, v in stack_tensor_dict_list(self['env_infos']).items():
         rtn[k] = v
     return rtn
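Example #1 flattens one sampled path into a single dict of stacked arrays. For reference, here is a minimal sketch of the assumed rllab-style behavior of the two helpers it uses (the `_sketch` names below are mine, not the real tensor_utils API):

import numpy as np

def stack_tensor_list_sketch(tensor_list):
    # stack a list of arrays/scalars along a new leading axis
    return np.asarray(tensor_list)

def stack_tensor_dict_list_sketch(dict_list):
    # [{'k': v_0}, {'k': v_1}, ...] -> {'k': stack([v_0, v_1, ...])},
    # recursing into nested dicts (e.g. agent_infos that contain sub-dicts)
    out = dict()
    for k in (dict_list[0] if dict_list else {}):
        vals = [d[k] for d in dict_list]
        if isinstance(vals[0], dict):
            out[k] = stack_tensor_dict_list_sketch(vals)
        else:
            out[k] = stack_tensor_list_sketch(vals)
    return out

# e.g. two timesteps of agent_infos with a 3-dim 'prob' each:
# stack_tensor_dict_list_sketch([{'prob': np.zeros(3)}, {'prob': np.ones(3)}])
# yields {'prob': <array of shape (2, 3)>}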
Example #2
    def step(self, action_n):
        results = singleton_pool.run_each(
            worker_run_step,
            [(action_n, self.scope) for _ in self._alloc_env_ids],
        )
        results = [x for x in results if x is not None]
        ids, obs, rewards, dones, env_infos = zip(*results)
        ids = np.concatenate(ids)
        obs = self.observation_space.unflatten_n(np.concatenate(obs))
        rewards = np.concatenate(rewards)
        dones = np.concatenate(dones)
        env_infos = tensor_utils.split_tensor_dict_list(
            tensor_utils.concat_tensor_dict_list(env_infos))
        if env_infos is None:
            env_infos = [dict() for _ in xrange(self.num_envs)]

        items = zip(ids, obs, rewards, dones, env_infos)
        items = sorted(items, key=lambda x: x[0])

        ids, obs, rewards, dones, env_infos = zip(*items)

        obs = list(obs)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

        self.ts += 1
        dones[self.ts >= self.max_path_length] = True

        reset_obs = self._run_reset(dones)
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = reset_obs[i]
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
            list(env_infos))
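Example #2 first concatenates the per-worker env_infos dicts into one large dict and then splits that back into one dict per environment; presumably the `if env_infos is None` branch covers the case where the workers reported nothing useful. A minimal sketch of that assumed round trip (the `_sketch` helpers are illustrative, not the real tensor_utils API):

import numpy as np

def concat_tensor_dict_list_sketch(dict_list):
    # merge per-worker dicts: {'k': (n_i, ...)} each -> {'k': (sum n_i, ...)}
    if not dict_list or not dict_list[0]:
        return None
    return {k: np.concatenate([d[k] for d in dict_list]) for k in dict_list[0]}

def split_tensor_dict_list_sketch(tensor_dict):
    # {'k': (N, ...)} -> [{'k': v[0]}, {'k': v[1]}, ...], one small dict per env
    if tensor_dict is None:
        return None
    n = len(next(iter(tensor_dict.values())))
    return [{k: v[i] for k, v in tensor_dict.items()} for i in range(n)]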
Example #3
 def eval_expert_probs(self, expert_paths, policy, insert=False):
     """
     Evaluate expert policy probability under current policy
     """
     if isinstance(policy, np.ndarray):
         return self._compute_path_probs(expert_paths, insert=insert)
     elif hasattr(policy, 'recurrent') and policy.recurrent:
         policy.reset([True] * len(expert_paths))
         expert_obs = self.extract_paths(expert_paths,
                                         keys=('observations', ),
                                         stack=True)[0]
         agent_infos = []
         for t in range(expert_obs.shape[1]):
             a, infos = policy.get_actions(expert_obs[:, t])
             agent_infos.append(infos)
         agent_infos_stack = tensor_utils.stack_tensor_dict_list(
             agent_infos)
         for key in agent_infos_stack:
             agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                   axes=[1, 0, 2])
         agent_infos_transpose = tensor_utils.split_tensor_dict_list(
             agent_infos_stack)
         for i, path in enumerate(expert_paths):
             path['agent_infos'] = agent_infos_transpose[i]
     else:
         for path in expert_paths:
             actions, agent_infos = policy.get_actions(path['observations'])
             path['agent_infos'] = agent_infos
     return self._compute_path_probs(expert_paths, insert=insert)
Example #4
    def step(self, action_n):
        results = singleton_pool.run_each(
            worker_run_step,
            [(action_n, self.scope) for _ in self._alloc_env_ids],
        )
        results = [x for x in results if x is not None]
        ids, obs, rewards, dones, env_infos = list(zip(*results))
        ids = np.concatenate(ids)
        obs = self.observation_space.unflatten_n(np.concatenate(obs))
        rewards = np.concatenate(rewards)
        dones = np.concatenate(dones)
        env_infos = tensor_utils.split_tensor_dict_list(tensor_utils.concat_tensor_dict_list(env_infos))
        if env_infos is None:
            env_infos = [dict() for _ in range(self.num_envs)]

        items = list(zip(ids, obs, rewards, dones, env_infos))
        items = sorted(items, key=lambda x: x[0])

        ids, obs, rewards, dones, env_infos = list(zip(*items))

        obs = list(obs)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones)

        self.ts += 1
        dones[self.ts >= self.max_path_length] = True

        reset_obs = self._run_reset(dones)
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = reset_obs[i]
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(list(env_infos))
Example #5
    def step(self, action_n):
        # use the model to make (predicted) steps
        prev_obs = self.current_obs
        next_obs = self.model.predict(prev_obs, action_n)

        rewards = self.unwrapped_env.reward(prev_obs, action_n, next_obs)

        if self.has_done_fn:
            dones = self.unwrapped_env.done(next_obs)
        else:
            dones = np.asarray([False for _ in range(self.n_parallel)])

        env_infos = [{} for _ in range(action_n.shape[0])]

        self.ts += 1
        if self.max_path_length is not None:
            dones[self.ts >= self.max_path_length] = True
        for (i, done) in enumerate(dones):
            if done:
                next_obs[i] = self.env.reset()
                self.ts[i] = 0

        self.current_obs = next_obs

        # transform obs to lists
        next_obs = [
            np.squeeze(o) for o in np.vsplit(next_obs, next_obs.shape[0])
        ]
        return next_obs, list(rewards), list(
            dones), tensor_utils.stack_tensor_dict_list(env_infos)  #lists
Example #6
    def eval_expert_probs(self,
                          expert_paths,
                          policy,
                          insert=False,
                          context=None):
        """
        Evaluate expert policy probability under current policy
        """
        for traj in expert_paths:
            if 'agent_infos' in traj:
                del traj['agent_infos']
            if 'a_logprobs' in traj:
                del traj['a_logprobs']

        if isinstance(policy, np.ndarray):
            return ImitationLearning._compute_path_probs(expert_paths,
                                                         insert=insert)
        elif hasattr(policy, 'recurrent') and policy.recurrent:
            policy.reset([True] * len(expert_paths))
            expert_obs = ImitationLearning.extract_paths(
                expert_paths, keys=('observations', ), stack=True)[0]
            if context is not None:
                # broadcast the per-path context across the time axis:
                # (n_paths, ctx_dim) -> (n_paths, T, ctx_dim), with T = expert_obs.shape[1]
                expert_obs = np.concatenate(
                    (expert_obs,
                     np.tile(np.expand_dims(context, axis=1),
                             [1, expert_obs.shape[1], 1])),
                    axis=-1)
            agent_infos = []
            for t in range(expert_obs.shape[1]):
                a, infos = policy.get_actions(expert_obs[:, t])
                agent_infos.append(infos)
            agent_infos_stack = tensor_utils.stack_tensor_dict_list(
                agent_infos)
            for key in agent_infos_stack:
                agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                      axes=[1, 0, 2])
            agent_infos_transpose = tensor_utils.split_tensor_dict_list(
                agent_infos_stack)
            for i, path in enumerate(expert_paths):
                path['agent_infos'] = agent_infos_transpose[i]
        else:
            for i, path in enumerate(expert_paths):
                expert_obs = path['observations']
                if context is not None:
                    expert_obs = np.concatenate(
                        (expert_obs,
                         np.tile(np.expand_dims(context[i], axis=0),
                                 [expert_obs.shape[0], 1])),
                        axis=-1)
                actions, agent_infos = policy.get_actions(expert_obs)
                path['agent_infos'] = agent_infos
        return ImitationLearning._compute_path_probs(expert_paths,
                                                     insert=insert)
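In the recurrent branch of Example #6, the per-path context has to be broadcast across the time axis before it can be concatenated to the stacked observations. A small self-contained shape check of that tiling, assuming expert_obs is (n_paths, T, obs_dim) and context is (n_paths, ctx_dim) (the sizes below are made up):

import numpy as np

n_paths, T, obs_dim, ctx_dim = 4, 10, 3, 2
expert_obs = np.zeros((n_paths, T, obs_dim))
context = np.zeros((n_paths, ctx_dim))

# (n_paths, ctx_dim) -> (n_paths, 1, ctx_dim) -> (n_paths, T, ctx_dim)
tiled = np.tile(np.expand_dims(context, axis=1), [1, expert_obs.shape[1], 1])
augmented = np.concatenate((expert_obs, tiled), axis=-1)
assert augmented.shape == (n_paths, T, obs_dim + ctx_dim)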
Example #7
 def step(self, action_n):
     all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
     obs, rewards, dones, env_infos = map(list, zip(*all_results))
     dones = np.asarray(dones)
     rewards = np.asarray(rewards)
     self.ts += 1
     dones[self.ts >= self.max_path_length] = True
     for (i, done) in enumerate(dones):
         if done:
             obs[i] = self.envs[i].reset()
             self.ts[i] = 0
     return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
Example #8
 def step(self, action_n):
     all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
     obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))
     dones = np.asarray(dones)
     rewards = np.asarray(rewards)
     self.ts += 1
     if self.max_path_length is not None:
         dones[self.ts >= self.max_path_length] = True
     for (i, done) in enumerate(dones):
         if done:
             obs[i] = self.envs[i].reset()
             self.ts[i] = 0
     return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
Example #9
 def step(self, action_n, reset_args=None):
     if reset_args is None:
         reset_args = [None]*len(self.envs)
     all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
     obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))
     dones = np.asarray(dones)
     rewards = np.asarray(rewards)
     self.ts += 1
     if self.max_path_length is not None:
         dones[self.ts >= self.max_path_length] = True
     for (i, done) in enumerate(dones):
         if done:
             obs[i] = self.envs[i].reset(reset_args=reset_args[i])
             self.ts[i] = 0
     return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
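The recurring trick in Examples #7-#9 is the vectorized time-limit check: self.ts keeps one step counter per environment, and a boolean mask forces done wherever max_path_length is reached. A small standalone numpy illustration of that masking (the values are made up; the real code resets the counters inside the per-env loop together with the observation reset):

import numpy as np

max_path_length = 100
ts = np.array([99, 42, 100, 7])            # per-env step counters after the increment
dones = np.array([False, True, False, False])

dones[ts >= max_path_length] = True        # force done where the horizon is hit
ts[dones] = 0                              # counters of finished envs go back to zero
print(dones)   # [False  True  True False]
print(ts)      # [99  0  0  7]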
Example #10
def worker_run_step(G, action_n, scope):
    assert hasattr(G, 'parallel_vec_envs')
    assert scope in G.parallel_vec_envs
    env_template = G.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in G.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if len(step_results) == 0:
        return None
    obs, rewards, dones, env_infos = map(list, zip(*step_results))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos
Example #11
def worker_run_step(G, action_n, scope):
    assert hasattr(G, 'parallel_vec_envs')
    assert scope in G.parallel_vec_envs
    env_template = G.parallel_vec_env_template[scope]
    ids = []
    step_results = []
    for (idx, env) in G.parallel_vec_envs[scope]:
        action = action_n[idx]
        ids.append(idx)
        step_results.append(tuple(env.step(action)))
    if len(step_results) == 0:
        return None
    obs, rewards, dones, env_infos = list(map(list, list(zip(*step_results))))
    obs = env_template.observation_space.flatten_n(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)
    env_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, obs, rewards, dones, env_infos
Example #12
    def step(self, action_n, traj_starting_obs=None, traj_starting_ts=None):
        """
        :param action_n: batches of actions for all models/tasks stacked on top of each other (n_models * batch_per_model, ndim_act)
        :return: predicted observations (n_models * batch_per_model, ndim_obs)
        """

        assert action_n.shape[0] == self.n_parallel

        # use the model to make (predicted) steps
        prev_obs = self.current_obs
        next_obs = self.model.predict_model_batches(prev_obs, action_n)
        if self.clip_obs:
            next_obs = np.clip(next_obs, -1000, 1000)
        rewards = self.unwrapped_env.reward(prev_obs, action_n, next_obs)

        if self.has_done_fn:
            dones = self.unwrapped_env.done(next_obs)
        else:
            dones = np.asarray([False for _ in range(self.n_parallel)])

        env_infos = [{} for _ in range(action_n.shape[0])]

        self.ts += 1
        if self.max_path_length is not None:
            dones[self.ts >= self.max_path_length] = True
        for (i, done) in enumerate(dones):
            if done:
                if traj_starting_obs is None or np.random.random() < 0.1:
                    next_obs[i] = self.env.reset()
                    self.ts[i] = 0
                else:
                    min_idx = max(-10000, -traj_starting_obs.shape[0])
                    idx = np.random.randint(min_idx, 0)
                    next_obs[i] = traj_starting_obs[idx, :]
                    self.ts[i] = traj_starting_ts[idx]

        self.current_obs = next_obs

        # transform obs to lists
        next_obs = [
            np.squeeze(o) for o in np.vsplit(next_obs, next_obs.shape[0])
        ]
        return next_obs, list(rewards), list(
            dones), tensor_utils.stack_tensor_dict_list(env_infos)  #lists
Example #13
    def step(self, action_n, itr):

        all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]

        obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))

        # if env_action_space == 5:
        #     ### function to modify the goal position
        #     rewards, dones = self.change_goal_state(itr, obs, rewards, dones)

        dones = np.asarray(dones)
        rewards = np.asarray(rewards)
        self.ts += 1
        if self.max_path_length is not None:
            dones[self.ts >= self.max_path_length] = True
        for (i, done) in enumerate(dones):
            if done:
                obs[i] = self.envs[i].reset()
                self.ts[i] = 0
        return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
Example #14
 def eval_expert_probs(self, expert_paths, policy, insert=False):
     """
     Evaluate expert policy probability under current policy
     """
     if policy.recurrent:
         policy.reset([True]*len(expert_paths))
         expert_obs = self.extract_paths(expert_paths, keys=('observations',))[0]
         agent_infos = []
         for t in range(expert_obs.shape[1]):
             a, infos = policy.get_actions(expert_obs[:, t])
             agent_infos.append(infos)
         agent_infos_stack = tensor_utils.stack_tensor_dict_list(agent_infos)
         for key in agent_infos_stack:
             agent_infos_stack[key] = np.transpose(agent_infos_stack[key], axes=[1,0,2])
         agent_infos_transpose = tensor_utils.split_tensor_dict_list(agent_infos_stack)
         for i, path in enumerate(expert_paths):
             path['agent_infos'] = agent_infos_transpose[i]
     else:
         for path in expert_paths:
             actions, agent_infos = policy.get_actions(path['observations'])
             path['agent_infos'] = agent_infos
     return self._compute_path_probs(expert_paths, insert=insert)
Example #15
 def eval_expert_probs(self,
                       expert_paths,
                       policy,
                       insert=False,
                       context=None):
     """
     Evaluate expert policy probability under current policy
     """
     if policy.recurrent:
         policy.reset([True] * len(expert_paths))
         expert_obs = ImitationLearning.extract_paths(
             expert_paths, keys=('observations', ))[0]
         if context is not None:
             expert_obs = np.concatenate((expert_obs, context), axis=-1)
         agent_infos = []
         for t in range(expert_obs.shape[1]):
             a, infos = policy.get_actions(expert_obs[:, t])
             agent_infos.append(infos)
         agent_infos_stack = tensor_utils.stack_tensor_dict_list(
             agent_infos)
         for key in agent_infos_stack:
             agent_infos_stack[key] = np.transpose(agent_infos_stack[key],
                                                   axes=[1, 0, 2])
         agent_infos_transpose = tensor_utils.split_tensor_dict_list(
             agent_infos_stack)
         for i, path in enumerate(expert_paths):
             path['agent_infos'] = agent_infos_transpose[i]
     else:
         for path in expert_paths:
             expert_obs = path['observations']
             if context is not None:
                 expert_obs = np.concatenate((expert_obs, context), axis=-1)
             actions, agent_infos = policy.get_actions(expert_obs)
             path['agent_infos'] = agent_infos
     return ImitationLearning._compute_path_probs(expert_paths,
                                                  insert=insert)
Example #16
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
            all_path_baselines = [
                p["agent_infos"]["vf"].flatten() for p in paths
            ]
        else:
            if hasattr(self.algo.baseline, "predict_n"):
                all_path_baselines = self.algo.baseline.predict_n(paths)
            else:
                all_path_baselines = [
                    self.algo.baseline.predict(path) for path in paths
                ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular_misc_stat('TrajLen',
                                        [len(p["rewards"]) for p in paths],
                                        placement='front')
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular_misc_stat('Return',
                                        undiscounted_returns,
                                        placement='front')

        return samples_data
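In the recurrent branch of the last example, variable-length paths are first zero-padded to max_path_length and only then stacked into (n_paths, max_path_length, ...) arrays. A minimal self-contained sketch of that pad-then-stack step under the assumed rllab-style semantics (the `_sketch` helpers are mine, not the real tensor_utils API):

import numpy as np

def pad_tensor_sketch(x, max_len):
    # zero-pad a (T, ...) array up to (max_len, ...)
    x = np.asarray(x)
    pad = np.zeros((max_len - len(x),) + x.shape[1:], dtype=x.dtype)
    return np.concatenate([x, pad])

def pad_tensor_dict_sketch(d, max_len):
    return {k: pad_tensor_sketch(v, max_len) for k, v in d.items()}

# two paths of length 3 and 5, each with a per-step 2-dim 'prob'
agent_infos = [{'prob': np.ones((3, 2))}, {'prob': np.ones((5, 2))}]
padded = [pad_tensor_dict_sketch(p, max_len=5) for p in agent_infos]
stacked = {k: np.stack([p[k] for p in padded]) for k in padded[0]}  # as in stack_tensor_dict_list
assert stacked['prob'].shape == (2, 5, 2)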