Example #1
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
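The pattern above keeps one growing `running_paths` buffer per vectorized environment and flushes it into `paths` whenever that environment reports `done`. Below is a minimal, self-contained sketch of the same bookkeeping; `ToyVecEnv` and the random-action stand-in are invented for illustration (they are not part of the sampler code above), and only NumPy is assumed:

import numpy as np

class ToyVecEnv:
    """Illustrative stand-in for vec_env: random observations, random rewards, ~10% chance of done per step."""
    def __init__(self, num_envs, obs_dim=3):
        self.num_envs, self.obs_dim = num_envs, obs_dim

    def reset(self):
        return np.random.randn(self.num_envs, self.obs_dim)

    def step(self, actions):
        next_obses = np.random.randn(self.num_envs, self.obs_dim)
        rewards = np.random.randn(self.num_envs)
        dones = np.random.rand(self.num_envs) < 0.1
        return next_obses, rewards, dones, [{} for _ in range(self.num_envs)]

def collect(vec_env, batch_size):
    paths, running_paths = [], [None] * vec_env.num_envs
    n_samples = 0
    obses = vec_env.reset()
    while n_samples < batch_size:
        actions = np.random.randn(vec_env.num_envs, 2)  # random policy stand-in
        next_obses, rewards, dones, _ = vec_env.step(actions)
        for idx in range(vec_env.num_envs):
            if running_paths[idx] is None:
                running_paths[idx] = dict(observations=[], actions=[], rewards=[])
            running_paths[idx]["observations"].append(obses[idx])
            running_paths[idx]["actions"].append(actions[idx])
            running_paths[idx]["rewards"].append(rewards[idx])
            if dones[idx]:
                # flush the finished trajectory and start a fresh buffer for this env
                paths.append({k: np.stack(v) for k, v in running_paths[idx].items()})
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        obses = next_obses
    return paths

paths = collect(ToyVecEnv(num_envs=4), batch_size=200)
print(len(paths), paths[0]["rewards"].shape)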
Example #2
    def obtain_samples(self,
                       itr,
                       reset_args=None,
                       task_idxs=None,
                       return_dict=False,
                       log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list
                                       and type(reset_args) != np.ndarray):
            reset_args = [reset_args] * self.vec_env.num_envs

        n_samples = 0
        curr_noises = [
            np.random.normal(0, 1, size=(self.latent_dim, ))
            for _ in range(self.vec_env.num_envs)
        ]

        #curr_noises = [np.ones(size = (self.latent_dim)) for _ in range(self.vec_env.num_envs)]
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)  # TODO: document what policy.reset(dones) does here
            actions, agent_infos = policy.get_actions(obses, task_idxs,
                                                      curr_noises)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done, noise in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones, curr_noises):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(observations=[],
                                              actions=[],
                                              rewards=[],
                                              env_infos=[],
                                              agent_infos=[],
                                              noises=[])
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                running_paths[idx]["noises"].append(noise)

                if done:
                    paths[idx].append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            noises=self.flatten_n(
                                running_paths[idx]["noises"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
                    curr_noises[idx] = np.random.normal(
                        0, 1, size=(self.latent_dim, ))
                    #curr_noises[idx] = np.ones(size=(self.latent_dim))

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [
                item for sublist in l for item in sublist
            ]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
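For reference, the `return_dict=False` branch simply flattens the per-environment dictionary of paths into one list; a tiny illustration with placeholder data:

paths = {0: ["path_a", "path_b"], 1: ["path_c"]}  # trajectories keyed by env index
flatten_list = lambda l: [item for sublist in l for item in sublist]
print(flatten_list(paths.values()))  # ['path_a', 'path_b', 'path_c']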
Example #3
    def obtain_samples(self, itr, reset_args=None, return_dict=False,
                       log_prefix='', extra_input=None, extra_input_dim=None,
                       preupdate=False, save_img_obs=False, numTrajs_perTask=None):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        #logger.log("Obtaining samples for iteration %d..." % itr)

        if extra_input is not None:
            if extra_input == "onehot_exploration":
                if preupdate:
                    print("debug, using extra_input onehot")
                    def expand_obs(obses, path_nums):
                        extra = [special.to_onehot(path_num % extra_input_dim, extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)
                else:
                    print("debug, using extra_input zeros")
                    def expand_obs(obses, path_nums):
                        extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra),axis=1)
            elif extra_input == "onehot_hacked":
                if preupdate:
                    print("debug, using extra_input onehot")
                    def expand_obs(obses, path_nums):
                        extra = [special.to_onehot(3, extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)
                else:
                    print("debug, using extra_input zeros")
                    def expand_obs(obses, path_nums):
                        extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra),axis=1)
            elif extra_input == "gaussian_exploration":
                if preupdate:
                    print("debug, using extra_input gaussian")

                    def expand_obs(obses, path_nums):
                        extra = [np.random.normal(0.,1.,size=(extra_input_dim,)) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)
                else:
                    print("debug, using extra_input zeros")
                    def expand_obs(obses, path_nums):
                        extra = [np.zeros(extra_input_dim) for path_num in path_nums]
                        return np.concatenate((obses, extra), axis=1)


            else:
                def expand_obs(obses, path_nums):
                    return obses
        else:
            def expand_obs(obses, path_nums):
                return obses
        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            assert False, "debug, should we be using this?"
            print("WARNING, will vectorize reset_args")
            reset_args = [reset_args]*self.vec_env.num_envs


        n_samples = 0
        path_nums = [0] * self.vec_env.num_envs  # tracks which rollout each environment instance is currently on
        obses = self.vec_env.reset(reset_args)
        obses = expand_obs(obses, path_nums)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        #pbar = ProgBarCounter(self.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        policy = self.algo.policy
        import time

        while n_samples < self.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            # print("debug, agent_infos", agent_infos)
            policy_time += time.time() - t
            t = time.time()

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)   # TODO: instead of receiving obs from the env, receive it from the policy as a feed_dict
            next_obses = expand_obs(next_obses,path_nums)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])  # TODO: let's also add the incomplete running_paths to paths
                    running_paths[idx] = None
                    path_nums[idx] += 1
            process_time += time.time() - t
            #pbar.inc(len(obses))
            obses = next_obses


        # pbar.stop()

        # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
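The `expand_obs` helpers above append an exploration signal to each observation in the batch. A self-contained sketch of the one-hot variant, with a local `to_onehot` standing in for the `special.to_onehot` used above:

import numpy as np

def to_onehot(index, dim):
    vec = np.zeros(dim)
    vec[index] = 1.0
    return vec

def expand_obs_onehot(obses, path_nums, extra_input_dim):
    # one one-hot row per environment, keyed by how many rollouts that env has finished
    extra = [to_onehot(path_num % extra_input_dim, extra_input_dim) for path_num in path_nums]
    return np.concatenate((obses, extra), axis=1)

obses = np.zeros((3, 4))          # 3 envs, 4-dim observations
path_nums = [0, 1, 2]
print(expand_obs_onehot(obses, path_nums, extra_input_dim=2).shape)  # (3, 6)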
Example #4
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',  preupdate=False, save_img_obs=False, contexts = None):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args) != np.ndarray):
            assert False, "debug, should we be using this?"
            print("WARNING, will vectorize reset_args")
            reset_args = [reset_args]*self.vec_env.num_envs


        n_samples = 0
        path_nums = [0] * self.vec_env.num_envs  # tracks which rollout each environment instance is currently on
        obses = self.vec_env.reset(reset_args)
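        # caution: if `contexts` is a multi-element NumPy array, the bare `if contexts:`
        # checks below raise "truth value of an array is ambiguous"; an explicit
        # `contexts is not None` test is the safer guard.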
        if contexts:
            obses = np.concatenate([obses, contexts], axis = 1)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        if contexts:
            policy = self.algo.post_policy
        else:
            policy = self.algo.policy

        while n_samples < self.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            # print("debug, agent_infos", agent_infos)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)   # TODO: instead of receiving obs from the env, receive it from the policy as a feed_dict
            if contexts:
                next_obses = np.concatenate([next_obses, contexts], axis = 1)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])  # TODO: let's also add the incomplete running_paths to paths
                    running_paths[idx] = None
                    path_nums[idx] += 1
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        # adding the incomplete paths
        # for idx in range(self.vec_env.num_envs):
        #     if running_paths[idx] is not None:
        #         paths[idx].append(dict(
        #             observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
        #             actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
        #             rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
        #             env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
        #             agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
        #         ))


        pbar.stop()

        # logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        # logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        # logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Example #5
    def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
        # reset_args: arguments to pass to the environments to reset
        # return_dict: whether or not to return a dictionary or list form of paths

        logger.log("Obtaining samples for iteration %d..." % itr)

        #paths = []
        paths = {}
        for i in range(self.vec_env.num_envs):
            paths[i] = []

        # if the reset args are not list/numpy, we set the same args for each env
        if reset_args is not None and (type(reset_args) != list and type(reset_args)!=np.ndarray):
            reset_args = [reset_args]*self.vec_env.num_envs

        n_samples = 0
        obses = self.vec_env.reset(reset_args)
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time


        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths[idx].append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular(log_prefix+"PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix+"EnvExecTime", env_time)
        logger.record_tabular(log_prefix+"ProcessExecTime", process_time)

        if not return_dict:
            flatten_list = lambda l: [item for sublist in l for item in sublist]
            paths = flatten_list(paths.values())
            #path_keys = flatten_list([[key]*len(paths[key]) for key in paths.keys()])

        return paths
Example #6
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0
        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            self.algo.policy.reset(dones)
            actions, agent_infos = self.algo.policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
                                                                                    rewards, env_infos, agent_infos,
                                                                                    dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
Example #7
def ed_dec_rollout(env,
                   agents,
                   max_path_length=np.inf,
                   animated=False,
                   speedup=1):
    """Decentralized rollout."""
    if agents.recurrent:
        assert isinstance(
            agents,
            GSMDPRecurrentPolicy), 'Recurrent policy is not a GSMDP class'
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    offset_t_sojourn = [[] for _ in range(n_agents)]
    olist = env.reset()
    assert len(olist) == n_agents, "{} != {}".format(len(olist), n_agents)

    agents.reset(dones=[True for _ in range(n_agents)])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        agents_to_act = [
            i for i, j in enumerate(olist) if j != [None] * len(j)
        ]
        if (not agents.recurrent):
            alist, agent_info_list = agents.get_actions(
                [olist[i] for i in agents_to_act])
            agent_info_list = tensor_utils.split_tensor_dict_list(
                agent_info_list)
        else:
            alist, agent_info_list = agents.get_actions(olist)
            alist = [a for a in alist if a is not None]
            agent_info_list = tensor_utils.split_tensor_dict_list(
                agent_info_list)
            agent_info_list = [
                ainfo for i, ainfo in enumerate(agent_info_list)
                if i in agents_to_act
            ]

        next_actions = [None] * n_agents  # will fill in in the loop

        # For each agent
        for ind, o in enumerate([olist[j] for j in agents_to_act]):
            # ind refers to non-None indicies
            # i refers to indices with Nones
            i = agents_to_act[ind]
            observations[i].append(env.observation_space.flatten(o))
            # observations[i].append(o) # REMOVE THIS AND UNCOMMENT THE ABOVE LINE
            actions[i].append(env.action_space.flatten(alist[ind]))
            next_actions[i] = alist[ind]
            if agent_info_list is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(agent_info_list[ind])

        # take next actions

        next_olist, rlist, d, env_info = env.step(np.asarray(next_actions))

        # update sojourn time (we should associate ts from next_olist to r, not current)

        for i, r in enumerate(rlist):
            if r is None: continue
            # skip reward if agent has not acted yet
            if (len(observations[i]) > 0):
                rewards[i].append(r)
                offset_t_sojourn[i].append(
                    env.observation_space.flatten(next_olist[i])[-1])
                env_infos[i].append(env_info)
        path_length = max([len(o) for o in observations])
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    if (path_length == max_path_length):
        # probably have some paths that aren't the right length
        for ind, o in enumerate(observations):
            r = rewards[ind]
            if (len(o) > len(r)):
                assert(len(o) <= (len(r) + 1)), \
                 'len(o) %d, len(r) %d' % (len(o), len(r))
                # delete last elem of obs, actions, agent infos
                del observations[ind][-1]
                del actions[ind][-1]
                del agent_infos[ind][-1]

    if animated:
        env.render()

    # remove empty agent trajectories
    observations = [o for o in observations if len(o) > 0]
    actions = [a for a in actions if len(a) > 0]
    rewards = [r for r in rewards if len(r) > 0]
    agent_infos = [i for i in agent_infos if len(i) > 0]
    env_infos = [e for e in env_infos if len(e) > 0]
    offset_t_sojourn = [o for o in offset_t_sojourn if len(o) > 0]

    if (any(
            map(lambda x: x < n_agents, [
                len(observations),
                len(actions),
                len(rewards),
                len(agent_infos),
                len(env_infos)
            ]))):
        print('\nWARNING: \n')
        print('n_agents: ', n_agents)
        print('len(observations): ', len(observations))
        print('len(actions): ', len(actions))
        print('len(rewards): ', len(rewards))
        print('len(agent_infos): ', len(agent_infos))
        print('len(env_infos): ', len(env_infos))

    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
            offset_t_sojourn=tensor_utils.stack_tensor_list(
                offset_t_sojourn[i]),
        ) for i in range(len(observations))
    ]
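When the rollout above is cut at `max_path_length`, an agent can end up with one more recorded observation (and action) than rewards, and the trailing element is dropped so the per-agent lists stay aligned. A stand-alone illustration of that trimming with made-up buffers:

# hypothetical per-agent buffers; agent 0 acted once more than it was rewarded
observations = [[1, 2, 3], [4, 5]]
actions      = [[0, 1, 0], [1, 1]]
rewards      = [[0.1, 0.2], [0.3, 0.4]]

for ind, obs in enumerate(observations):
    if len(obs) > len(rewards[ind]):
        assert len(obs) <= len(rewards[ind]) + 1, "at most one dangling step expected"
        del observations[ind][-1]   # drop the unrewarded trailing step
        del actions[ind][-1]

print(observations)   # [[1, 2], [4, 5]]
print(actions)        # [[0, 1], [1, 1]]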
Example #8
    def obtain_agent_info_offpolicy(self,
                                    itr,
                                    expert_trajs_dir=None,
                                    offpol_trajs=None,
                                    treat_as_expert_traj=False,
                                    log_prefix=''):
        assert expert_trajs_dir is None, "deprecated"
        start = time.time()
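        # given the deprecation assert above, expert_trajs_dir is always None here, so
        # callers are expected to pass offpol_trajs; otherwise the assert below fires.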
        if offpol_trajs is None:
            assert expert_trajs_dir is not None, "neither offpol_trajs nor expert_trajs_dir is provided"
            if self.use_pooled_goals:
                for t, taskidx in enumerate(self.goals_idxs_for_itr_dict[itr]):
                    assert np.array_equal(
                        self.goals_pool[taskidx],
                        self.goals_to_use_dict[itr][t]), "fail"
                offpol_trajs = {
                    t: joblib.load(expert_trajs_dir + str(taskidx) +
                                   self.expert_trajs_suffix + ".pkl")
                    for t, taskidx in enumerate(
                        self.goals_idxs_for_itr_dict[itr])
                }
            else:
                offpol_trajs = joblib.load(expert_trajs_dir + str(itr) +
                                           self.expert_trajs_suffix + ".pkl")

            offpol_trajs = {
                tasknum: offpol_trajs[tasknum]
                for tasknum in range(self.meta_batch_size)
            }

        # some initial rearrangement
        # tasknums is range(self.meta_batch_size), as set up above
        tasknums = offpol_trajs.keys()
        for t in tasknums:
            for path in offpol_trajs[t]:
                if 'expert_actions' not in path and treat_as_expert_traj:
                    # print("copying expert actions, you should do this only 1x per metaitr")
                    path['expert_actions'] = np.clip(deepcopy(path['actions']),
                                                     -1.0, 1.0)

                if treat_as_expert_traj:
                    path['agent_infos'] = dict(
                        mean=[[0.0] * len(path['actions'][0])] *
                        len(path['actions']),
                        log_std=[[0.0] * len(path['actions'][0])] *
                        len(path['actions']))
                else:
                    path['agent_infos'] = [None] * len(path['rewards'])

        if not treat_as_expert_traj:
            print("debug12, running offpol on own previous samples")
            running_path_idx = {t: 0 for t in tasknums}
            running_intra_path_idx = {t: 0 for t in tasknums}
            while max([running_path_idx[t] for t in tasknums
                       ]) > -0.5:  # we cycle until all indices are -1
                observations = [
                    offpol_trajs[t][running_path_idx[t]]['observations'][
                        running_intra_path_idx[t]] for t in tasknums
                ]
                actions, agent_infos = self.policy.get_actions(observations)
                agent_infos = split_tensor_dict_list(agent_infos)
                for t, action, agent_info in zip(itertools.count(), actions,
                                                 agent_infos):
                    offpol_trajs[t][running_path_idx[t]]['agent_infos'][
                        running_intra_path_idx[t]] = agent_info
                    # INDEX JUGGLING:
                    if -0.5 < running_intra_path_idx[t] < len(offpol_trajs[t][
                            running_path_idx[t]]['rewards']) - 1:
                        # if we haven't reached the end:
                        running_intra_path_idx[t] += 1
                    else:

                        if -0.5 < running_path_idx[t] < len(
                                offpol_trajs[t]) - 1:
                            # we wrap up the agent_infos
                            offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \
                                stack_tensor_dict_list(offpol_trajs[t][running_path_idx[t]]['agent_infos'])
                            # if we haven't reached the last path:
                            running_intra_path_idx[t] = 0
                            running_path_idx[t] += 1
                        elif running_path_idx[t] == len(offpol_trajs[t]) - 1:
                            offpol_trajs[t][running_path_idx[t]]['agent_infos'] = \
                                stack_tensor_dict_list(offpol_trajs[t][running_path_idx[t]]['agent_infos'])
                            running_intra_path_idx[t] = -1
                            running_path_idx[t] = -1
                        else:
                            # otherwise we set the running index to -1 to signal a stop
                            running_intra_path_idx[t] = -1
                            running_path_idx[t] = -1
        total_time = time.time() - start
        # logger.record_tabular(log_prefix+"TotalExecTime", total_time)
        return offpol_trajs
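The index juggling above replays stored trajectories through the current policy so that each step's `agent_infos` can be refilled off-policy. A much simpler per-path sketch of the same idea, with a toy Gaussian `policy_fn` standing in for `self.policy.get_actions`:

import numpy as np

def policy_fn(observations):
    """Toy stand-in: returns per-step means/log-stds for a batch of observations."""
    observations = np.asarray(observations)
    means = 0.5 * observations
    log_stds = np.zeros_like(observations)
    return {"mean": means, "log_std": log_stds}

def refill_agent_infos(offpol_trajs, policy_fn):
    # offpol_trajs: {task_id: [path, ...]}, each path a dict with 'observations'
    for task_paths in offpol_trajs.values():
        for path in task_paths:
            path["agent_infos"] = policy_fn(path["observations"])
    return offpol_trajs

trajs = {0: [{"observations": np.random.randn(5, 3)}]}
trajs = refill_agent_infos(trajs, policy_fn)
print(trajs[0][0]["agent_infos"]["mean"].shape)   # (5, 3)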
Example #9
    def obtain_samples(self,
                       itr,
                       max_path_length,
                       batch_size,
                       max_n_trajs=None):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        dones = np.asarray([True] * self.vec_env.n_envs)
        obses = self.vec_env.reset(dones)
        running_paths = [None] * self.vec_env.n_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.policy
        import time
        while n_samples < batch_size:
            t = time.time()
            if hasattr(self.vec_env, "handle_policy_reset"):
                self.vec_env.handle_policy_reset(policy, dones)
            else:
                policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, max_path_length=max_path_length)

            if np.any(dones):
                new_obses = self.vec_env.reset(dones)
                reset_idx = 0
                for idx, done in enumerate(dones):
                    if done:
                        next_obses[idx] = new_obses[reset_idx]
                        reset_idx += 1

            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.n_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))

                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if max_n_trajs is not None and len(paths) >= max_n_trajs:
                break

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
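Example #9 above resets only the environments whose `done` flag is set and splices the fresh observations back into `next_obses` in order. A compact stand-alone version of that splice (names are illustrative):

import numpy as np

def splice_resets(next_obses, dones, new_obses):
    """Replace next_obses[i] with the next fresh observation for every env that finished.
    `new_obses` holds one row per done environment, in order."""
    reset_idx = 0
    for idx, done in enumerate(dones):
        if done:
            next_obses[idx] = new_obses[reset_idx]
            reset_idx += 1
    return next_obses

next_obses = np.zeros((4, 2))
dones = np.array([False, True, False, True])
new_obses = np.array([[1.0, 1.0], [2.0, 2.0]])   # resets for envs 1 and 3
print(splice_resets(next_obses, dones, new_obses))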
Example #10
    def obtain_samples(self,
                       itr,
                       num_samples=None,
                       log=True,
                       log_prefix='RandomSampler-'):
        if num_samples is None:
            num_samples = self.algo.batch_size

        paths = []
        n_samples_collected = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(num_samples)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time
        while n_samples_collected < num_samples:
            # random actions
            t = time.time()
            actions = np.stack([
                self.vec_env.action_space.sample() for _ in range(len(obses))
            ],
                               axis=0)
            policy_time += time.time() - t  # accumulate, matching the other timing counters
            agent_infos = {}

            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples_collected += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        if log:
            logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
            logger.record_tabular(log_prefix + "EnvExecTime", env_time)
            logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

        return paths
Example #11
    def obtain_samples(self, itr, oracle_policy):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        agent_only_paths = []
        oracle_only_paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        agent_only_running_paths = [None] * self.vec_env.num_envs
        oracle_only_running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        import time

        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)

            agent_actions, binary_actions, agent_infos = policy.get_actions(
                obses)
            oracle_actions, oracle_agent_infos = oracle_policy.get_actions(
                obses)
            sigma = np.round(binary_actions)

            actions_1 = np.array([
                sigma[0, 0] * agent_actions[0, :] +
                sigma[0, 1] * oracle_actions[0, :]
            ])
            actions_2 = np.array([
                sigma[1, 0] * agent_actions[1, :] +
                sigma[1, 1] * oracle_actions[1, :]
            ])

            actions = np.concatenate((actions_1, actions_2), axis=0)

            policy_time += time.time() - t
            t = time.time()

            next_obses, rewards, dones, env_infos = self.vec_env.step(
                actions, itr)
            env_time += time.time() - t  # charge the env step to EnvExecTime rather than ProcessExecTime

            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]

            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)

                if done:
                    paths.append(
                        dict(
                            observations=self.env_spec.observation_space.
                            flatten_n(running_paths[idx]["observations"]),
                            actions=self.env_spec.action_space.flatten_n(
                                running_paths[idx]["actions"]),
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]["rewards"]),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["env_infos"]),
                            agent_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]["agent_infos"]),
                        ))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

            if sigma[0, 0] == 1 or sigma[1, 0] == 1:

                for idx, observation, action, reward, env_info, agent_info, done in zip(
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if agent_only_running_paths[idx] is None:
                        agent_only_running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    agent_only_running_paths[idx]["observations"].append(
                        observation)
                    agent_only_running_paths[idx]["actions"].append(action)
                    agent_only_running_paths[idx]["rewards"].append(reward)
                    agent_only_running_paths[idx]["env_infos"].append(env_info)
                    agent_only_running_paths[idx]["agent_infos"].append(
                        agent_info)

                    if done:
                        agent_only_paths.append(
                            dict(
                                observations=self.env_spec.observation_space.
                                flatten_n(agent_only_running_paths[idx]
                                          ["observations"]),
                                actions=self.env_spec.action_space.flatten_n(
                                    agent_only_running_paths[idx]["actions"]),
                                rewards=tensor_utils.stack_tensor_list(
                                    agent_only_running_paths[idx]["rewards"]),
                                env_infos=tensor_utils.stack_tensor_dict_list(
                                    agent_only_running_paths[idx]
                                    ["env_infos"]),
                                agent_infos=tensor_utils.
                                stack_tensor_dict_list(
                                    agent_only_running_paths[idx]
                                    ["agent_infos"]),
                            ))
                        n_samples += len(
                            agent_only_running_paths[idx]["rewards"])
                        agent_only_running_paths[idx] = None
            """
            To get paths taken by the oracle
            """
            # elif sigma[0] == 0. or sigma[1] == 0.:

            #     for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
            #                                                                             rewards, env_infos, agent_infos,
            #                                                                             dones):
            #         if oracle_only_running_paths[idx] is None:
            #             oracle_only_running_paths[idx] = dict(
            #                 observations=[],
            #                 actions=[],
            #                 rewards=[],
            #                 env_infos=[],
            #                 agent_infos=[],
            #             )
            #         oracle_only_running_paths[idx]["observations"].append(observation)
            #         oracle_only_running_paths[idx]["actions"].append(action)
            #         oracle_only_running_paths[idx]["rewards"].append(reward)
            #         oracle_only_running_paths[idx]["env_infos"].append(env_info)
            #         oracle_only_running_paths[idx]["agent_infos"].append(agent_info)

            #         if done:
            #             oracle_only_paths.append(dict(
            #                 observations=self.env_spec.observation_space.flatten_n(oracle_only_running_paths[idx]["observations"]),
            #                 actions=self.env_spec.action_space.flatten_n(oracle_only_running_paths[idx]["actions"]),
            #                 rewards=tensor_utils.stack_tensor_list(oracle_only_running_paths[idx]["rewards"]),
            #                 env_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["env_infos"]),
            #                 agent_infos=tensor_utils.stack_tensor_dict_list(oracle_only_running_paths[idx]["agent_infos"]),
            #             ))
            #             n_samples += len(oracle_only_running_paths[idx]["rewards"])
            #             oracle_only_running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        #return paths, agent_only_paths, oracle_only_paths
        return paths, agent_only_paths
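Example #11 blends the learner's actions with an oracle's through a rounded binary gate `sigma`; a self-contained sketch of that gating for two environments, with made-up action values:

import numpy as np

agent_actions  = np.array([[0.1, 0.2], [0.3, 0.4]])   # one row per env
oracle_actions = np.array([[1.0, 1.0], [1.0, 1.0]])
binary_actions = np.array([[0.9, 0.1], [0.2, 0.8]])   # policy's gating output

sigma = np.round(binary_actions)                       # [[1, 0], [0, 1]]
# per env i: sigma[i, 0] selects the agent's action, sigma[i, 1] the oracle's
actions = sigma[:, :1] * agent_actions + sigma[:, 1:] * oracle_actions
print(actions)    # env 0 keeps the agent's action, env 1 takes the oracle's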