Example #1
def compute(self, observation, add_noise=False, update=True):
    # Preprocess the raw observation and push it through the running observation filter.
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    # Sample an action from the TF policy graph.
    action = self.sess.run(
        self.sampler, feed_dict={self.inputs: observation})
    action = _unbatch_tuple_actions(action)
    # Optionally add Gaussian exploration noise for continuous (Box) action spaces.
    if add_noise and isinstance(self.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * self.action_noise_std
    return action
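
The compute methods on this page feed a single observation with a leading batch axis into the policy and then unwrap the result with _unbatch_tuple_actions. As a rough, minimal sketch (an illustration under assumptions, not the library's actual helper), such a function only needs to strip the batch dimension from either a plain action array or a tuple of per-component arrays:

import numpy as np

def _unbatch_tuple_actions(action):
    # Hypothetical sketch: remove the leading batch dimension from an action
    # that is either a single array or a tuple/list of per-component arrays.
    if isinstance(action, (tuple, list)):
        return [np.asarray(a)[0] for a in action]
    return np.asarray(action)[0]
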
Example #2
def _compute_actions(policy, obs_batch, add_noise=False, update=True):
    # Preprocess the observation batch and push it through the observation filter.
    observation = policy.preprocessor.transform(obs_batch)
    observation = policy.observation_filter(observation[None], update=update)

    # Forward pass through the torch model, then sample from the action distribution.
    observation = convert_to_torch_tensor(observation)
    dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [], None)
    dist = policy.dist_class(dist_inputs, policy.model)
    action = dist.sample().detach().numpy()
    action = _unbatch_tuple_actions(action)
    # Optionally add Gaussian exploration noise for continuous (Box) action spaces.
    if add_noise and isinstance(policy.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * policy.action_noise_std
    return action
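
This torch variant mirrors the TF version, except the action is drawn from a distribution that dist_class builds from the model output. As a rough illustration only (not RLlib's actual distribution code), for a Box action space the distribution inputs are typically a concatenation of means and log standard deviations of a diagonal Gaussian:

import torch

# Hypothetical shapes: a batch of one observation, two continuous action dimensions.
dist_inputs = torch.zeros(1, 4)                      # assumed layout: [mean | log_std]
mean, log_std = torch.chunk(dist_inputs, 2, dim=-1)
dist = torch.distributions.Normal(mean, log_std.exp())
action = dist.sample().detach().numpy()              # shape (1, 2); batch dim stripped later
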
Example #3
def rollout_loop(agent,
                 env_name,
                 num_steps,
                 num_episodes,
                 no_render=True,
                 fps=1000,
                 frameskip=1):
    policy_agent_mapping = default_policy_agent_mapping

    # RLlib trainers expose their rollout workers; reuse the local worker's env and policies.
    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        # Fallback for plain single-agent policies without rollout workers.
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}
        state_init = {DEFAULT_POLICY_ID: []}
        action_init = {DEFAULT_POLICY_ID: env.action_space.sample()}

    steps = 0
    full_episodes = 0
    last_render_start = time.time()
    avg_reward = collections.deque([], maxlen=100)

    # If num_steps is falsy, (num_steps or steps + 1) always exceeds steps,
    # so only the episode limit applies.
    while steps < (num_steps or steps + 1) and full_episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_episode = 0.0

        while not done and steps < (num_steps or steps + 1):
            # Treat a single-agent env as a multi-agent env with one dummy agent.
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    if isinstance(env.action_space, gym.spaces.Tuple):
                        a_action = TupleActions(a_action)
                        a_action = _unbatch_tuple_actions(a_action)[0]

                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            rewards = None

            for frame in range(frameskip):
                next_obs, reward, done, _ = env.step(action)
                if done:
                    log.info('Done at steps %d', steps)
                    break

                if rewards is None:
                    rewards = reward
                else:
                    if multiagent:
                        for agent_id, r in reward.items():
                            rewards[agent_id] += r
                    else:
                        rewards += reward

                if not no_render:
                    target_delay = 1.0 / fps if fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    # note: ASYNC_PLAYER mode actually makes this sleep redundant
                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                steps += 1
                obs = next_obs

            if multiagent:
                for agent_id, r in rewards.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = rewards

            if multiagent:
                done = done['__all__']
                reward_episode += 0 if rewards is None else sum(
                    rewards.values())
            else:
                reward_episode += 0 if rewards is None else rewards

        full_episodes += 1

        avg_reward.append(reward_episode)
        log.info('Reward episode: %.3f, avg_reward %.3f', reward_episode,
                 np.mean(avg_reward))

    env.reset()  # this guarantees that recordings are saved to disk
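
A minimal usage sketch for rollout_loop; the PPOTrainer construction and the CartPole-v0 environment name below are assumptions for illustration, not part of the original example:

import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
agent = PPOTrainer(config={"num_workers": 0}, env="CartPole-v0")
# agent.restore("/path/to/checkpoint")  # optionally load trained weights first

rollout_loop(agent, env_name="CartPole-v0", num_steps=1000,
             num_episodes=10, no_render=True)
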