Example #1
def own_rollout(agent, num_episodes):
    results = []
    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
    else:
        raise NotImplementedError("Multi-Agent only")

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    episodes = 0
    while episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        while not done and (episodes < num_episodes):
            action_dict = {}
            for agent_id, a_obs in obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))

                    a_action = agent.compute_action(
                        a_obs,
                        prev_action=prev_actions[agent_id],
                        prev_reward=prev_rewards[agent_id],
                        policy_id=policy_id)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action

            action = action_dict
            next_obs, reward, done, info = env.step(action)
            done = done["__all__"]

            # update
            for agent_id, r in reward.items():
                prev_rewards[agent_id] = r
            obs = next_obs

        if done:
            episodes += 1
            # specific function for alternate game.
            results.append(env.determine_winner())

    return results
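
These snippets all rely on the DefaultMapping helper from RLlib's rollout utilities: a defaultdict whose factory receives the missing key, so that prev_actions[agent_id] lazily resolves to the initial action of whichever policy agent_id maps to. A minimal sketch of that helper, matching the version shipped with the RLlib releases these examples target:

import collections


class DefaultMapping(collections.defaultdict):
    """defaultdict subclass whose default_factory receives the missing key."""

    def __missing__(self, key):
        # Build the default value from the key itself, cache it, and return it.
        self[key] = value = self.default_factory(key)
        return value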
Example #2
File: evaluate.py Project: zeyefkey/SMARTS
def rollout(trainer, env_name, num_steps, num_episodes=0):
    policy_agent_mapping = default_policy_agent_mapping
    if hasattr(trainer, "workers") and isinstance(trainer.workers, WorkerSet):
        env = trainer.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if trainer.workers.local_worker().multiagent:
            policy_agent_mapping = trainer.config["multiagent"]["policy_mapping_fn"]

        policy_map = trainer.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: trainer.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(trainer)
            )
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    metrics_obj = metrics.Metric(num_episodes)

    for episode in range(num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]]
        )
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]]
        )
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id)
                    )
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = trainer.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                        agent_states[agent_id] = p_state
                    else:
                        a_action = trainer.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)

            metrics_obj.log_step(multi_obs, reward, done, info, episode=episode)

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            # filter dead agents
            if multiagent:
                next_obs = {
                    agent_id: obs
                    for agent_id, obs in next_obs.items()
                    if not done[agent_id]
                }

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward

            step += 1
            obs = next_obs
        print("\nEpisode #{}: steps: {} reward: {}".format(episode, step, reward_total))
        if done:
            episode += 1
    print("\n metrics: {}".format(metrics_obj.compute()))
Example #3
def internal_rollout(
    worker,
    num_steps,
    policy_map=None,
    policy_agent_mapping=None,
    reset_env_before=True,
    num_episodes=0,
    last_obs=None,
    saver=None,
    no_render=True,
    video_dir=None,
    seed=None,
    explore=None,
):
    """
    Can perform rollouts on the environment from inside a worker_rollout or
    from a policy. Can perform rollouts during the evaluation rollouts ran
    from an RLLib Trainer.

    :param worker: worker from an RLLib Trainer.
    The interal rollouts will be run inside this worker, using its policies.
    :param num_steps: number of maximum steps to perform in total
    :param policy_map: (optional) by default the policy_map of the provided
    worker will be used
    :param policy_agent_mapping: (optional) by default the policy_mapping_fn
    of the provided worker will be used
    :param reset_env_before: (optional) reset the environment from the worker
    before first using it
    :param num_episodes: (optional) number of maximum episodes to perform
    :param last_obs: (optional) if reset_env_before is False then you must
    provide the last observation
    :param saver: (optional) an instance of a RolloutManager
    :param no_render: (optional) option to call env.render()
    :param video_dir: (optional)
    :param seed: (optional) random seed to set for the environment by calling
    env.seed(seed)
    :return: an instance of a RolloutManager, which contains the data about
    the rollouts performed
    """

    assert num_steps is not None or num_episodes is not None
    assert reset_env_before or last_obs is not None

    if saver is None:
        saver = RolloutManager()

    env = copy.deepcopy(worker.env)
    if hasattr(env, "seed") and callable(env.seed):
        env.seed(seed)

    multiagent = isinstance(env, MultiAgentEnv)
    if policy_agent_mapping is None:
        if worker.multiagent:
            policy_agent_mapping = worker.policy_config["multiagent"][
                "policy_mapping_fn"]
        else:
            policy_agent_mapping = default_policy_agent_mapping

    if policy_map is None:
        policy_map = worker.policy_map
    state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
    use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym_wrappers.Monitor(env=env,
                                   directory=video_dir,
                                   video_callable=lambda x: True,
                                   force=True)

    random_policy_id = list(policy_map.keys())[0]
    virtual_global_timestep = worker.get_policy(
        random_policy_id).global_timestep

    steps = 0
    episodes = 0
    while _keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        if reset_env_before or episodes > 0:
            obs = env.reset()
        else:
            obs = last_obs
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and _keep_going(steps, num_steps, episodes,
                                       num_episodes):

            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            virtual_global_timestep += 1
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    # print("rollout")
                    if p_use_lstm:
                        a_action, p_state, _ = _worker_compute_action(
                            worker,
                            timestep=virtual_global_timestep,
                            observation=a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                            explore=explore)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = _worker_compute_action(
                            worker,
                            virtual_global_timestep,
                            observation=a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                            explore=explore)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action

            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(r for r in reward.values()
                                    if r is not None)
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        if done:
            episodes += 1
    return saver
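
internal_rollout depends on a _keep_going helper that is not shown above. Its expected behavior follows from the asserts and loop conditions: keep rolling out until either the step or the episode budget is exhausted. A plausible sketch, modeled on the keep_going helper in RLlib's rollout script:

def _keep_going(steps, num_steps, episodes, num_episodes):
    """Return True while neither rollout budget has been exhausted (sketch)."""
    # An episode budget, when given, takes precedence over the step budget.
    if num_episodes:
        return episodes < num_episodes
    if num_steps:
        return steps < num_steps
    # No budget given: keep going indefinitely.
    return True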
Example #4
def rollout_loop(agent,
                 env_name,
                 num_steps,
                 num_episodes,
                 no_render=True,
                 fps=1000,
                 frameskip=1):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}
        # Without workers there is no policy_map; provide defaults so the
        # DefaultMapping lookups below still work for the single dummy agent.
        state_init = {DEFAULT_POLICY_ID: []}
        action_init = {DEFAULT_POLICY_ID: env.action_space.sample()}

    steps = 0
    full_episodes = 0
    last_render_start = time.time()
    avg_reward = collections.deque([], maxlen=100)

    while steps < (num_steps or steps + 1) and full_episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_episode = 0.0

        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    if isinstance(env.action_space, gym.spaces.Tuple):
                        a_action = TupleActions(a_action)
                        a_action = _unbatch_tuple_actions(a_action)[0]

                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            rewards = None

            for frame in range(frameskip):
                next_obs, reward, done, _ = env.step(action)
                if done:
                    log.info('Done at steps %d', steps)
                    break

                if rewards is None:
                    rewards = reward

                else:
                    if multiagent:
                        for agent_id, r in reward.items():
                            rewards[agent_id] += r
                    else:
                        rewards += reward

                if not no_render:
                    target_delay = 1.0 / fps if fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    # note: ASYNC_PLAYER mode actually makes this sleep redundant
                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                steps += 1
                obs = next_obs

            if multiagent:
                for agent_id, r in rewards.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = rewards

            if multiagent:
                done = done['__all__']
                reward_episode += 0 if rewards is None else sum(
                    rewards.values())
            else:
                reward_episode += 0 if rewards is None else rewards

        full_episodes += 1

        avg_reward.append(reward_episode)
        log.info('Reward episode: %.3f, avg_reward %.3f', reward_episode,
                 np.mean(avg_reward))

    env.reset()  # this guarantees that recordings are saved to disk
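
A hypothetical invocation of rollout_loop, assuming agent is a trainer restored from a checkpoint and the environment id is registered with Ray Tune (names and numbers below are placeholders):

# Render at 30 FPS, repeating each chosen action for 4 consecutive frames.
rollout_loop(agent, "my_env-v0",
             num_steps=10000, num_episodes=20,
             no_render=False, fps=30, frameskip=4)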
Example #5
File: rollout.py Project: valaxkong/SMARTS
def rollout(trainer, env_name, metrics_handler, num_steps, num_episodes,
            log_dir):
    """Reference: https://github.com/ray-project/ray/blob/master/rllib/rollout.py"""
    policy_agent_mapping = default_policy_agent_mapping
    assert hasattr(trainer, "workers") and isinstance(trainer.workers,
                                                      WorkerSet)
    env = trainer.workers.local_worker().env
    multiagent = isinstance(env, MultiAgentEnv)
    if trainer.workers.local_worker().multiagent:
        policy_agent_mapping = trainer.config["multiagent"][
            "policy_mapping_fn"]
    policy_map = trainer.workers.local_worker().policy_map
    state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
    use_lstm = {p: len(s) > 0 for p, s in state_init.items()}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    for episode in range(num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = trainer.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                        agent_states[agent_id] = p_state
                    else:
                        a_action = trainer.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)

            metrics_handler.log_step(
                episode=episode,
                observations=multi_obs,
                actions=action,
                rewards=reward,
                dones=done,
                infos=info,
            )

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            # filter dead agents
            if multiagent:
                next_obs = {
                    agent_id: obs
                    for agent_id, obs in next_obs.items() if not done[agent_id]
                }

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward

            step += 1
            obs = next_obs
        logger.info("\nEpisode #{}: steps: {} reward: {}".format(
            episode, step, reward_total))
        if done:
            episode += 1
    metrics_handler.write_to_csv(csv_dir=log_dir)
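
The metrics_handler argument only needs the two methods used above, log_step and write_to_csv. A minimal hypothetical stand-in (the recorded fields are illustrative, not part of the original project):

import csv
import os


class SimpleMetricsHandler:
    """Hypothetical handler exposing the interface rollout() expects."""

    def __init__(self):
        self.rows = []

    def log_step(self, episode, observations, actions, rewards, dones, infos):
        # Keep a per-step summary only; the real handler records richer data.
        self.rows.append({"episode": episode, "reward": rewards, "done": dones})

    def write_to_csv(self, csv_dir):
        os.makedirs(csv_dir, exist_ok=True)
        path = os.path.join(csv_dir, "rollout_metrics.csv")
        with open(path, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=["episode", "reward", "done"])
            writer.writeheader()
            writer.writerows(self.rows)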
Example #6
def rollout(agent,
            env_name,
            num_steps,
            out=None,
            no_render=True,
            intent_predict=False):
    policy_agent_mapping = default_policy_agent_mapping
    # if env_name is not None:
    #     env = gym.make(env_name)
    #     multiagent = False
    #     use_lstm = {DEFAULT_POLICY_ID: False}

    if hasattr(agent, "workers"):
        env = gym.make(
            env_name) if env_name is not None else agent.workers.local_worker(
            ).env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        raise ValueError('Env name/id is None and agent has no workers')

    if out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        if out is not None:
            rollout = []
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        # rollout one trajectory
        from urban_env.utils import print_execution_time
        import time
        current_wall_time = time.time()
        prev_step_time = 0
        prev_action_time = 0
        while not done and steps < (num_steps or steps + 1):

            current_wall_time = time.time()
            sim_loop_time = current_wall_time - prev_step_time
            action_loop_time = current_wall_time - prev_action_time

            if sim_loop_time < 1 / env.config["SIMULATION_FREQUENCY"]:
                #print("loop time (in ms) ", round(1e3*sim_loop_time, 2))
                continue

            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}

            if action_loop_time > 1 / env.config["POLICY_FREQUENCY"]:
                action = act(multi_obs, agent, multiagent, prev_actions,
                             prev_rewards, policy_agent_mapping, mapping_cache,
                             use_lstm)
            #current_wall_time = print_execution_time(current_wall_time, "After calculating action ")
            next_obs, reward, done, _ = env.step(action)
            prev_step_time = time.time()
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            policy_id = mapping_cache.setdefault(
                _DUMMY_AGENT_ID, policy_agent_mapping(_DUMMY_AGENT_ID))

            #current_wall_time = print_execution_time(current_wall_time, "Before intent pred ")
            if intent_predict:

                projections = predict_one_step_of_rollout(
                    env, agent, multi_obs, action, reward, policy_id, False)
                env.vehicle.projection = projections
                # env.intent_pred = True
                no_render = False
            #current_wall_time = print_execution_time(current_wall_time, "After intent pred ")

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            if out is not None:
                rollout.append([obs, action, next_obs, reward, done])
            steps += 1
            obs = next_obs
        if out is not None:
            rollouts.append(rollout)
        print("Episode reward", reward_total)

    if out is not None:
        pickle.dump(rollouts, open(out, "wb"))
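
When out is given, each episode is stored as a list of [obs, action, next_obs, reward, done] steps and the list of episodes is pickled to that path. Reading the file back is straightforward (the file name below is whatever was passed as out):

import pickle

with open("rollouts.pkl", "rb") as f:
    rollouts = pickle.load(f)

# rollouts[i][t] == [obs, action, next_obs, reward, done] for step t of episode i
print(len(rollouts), "episodes loaded")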