def own_rollout(agent, num_episodes):
    """Roll out `num_episodes` multi-agent episodes and collect the
    env.determine_winner() result of each one."""
    results = []
    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]
            policy_map = agent.workers.local_worker().policy_map
            state_init = {
                p: m.get_initial_state()
                for p, m in policy_map.items()
            }
        else:
            raise NotImplementedError("Multi-Agent only")
        action_init = {
            p: flatten_to_single_ndarray(m.action_space.sample())
            for p, m in policy_map.items()
        }

    episodes = 0
    while episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        while not done and (episodes < num_episodes):
            action_dict = {}
            for agent_id, a_obs in obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    a_action = agent.compute_action(
                        a_obs,
                        prev_action=prev_actions[agent_id],
                        prev_reward=prev_rewards[agent_id],
                        policy_id=policy_id)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            next_obs, reward, done, info = env.step(action)
            done = done["__all__"]
            # update per-agent rewards for the next compute_action call
            for agent_id, r in reward.items():
                prev_rewards[agent_id] = r
            obs = next_obs
            if done:
                episodes += 1
                # specific function for alternate game.
                results.append(env.determine_winner())
    return results

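# The functions in this file lean on a few helpers that are not defined in
# this section. Below is a minimal sketch matching how they are used here;
# RLlib's rllib/rollout.py ships equivalent definitions (the import path of
# DEFAULT_POLICY_ID varies across Ray versions, so treat it as an
# assumption):
import collections

from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID

# Single-policy setups map every agent id to the default policy.
default_policy_agent_mapping = lambda unused_agent_id: DEFAULT_POLICY_ID


class DefaultMapping(collections.defaultdict):
    """defaultdict whose default_factory receives the missing key.

    This is what lets `prev_actions[agent_id]` above lazily resolve to
    `action_init[mapping_cache[agent_id]]` on first access, per agent.
    """

    def __missing__(self, key):
        self[key] = value = self.default_factory(key)
        return value
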
def rollout(trainer, env_name, num_steps, num_episodes=0):
    """Roll out `num_episodes` episodes of at most `num_steps` steps each and
    report aggregate metrics at the end."""
    policy_agent_mapping = default_policy_agent_mapping
    if hasattr(trainer, "workers") and isinstance(trainer.workers, WorkerSet):
        env = trainer.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if trainer.workers.local_worker().multiagent:
            policy_agent_mapping = trainer.config["multiagent"][
                "policy_mapping_fn"]
        policy_map = trainer.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: trainer.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(trainer))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    metrics_obj = metrics.Metric(num_episodes)
    for episode in range(num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = trainer.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = trainer.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            metrics_obj.log_step(multi_obs, reward, done, info, episode=episode)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            # filter out agents that are already done
            if multiagent:
                next_obs = {
                    agent_id: obs
                    for agent_id, obs in next_obs.items()
                    if not done[agent_id]
                }
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            step += 1
            obs = next_obs
        print("\nEpisode #{}: steps: {} reward: {}".format(
            episode, step, reward_total))
    print("\n metrics: {}".format(metrics_obj.compute()))

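# `metrics.Metric` above is project-specific and not shown in this section.
# A minimal, hypothetical stand-in consistent with the three calls made on it
# (constructor, log_step, compute); the real class may track far more:
import collections


class Metric:
    def __init__(self, num_episodes):
        self.num_episodes = num_episodes
        self.episode_rewards = collections.defaultdict(float)

    def log_step(self, multi_obs, reward, done, info, episode=None):
        # `reward` is a per-agent dict in the multi-agent case, a scalar
        # otherwise.
        if isinstance(reward, dict):
            self.episode_rewards[episode] += sum(
                r for r in reward.values() if r is not None)
        else:
            self.episode_rewards[episode] += reward

    def compute(self):
        rewards = list(self.episode_rewards.values())
        mean = sum(rewards) / len(rewards) if rewards else 0.0
        return {"mean_episode_reward": mean}
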
def internal_rollout(worker,
                     num_steps,
                     policy_map=None,
                     policy_agent_mapping=None,
                     reset_env_before=True,
                     num_episodes=0,
                     last_obs=None,
                     saver=None,
                     no_render=True,
                     video_dir=None,
                     seed=None,
                     explore=None):
    """Perform rollouts on the environment from inside a rollout worker or
    from a policy, e.g. during the evaluation rollouts run by an RLlib
    Trainer.

    :param worker: worker from an RLlib Trainer. The internal rollouts will
        be run inside this worker, using its policies.
    :param num_steps: maximum number of steps to perform in total
    :param policy_map: (optional) by default the policy_map of the provided
        worker will be used
    :param policy_agent_mapping: (optional) by default the policy_mapping_fn
        of the provided worker will be used
    :param reset_env_before: (optional) reset the environment from the worker
        before first using it
    :param num_episodes: (optional) maximum number of episodes to perform
    :param last_obs: (optional) if reset_env_before is False then you must
        provide the last observation
    :param saver: (optional) an instance of a RolloutManager
    :param no_render: (optional) if False, render the environment via
        env.render()
    :param video_dir: (optional) directory in which the gym Monitor wrapper
        records every episode
    :param seed: (optional) random seed to set for the environment by calling
        env.seed(seed)
    :param explore: (optional) exploration setting forwarded to the policies
    :return: an instance of a RolloutManager, which contains the data about
        the rollouts performed
    """
    assert num_steps is not None or num_episodes is not None
    assert reset_env_before or last_obs is not None

    if saver is None:
        saver = RolloutManager()

    env = copy.deepcopy(worker.env)
    if hasattr(env, "seed") and callable(env.seed):
        env.seed(seed)

    multiagent = isinstance(env, MultiAgentEnv)
    if policy_agent_mapping is None:
        if worker.multiagent:
            policy_agent_mapping = worker.policy_config["multiagent"][
                "policy_mapping_fn"]
        else:
            policy_agent_mapping = default_policy_agent_mapping

    if policy_map is None:
        policy_map = worker.policy_map
    state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
    use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym_wrappers.Monitor(
            env=env,
            directory=video_dir,
            video_callable=lambda x: True,
            force=True)

    random_policy_id = list(policy_map.keys())[0]
    virtual_global_timestep = worker.get_policy(
        random_policy_id).global_timestep

    steps = 0
    episodes = 0
    while _keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        if reset_env_before or episodes > 0:
            obs = env.reset()
        else:
            obs = last_obs
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and _keep_going(steps, num_steps, episodes,
                                       num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            virtual_global_timestep += 1
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = _worker_compute_action(
                            worker,
                            timestep=virtual_global_timestep,
                            observation=a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                            explore=explore)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = _worker_compute_action(
                            worker,
                            virtual_global_timestep,
                            observation=a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                            explore=explore)
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(
                    r for r in reward.values() if r is not None)
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        if done:
            episodes += 1
    return saver

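# `_keep_going` and `_worker_compute_action` are also not defined in this
# section. `_keep_going` below mirrors the stopping logic implied by its call
# sites (stop on whichever limit is set; run forever if neither is).
def _keep_going(steps, num_steps, episodes, num_episodes):
    """Determine whether we've run enough steps or episodes."""
    if num_episodes and episodes >= num_episodes:
        return False
    if num_steps and steps >= num_steps:
        return False
    return True


# `_worker_compute_action` is sketched under the assumption that it wraps
# Policy.compute_single_action (present in RLlib 1.x; the exact signature
# varies across versions) and matches the two calling conventions used in
# `internal_rollout` above: a triple when a recurrent state is passed, a
# bare action otherwise.
def _worker_compute_action(worker, timestep, observation, policy_id,
                           state=None, prev_action=None, prev_reward=None,
                           explore=None):
    policy = worker.get_policy(policy_id)
    action, state_out, info = policy.compute_single_action(
        observation,
        state=state or [],
        prev_action=prev_action,
        prev_reward=prev_reward,
        explore=explore,
        timestep=timestep)
    if state:
        return action, state_out, info
    return action
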
def rollout_loop(agent,
                 env_name,
                 num_steps,
                 num_episodes,
                 no_render=True,
                 fps=1000,
                 frameskip=1):
    """Run rollouts with optional rendering throttled to `fps` and with
    actions repeated for `frameskip` frames."""
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    steps = 0
    full_episodes = 0
    last_render_start = time.time()
    avg_reward = collections.deque([], maxlen=100)

    while steps < (num_steps or steps + 1) and full_episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_episode = 0.0

        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    if isinstance(env.action_space, gym.spaces.Tuple):
                        a_action = TupleActions(a_action)
                        a_action = _unbatch_tuple_actions(a_action)[0]

                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]

            rewards = None
            for frame in range(frameskip):
                next_obs, reward, done, _ = env.step(action)
                if done:
                    log.info('Done at steps %d', steps)
                    break

                if rewards is None:
                    rewards = reward
                else:
                    if multiagent:
                        for agent_id, r in reward.items():
                            rewards[agent_id] += r
                    else:
                        rewards += reward

                if not no_render:
                    target_delay = 1.0 / fps if fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay
                    # note: ASYNC_PLAYER mode actually makes this sleep
                    # redundant
                    if time_wait > 0:
                        time.sleep(time_wait)
                    last_render_start = time.time()
                    env.render()

            steps += 1
            obs = next_obs

            # Guard against `rewards` being None when the episode ended on
            # the very first frame of the frameskip loop.
            if rewards is not None:
                if multiagent:
                    for agent_id, r in rewards.items():
                        prev_rewards[agent_id] = r
                else:
                    prev_rewards[_DUMMY_AGENT_ID] = rewards

            if multiagent:
                done = done['__all__']
                reward_episode += 0 if rewards is None else sum(
                    rewards.values())
            else:
                reward_episode += 0 if rewards is None else rewards

        full_episodes += 1
        avg_reward.append(reward_episode)
        log.info('Reward episode: %.3f, avg_reward %.3f',
                 reward_episode, np.mean(avg_reward))

    env.reset()  # this guarantees that recordings are saved to disk

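# `TupleActions` and `_unbatch_tuple_actions` come from older RLlib releases
# and are not defined here. A rough sketch of the behavior the loop above
# relies on (treat names and shapes as assumptions, not canonical RLlib
# code): a TupleActions holds one batch per action component, and unbatching
# turns that into one per-step list of components.
import collections

TupleActions = collections.namedtuple("TupleActions", ["batches"])


def _unbatch_tuple_actions(action_batch):
    # ([comp0_batch, comp1_batch, ...]) -> [[comp0, comp1, ...], ...]
    out = []
    for j in range(len(action_batch.batches[0])):
        out.append([batch[j] for batch in action_batch.batches])
    return out
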
def rollout(trainer, env_name, metrics_handler, num_steps, num_episodes,
            log_dir):
    """Reference: https://github.com/ray-project/ray/blob/master/rllib/rollout.py"""
    policy_agent_mapping = default_policy_agent_mapping

    assert hasattr(trainer, "workers") and isinstance(trainer.workers,
                                                      WorkerSet)
    env = trainer.workers.local_worker().env
    multiagent = isinstance(env, MultiAgentEnv)
    if trainer.workers.local_worker().multiagent:
        policy_agent_mapping = trainer.config["multiagent"][
            "policy_mapping_fn"]

    policy_map = trainer.workers.local_worker().policy_map
    state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
    use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    for episode in range(num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = trainer.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                        agent_states[agent_id] = p_state
                    else:
                        a_action = trainer.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            metrics_handler.log_step(
                episode=episode,
                observations=multi_obs,
                actions=action,
                rewards=reward,
                dones=done,
                infos=info,
            )
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            # filter out agents that are already done
            if multiagent:
                next_obs = {
                    agent_id: obs
                    for agent_id, obs in next_obs.items()
                    if not done[agent_id]
                }
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            step += 1
            obs = next_obs
        logger.info("\nEpisode #{}: steps: {} reward: {}".format(
            episode, step, reward_total))
    metrics_handler.write_to_csv(csv_dir=log_dir)

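# Hypothetical driver for the variant above. `SimpleHandler` is an
# illustrative stand-in for the project's metrics handler, implementing just
# the two methods the function calls; the PPO config is likewise only an
# example.
def _example_rollout_run(log_dir="/tmp/rollout_logs"):
    import os

    import ray
    from ray.rllib.agents.ppo import PPOTrainer

    class SimpleHandler:
        def __init__(self):
            self.rows = []

        def log_step(self, episode, observations, actions, rewards, dones,
                     infos):
            self.rows.append((episode, rewards))

        def write_to_csv(self, csv_dir):
            os.makedirs(csv_dir, exist_ok=True)
            with open(os.path.join(csv_dir, "steps.csv"), "w") as f:
                for episode, rewards in self.rows:
                    f.write("{},{}\n".format(episode, rewards))

    ray.init(ignore_reinit_error=True)
    # num_workers=0 keeps sampling on the driver so local_worker() has an env.
    trainer = PPOTrainer(config={"env": "CartPole-v0", "num_workers": 0})
    rollout(trainer, "CartPole-v0", SimpleHandler(),
            num_steps=200, num_episodes=2, log_dir=log_dir)
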
def rollout(agent,
            env_name,
            num_steps,
            out=None,
            no_render=True,
            intent_predict=False):
    """Roll out episodes, optionally predicting one-step intents and saving
    the trajectories to `out`."""
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = gym.make(env_name) if env_name is not None \
            else agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        raise ValueError('Env name/id is None and agent has no workers')

    if out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        if out is not None:
            rollout = []
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0

        # roll out one trajectory, throttled to the simulation and policy
        # frequencies configured in the environment
        prev_step_time = 0
        prev_action_time = 0
        while not done and steps < (num_steps or steps + 1):
            current_wall_time = time.time()
            sim_loop_time = current_wall_time - prev_step_time
            action_loop_time = current_wall_time - prev_action_time
            if sim_loop_time < 1 / env.config["SIMULATION_FREQUENCY"]:
                continue
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            if action_loop_time > 1 / env.config["POLICY_FREQUENCY"]:
                action = act(multi_obs, agent, multiagent, prev_actions,
                             prev_rewards, policy_agent_mapping,
                             mapping_cache, use_lstm)
                # throttle action recomputation to POLICY_FREQUENCY
                prev_action_time = time.time()
            next_obs, reward, done, _ = env.step(action)
            prev_step_time = time.time()
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            policy_id = mapping_cache.setdefault(
                _DUMMY_AGENT_ID, policy_agent_mapping(_DUMMY_AGENT_ID))
            if intent_predict:
                projections = predict_one_step_of_rollout(
                    env, agent, multi_obs, action, reward, policy_id, False)
                env.vehicle.projection = projections
                no_render = False
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            if out is not None:
                rollout.append([obs, action, next_obs, reward, done])
            steps += 1
            obs = next_obs
        if out is not None:
            rollouts.append(rollout)
        print("Episode reward", reward_total)
    if out is not None:
        pickle.dump(rollouts, open(out, "wb"))

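# Example of consuming the rollout file written above via `out=`. The path is
# illustrative, and a single-agent env (scalar rewards) is assumed.
import pickle


def summarize_rollouts(path="rollouts.pkl"):
    with open(path, "rb") as f:
        rollouts = pickle.load(f)
    for i, trajectory in enumerate(rollouts):
        # Each step was appended as [obs, action, next_obs, reward, done].
        total = sum(step[3] for step in trajectory)
        print("trajectory {}: {} steps, return {}".format(
            i, len(trajectory), total))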