def last_action_for(self, agent_id=_DUMMY_AGENT_ID):
    """Returns the last action for the specified agent, or zeros."""

    if agent_id in self._agent_to_last_action:
        return flatten_to_single_ndarray(
            self._agent_to_last_action[agent_id])
    else:
        policy = self._policies[self.policy_for(agent_id)]
        flat = flatten_to_single_ndarray(policy.action_space.sample())
        return np.zeros_like(flat)
def last_action_for(self, agent_id: AgentID = _DUMMY_AGENT_ID) -> EnvActionType:
    """Returns the last action for the specified agent, or zeros."""

    if agent_id in self._agent_to_last_action:
        return flatten_to_single_ndarray(
            self._agent_to_last_action[agent_id])
    else:
        policy = self._policies[self.policy_for(agent_id)]
        flat = flatten_to_single_ndarray(policy.action_space.sample())
        if hasattr(policy.action_space, "dtype"):
            return np.zeros_like(flat, dtype=policy.action_space.dtype)
        return np.zeros_like(flat)
def prev_action_for(self, agent_id: AgentID = _DUMMY_AGENT_ID) -> EnvActionType:
    """Returns the previous action for the specified agent, or zeros.

    The "previous" action is the one taken one timestep before the most
    recent action taken by the agent.

    Args:
        agent_id: The agent's ID to get the previous action for.

    Returns:
        Previous action the specified AgentID has executed.
        Zero in case the agent has never performed any actions (or only
        one) in the episode.
    """
    policy_id = self.policy_for(agent_id)
    policy = self.policy_map[policy_id]

    # We are at t > 1 -> There has been a previous action by this agent.
    if agent_id in self._agent_to_prev_action:
        if policy.config.get("_disable_action_flattening"):
            return self._agent_to_prev_action[agent_id]
        else:
            return flatten_to_single_ndarray(
                self._agent_to_prev_action[agent_id])
    # We're at t <= 1, so return all zeros.
    else:
        if policy.config.get("_disable_action_flattening"):
            return tree.map_structure(
                lambda a: np.zeros_like(a, a.dtype)
                if hasattr(a, "dtype") else np.zeros_like(a),
                self.last_action_for(agent_id),
            )
        else:
            return np.zeros_like(self.last_action_for(agent_id))
def prev_action_for(self, agent_id=_DUMMY_AGENT_ID):
    """Returns the previous action for the specified agent."""

    if agent_id in self._agent_to_prev_action:
        return flatten_to_single_ndarray(
            self._agent_to_prev_action[agent_id])
    else:
        # We're at t=0, so return all zeros.
        return np.zeros_like(self.last_action_for(agent_id))
def last_action_for(self, agent_id: AgentID = _DUMMY_AGENT_ID) -> EnvActionType:
    """Returns the last action for the specified AgentID, or zeros.

    The "last" action is the most recent one taken by the agent.

    Args:
        agent_id: The agent's ID to get the last action for.

    Returns:
        Last action the specified AgentID has executed.
        Zeros in case the agent has never performed any actions in the
        episode.
    """
    policy_id = self.policy_for(agent_id)
    policy = self.policy_map[policy_id]

    # Agent has already taken at least one action in the episode.
    if agent_id in self._agent_to_last_action:
        if policy.config.get("_disable_action_flattening"):
            return self._agent_to_last_action[agent_id]
        else:
            return flatten_to_single_ndarray(
                self._agent_to_last_action[agent_id])
    # Agent has not acted yet, return all zeros.
    else:
        if policy.config.get("_disable_action_flattening"):
            return tree.map_structure(
                lambda s: np.zeros_like(s.sample(), s.dtype)
                if hasattr(s, "dtype") else np.zeros_like(s.sample()),
                policy.action_space_struct,
            )
        else:
            flat = flatten_to_single_ndarray(policy.action_space.sample())
            if hasattr(policy.action_space, "dtype"):
                return np.zeros_like(flat, dtype=policy.action_space.dtype)
            return np.zeros_like(flat)
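The helpers above all rely on flattening a (possibly nested) action into one 1-D array and then taking zeros of that array as the "agent has not acted yet" placeholder. The following is a minimal, self-contained sketch of that idea; `flatten_action` is a hypothetical stand-in for RLlib's `flatten_to_single_ndarray`, not its actual implementation.

import numpy as np
import tree  # dm-tree, the same structure utility used in the code above


def flatten_action(action):
    # Hypothetical stand-in: flatten every leaf of a nested action and
    # concatenate them into a single 1-D array.
    leaves = tree.flatten(action)
    if not leaves:
        return np.array([], dtype=np.float32)
    return np.concatenate([np.asarray(a).reshape(-1) for a in leaves])


# A nested, Dict-style action sample (made up for illustration).
nested_action = {"steer": np.array([0.25], dtype=np.float32), "gear": 2}

flat = flatten_action(nested_action)  # one 1-D array containing both leaves
noop = np.zeros_like(flat)            # "never acted" placeholder, same shape
print(flat.shape, noop)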
def rollout(trainer, env_name, num_steps, num_episodes=0):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(trainer, "workers") and isinstance(trainer.workers, WorkerSet):
        env = trainer.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if trainer.workers.local_worker().multiagent:
            policy_agent_mapping = trainer.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = trainer.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: trainer.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(trainer)
            )
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    metrics_obj = metrics.Metric(num_episodes)

    for episode in range(num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]]
        )
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]]
        )
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        step = 0
        while not done and step < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id)
                    )
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = trainer.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                        agent_states[agent_id] = p_state
                    else:
                        a_action = trainer.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id,
                        )
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            metrics_obj.log_step(multi_obs, reward, done, info, episode=episode)

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            # Filter dead agents.
            if multiagent:
                next_obs = {
                    agent_id: obs
                    for agent_id, obs in next_obs.items()
                    if not done[agent_id]
                }

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            step += 1
            obs = next_obs
        print("\nEpisode #{}: steps: {} reward: {}".format(
            episode, step, reward_total))
        if done:
            episode += 1
    print("\n metrics: {}".format(metrics_obj.compute()))
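The rollout function above (and the variants that follow) all share the same control flow: reset, query the trained policy for an action, step the environment, accumulate reward, repeat until done or a step budget is hit. A minimal, hedged sketch of that loop for the single-agent case is below; the environment name and the random stand-in for `trainer.compute_action` are assumptions for illustration only, and the classic Gym step/reset API used in the snippets above is assumed.

import gym


def compute_action(obs, env):
    # Stand-in for trainer.compute_action(obs, ...): a random policy.
    return env.action_space.sample()


env = gym.make("CartPole-v1")
num_episodes, num_steps = 2, 200

for episode in range(num_episodes):
    obs = env.reset()  # classic Gym API: reset() returns only the observation
    done, reward_total, step = False, 0.0, 0
    while not done and step < num_steps:
        action = compute_action(obs, env)
        obs, reward, done, info = env.step(action)  # classic 4-tuple step API
        reward_total += reward
        step += 1
    print("Episode #{}: steps: {} reward: {}".format(
        episode, step, reward_total))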
def rollout(agent, env_name, num_steps, num_episodes=0, saver=None, no_render=True, video_dir=None): policy_agent_mapping = default_policy_agent_mapping if saver is None: saver = RolloutSaver() # Normal case: Agent was setup correctly with an evaluation WorkerSet, # which we will now use to rollout. if hasattr(agent, "evaluation_workers") and isinstance( agent.evaluation_workers, WorkerSet): steps = 0 episodes = 0 while keep_going(steps, num_steps, episodes, num_episodes): saver.begin_rollout() eval_result = agent.evaluate()["evaluation"] # Increase timestep and episode counters. eps = agent.config["evaluation_num_episodes"] episodes += eps steps += eps * eval_result["episode_len_mean"] # Print out results and continue. print("Episode #{}: reward: {}".format( episodes, eval_result["episode_reward_mean"])) saver.end_rollout() return # Agent has no evaluation workers, but RolloutWorkers. elif hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} # Agent has neither evaluation- nor rollout workers. else: from gym import envs if envs.registry.env_specs.get(agent.config["env"]): # if environment is gym environment, load from gym env = gym.make(agent.config["env"]) else: # if environment registered ray environment, load from ray env_creator = _global_registry.get(ENV_CREATOR, agent.config["env"]) env_context = EnvContext(agent.config["env_config"] or {}, worker_index=0) env = env_creator(env_context) multiagent = False try: policy_map = {DEFAULT_POLICY_ID: agent.policy} except AttributeError: raise AttributeError( "Agent ({}) does not have a `policy` property! This is needed " "for performing (trained) agent rollouts.".format(agent)) use_lstm = {DEFAULT_POLICY_ID: False} action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } # If monitoring has been requested, manually wrap our environment with a # gym monitor, which is set to record every episode. if video_dir: env = gym_wrappers.Monitor(env=env, directory=video_dir, video_callable=lambda _: True, force=True) steps = 0 episodes = 0 while keep_going(steps, num_steps, episodes, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic saver.begin_rollout() obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) 
done = False reward_total = 0.0 while not done and keep_going(steps, num_steps, episodes, num_episodes): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_single_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_single_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(r for r in reward.values() if r is not None) else: reward_total += reward if not no_render: env.render() saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs saver.end_rollout() print("Episode #{}: reward: {}".format(episodes, reward_total)) if done: episodes += 1
def _process_observations( worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy], batch_builder_pool: List[MultiAgentSampleBatchBuilder], active_episodes: Dict[str, MultiAgentEpisode], unfiltered_obs: Dict[EnvID, Dict[AgentID, EnvObsType]], rewards: Dict[EnvID, Dict[AgentID, float]], dones: Dict[EnvID, Dict[AgentID, bool]], infos: Dict[EnvID, Dict[AgentID, EnvInfoDict]], horizon: int, preprocessors: Dict[PolicyID, Preprocessor], obs_filters: Dict[PolicyID, Filter], rollout_fragment_length: int, pack_multiple_episodes_in_batch: bool, callbacks: "DefaultCallbacks", soft_horizon: bool, no_done_at_end: bool, observation_fn: "ObservationFunction", _use_trajectory_view_api: bool = False ) -> Tuple[Set[EnvID], Dict[PolicyID, List[PolicyEvalData]], List[Union[ RolloutMetrics, SampleBatchType]]]: """Record new data from the environment and prepare for policy evaluation. Args: worker (RolloutWorker): Reference to the current rollout worker. base_env (BaseEnv): Env implementing BaseEnv. policies (dict): Map of policy ids to Policy instances. batch_builder_pool (List[SampleBatchBuilder]): List of pooled SampleBatchBuilder object for recycling. active_episodes (Dict[str, MultiAgentEpisode]): Mapping from episode ID to currently ongoing MultiAgentEpisode object. unfiltered_obs (dict): Doubly keyed dict of env-ids -> agent ids -> unfiltered observation tensor, returned by a `BaseEnv.poll()` call. rewards (dict): Doubly keyed dict of env-ids -> agent ids -> rewards tensor, returned by a `BaseEnv.poll()` call. dones (dict): Doubly keyed dict of env-ids -> agent ids -> boolean done flags, returned by a `BaseEnv.poll()` call. infos (dict): Doubly keyed dict of env-ids -> agent ids -> info dicts, returned by a `BaseEnv.poll()` call. horizon (int): Horizon of the episode. preprocessors (dict): Map of policy id to preprocessor for the observations prior to filtering. obs_filters (dict): Map of policy id to filter used to process observations for the policy. rollout_fragment_length (int): Number of episode steps before `SampleBatch` is yielded. Set to infinity to yield complete episodes. pack_multiple_episodes_in_batch (bool): Whether to pack multiple episodes into each batch. This guarantees batches will be exactly `rollout_fragment_length` in size. callbacks (DefaultCallbacks): User callbacks to run on episode events. soft_horizon (bool): Calculate rewards but don't reset the environment when the horizon is hit. no_done_at_end (bool): Ignore the done=True at the end of the episode and instead record done=False. observation_fn (ObservationFunction): Optional multi-agent observation func to use for preprocessing observations. _use_trajectory_view_api (bool): Whether to use the (experimental) `_use_trajectory_view_api` to make generic trajectory views available to Models. Default: False. Returns: Tuple: - active_envs: Set of non-terminated env ids. - to_eval: Map of policy_id to list of agent PolicyEvalData. - outputs: List of metrics and samples to return from the sampler. """ # Output objects. active_envs: Set[EnvID] = set() to_eval: Dict[PolicyID, List[PolicyEvalData]] = defaultdict(list) outputs: List[Union[RolloutMetrics, SampleBatchType]] = [] large_batch_threshold: int = max(1000, rollout_fragment_length * 10) if \ rollout_fragment_length != float("inf") else 5000 # For each environment. 
# type: EnvID, Dict[AgentID, EnvObsType] for env_id, agent_obs in unfiltered_obs.items(): is_new_episode: bool = env_id not in active_episodes episode: MultiAgentEpisode = active_episodes[env_id] if not is_new_episode: episode.length += 1 episode.batch_builder.count += 1 episode._add_agent_rewards(rewards[env_id]) if (episode.batch_builder.total() > large_batch_threshold and log_once("large_batch_warning")): logger.warning( "More than {} observations for {} env steps ".format( episode.batch_builder.total(), episode.batch_builder.count) + "are buffered in " "the sampler. If this is more than you expected, check " "that you set a horizon on your environment correctly and " "that it terminates at some point. " "Note: In multi-agent environments, `rollout_fragment_length` " "sets the batch size based on environment steps, not the " "steps of " "individual agents, which can result in unexpectedly large " "batches. Also, you may be in evaluation waiting for your Env " "to terminate (batch_mode=`complete_episodes`). Make sure it " "does at some point.") # Check episode termination conditions. if dones[env_id]["__all__"] or episode.length >= horizon: hit_horizon = (episode.length >= horizon and not dones[env_id]["__all__"]) all_agents_done = True atari_metrics: List[RolloutMetrics] = _fetch_atari_metrics( base_env) if atari_metrics is not None: for m in atari_metrics: outputs.append( m._replace(custom_metrics=episode.custom_metrics)) else: outputs.append( RolloutMetrics(episode.length, episode.total_reward, dict(episode.agent_rewards), episode.custom_metrics, {}, episode.hist_data)) else: hit_horizon = False all_agents_done = False active_envs.add(env_id) # Custom observation function is applied before preprocessing. if observation_fn: agent_obs: Dict[AgentID, EnvObsType] = observation_fn( agent_obs=agent_obs, worker=worker, base_env=base_env, policies=policies, episode=episode) if not isinstance(agent_obs, dict): raise ValueError( "observe() must return a dict of agent observations") # For each agent in the environment. # type: AgentID, EnvObsType for agent_id, raw_obs in agent_obs.items(): assert agent_id != "__all__" policy_id: PolicyID = episode.policy_for(agent_id) prep_obs: EnvObsType = _get_or_raise(preprocessors, policy_id).transform(raw_obs) if log_once("prep_obs"): logger.info("Preprocessed obs: {}".format(summarize(prep_obs))) filtered_obs: EnvObsType = _get_or_raise(obs_filters, policy_id)(prep_obs) if log_once("filtered_obs"): logger.info("Filtered obs: {}".format(summarize(filtered_obs))) agent_done = bool(all_agents_done or dones[env_id].get(agent_id)) if not agent_done: to_eval[policy_id].append( PolicyEvalData(env_id, agent_id, filtered_obs, infos[env_id].get(agent_id, {}), episode.rnn_state_for(agent_id), episode.last_action_for(agent_id), rewards[env_id][agent_id] or 0.0)) last_observation: EnvObsType = episode.last_observation_for( agent_id) episode._set_last_observation(agent_id, filtered_obs) episode._set_last_raw_obs(agent_id, raw_obs) episode._set_last_info(agent_id, infos[env_id].get(agent_id, {})) # Record transition info if applicable. 
if (last_observation is not None and infos[env_id].get( agent_id, {}).get("training_enabled", True)): episode.batch_builder.add_values( agent_id, policy_id, t=episode.length - 1, eps_id=episode.episode_id, agent_index=episode._agent_index(agent_id), obs=last_observation, actions=episode.last_action_for(agent_id), rewards=rewards[env_id][agent_id], prev_actions=episode.prev_action_for(agent_id), prev_rewards=episode.prev_reward_for(agent_id), dones=(False if (no_done_at_end or (hit_horizon and soft_horizon)) else agent_done), infos=infos[env_id].get(agent_id, {}), new_obs=filtered_obs, **episode.last_pi_info_for(agent_id)) # Invoke the step callback after the step is logged to the episode callbacks.on_episode_step( worker=worker, base_env=base_env, episode=episode) # Cut the batch if ... # - all-agents-done and not packing multiple episodes into one # (batch_mode="complete_episodes") # - or if we've exceeded the rollout_fragment_length. if episode.batch_builder.has_pending_agent_data(): # Sanity check, whether all agents have done=True, if done[__all__] # is True. if dones[env_id]["__all__"] and not no_done_at_end: episode.batch_builder.check_missing_dones() # Reached end of episode and we are not allowed to pack the # next episode into the same SampleBatch -> Build the SampleBatch # and add it to "outputs". if (all_agents_done and not pack_multiple_episodes_in_batch) or \ episode.batch_builder.count >= rollout_fragment_length: outputs.append(episode.batch_builder.build_and_reset(episode)) # Make sure postprocessor stays within one episode. elif all_agents_done: episode.batch_builder.postprocess_batch_so_far(episode) # Episode is done. if all_agents_done: # Handle episode termination. batch_builder_pool.append(episode.batch_builder) # Call each policy's Exploration.on_episode_end method. for p in policies.values(): if getattr(p, "exploration", None) is not None: p.exploration.on_episode_end( policy=p, environment=base_env, episode=episode, tf_sess=getattr(p, "_sess", None)) # Call custom on_episode_end callback. callbacks.on_episode_end( worker=worker, base_env=base_env, policies=policies, episode=episode) if hit_horizon and soft_horizon: episode.soft_reset() resetted_obs: Dict[AgentID, EnvObsType] = agent_obs else: del active_episodes[env_id] resetted_obs: Dict[AgentID, EnvObsType] = base_env.try_reset( env_id) if resetted_obs is None: # Reset not supported, drop this env from the ready list. if horizon != float("inf"): raise ValueError( "Setting episode horizon requires reset() support " "from the environment.") elif resetted_obs != ASYNC_RESET_RETURN: # Creates a new episode if this is not async return. 
# If reset is async, we will get its result in some future poll episode: MultiAgentEpisode = active_episodes[env_id] if observation_fn: resetted_obs: Dict[AgentID, EnvObsType] = observation_fn( agent_obs=resetted_obs, worker=worker, base_env=base_env, policies=policies, episode=episode) # type: AgentID, EnvObsType for agent_id, raw_obs in resetted_obs.items(): policy_id: PolicyID = episode.policy_for(agent_id) policy: Policy = _get_or_raise(policies, policy_id) prep_obs: EnvObsType = _get_or_raise( preprocessors, policy_id).transform(raw_obs) filtered_obs: EnvObsType = _get_or_raise( obs_filters, policy_id)(prep_obs) episode._set_last_observation(agent_id, filtered_obs) to_eval[policy_id].append( PolicyEvalData( env_id, agent_id, filtered_obs, episode.last_info_for(agent_id) or {}, episode.rnn_state_for(agent_id), np.zeros_like( flatten_to_single_ndarray( policy.action_space.sample())), 0.0)) return active_envs, to_eval, outputs
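A central pattern in `_process_observations` above is grouping the per-agent `PolicyEvalData` entries by the policy that controls each agent, so every policy can later run one batched forward pass over all of its agents. The sketch below is a hypothetical, trimmed-down illustration of that grouping; the `PolicyEvalData` fields and the `policy_for` mapping are simplified assumptions, not the actual RLlib types.

from collections import defaultdict, namedtuple

# Simplified stand-in for RLlib's PolicyEvalData (fields reduced for clarity).
PolicyEvalData = namedtuple("PolicyEvalData", ["env_id", "agent_id", "obs"])


def group_by_policy(agent_obs, policy_for, env_id=0):
    # Bucket each agent's observation under the policy that controls it.
    to_eval = defaultdict(list)
    for agent_id, obs in agent_obs.items():
        to_eval[policy_for(agent_id)].append(
            PolicyEvalData(env_id, agent_id, obs))
    return to_eval


# Example: two agents mapped onto a single shared policy.
print(group_by_policy({"agent_0": [0.1], "agent_1": [0.2]},
                      policy_for=lambda agent_id: "shared_policy"))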
def run_rollout(agent, env, env_name, num_steps, num_episodes=0, no_render=True, video_dir=None, args=None): """ Rollout execution function. This was largely inherited from RLlib. :agent: Agent: Rllib agent :env: Env: Gym environment :env_name: str: Env id / name :num_steps: Int: number of steps :num_episodes: Int: Number of episodes :no_render: bool: Whether to render environment for visual inspection :video_dir: str: Video storage path :args: Argparse.Args: User defined arguments """ policy_agent_mapping = default_policy_agent_mapping # Normal case: Agent was setup correctly with an evaluation WorkerSet, # which we will now use to rollout. if hasattr(agent, "evaluation_workers") and isinstance( agent.evaluation_workers, WorkerSet): steps = 0 episodes = 0 while keep_going(steps, num_steps, episodes, num_episodes): eval_result = agent.evaluate()["evaluation"] # Increase timestep and episode counters. eps = agent.config["evaluation_num_episodes"] episodes += eps steps += eps * eval_result["episode_len_mean"] # Print out results and continue. logging.info("Episode #{}: reward: {}".format( episodes, eval_result["episode_reward_mean"])) return # Agent has no evaluation workers, but RolloutWorkers. elif hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} logging.warn( """\nWARNING: You are either spinning up a random agent (untrained, no checkpoint) or you have a malformed checkpoint object with no evaluation workers. You may run rollouts in this way, but rollouts will be slow. Instead, read in a checkpoint from the very beginning of your model training for faster rollouts.\n\n NOTE: You CANNOT run interestingness analysis with these rollouts, as they only include information about the environment""") action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } # If monitoring has been requested, manually wrap our environment with a # gym monitor, which is set to record every episode. if video_dir: env = gym_wrappers.Monitor(env=env, directory=video_dir, video_callable=lambda _: True, force=True) # Make episode writer env = EpisodeWriterWrapper(env, args=args) steps = 0 episodes = 0 while keep_going(steps, num_steps, episodes, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) 
done = False reward_total = 0.0 while not done and keep_going(steps, num_steps, episodes, num_episodes): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): # action = agent.compute_action(a_obs) if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(r for r in reward.values() if r is not None) else: reward_total += reward if not no_render: env.render() steps += 1 obs = next_obs logging.info("Episode #{}: reward: {}".format(episodes, reward_total)) if done: episodes += 1
def rollout_episodes(env, agent, num_episodes=1, num_steps=1000, render=True, experiment_nr=0): multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"]["policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } reward_eps = [] cot_eps = [] vel_eps = [] dist_eps = [] steps_eps = [] power_total_eps = [] # For numerical calculation of gradients: # for each input dimension set the step size which # is based on the standard deviation of the specific input channel. step_dim_low = np.zeros((44, 44)) step_dim_high = np.zeros((44, 44)) for i in range(0, 44): step_dim_low[i, i] = -0.1 * agent.workers.local_worker().get_filters( )['central_policy'].rs.std[i] step_dim_high[i, i] = 0.1 * agent.workers.local_worker().get_filters( )['central_policy'].rs.std[i] manual_grads = np.zeros((44, 8)) manual_grads_abs = np.zeros((44, 8)) for episodes in range(0, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic # saver.begin_rollout() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 power_total = 0.0 steps = 0 done = False #env.env.create_new_random_hfield() obs = env.reset() start_pos = env.env.sim.data.qpos[0] while not done and steps < num_steps: multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) # Variation of inputs by small steps # and observe how the output (for the mean value) # changes for all action channels. 
variation_obs_high = a_obs.reshape(1, 44) + step_dim_high variation_obs_low = a_obs.reshape(1, 44) + step_dim_low act_low = np.zeros((44, 8)) act_high = np.zeros((44, 8)) for i in range(0, 44): act_low[i] = agent.compute_action( variation_obs_low[i, :], policy_id=policy_id, explore=False) act_high[i] = agent.compute_action( variation_obs_high[i, :], policy_id=policy_id, explore=False) manual_grads += act_high - act_low manual_grads_abs += np.abs(act_high - act_low) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if render: env.render() # saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs # Calculated as torque (during last time step - or in this case sum of # proportional control signal (clipped to [-1,1], multiplied by 150 to torque) # multiplied by joint velocity for each joint. # Important: unfortunately there is a shift in the ctrl signals - therefore use roll # (control signals start with front right leg, front left leg starts at index 2) current_power = np.sum( np.abs( np.roll(env.env.sim.data.ctrl, -2) * env.env.sim.data.qvel[6:])) power_total += current_power # saver.end_rollout() distance_x = env.env.sim.data.qpos[0] - start_pos com_vel = distance_x / steps cost_of_transport = (power_total / steps) / ( mujoco_py.functions.mj_getTotalmass(env.env.model) * com_vel) # Weight is 8.78710174560547 #print(mujoco_py.functions.mj_getTotalmass(env.env.model)) #print(steps, " - ", power_total, " / ", power_total/steps, "; CoT: ", cost_of_transport) cot_eps.append(cost_of_transport) reward_eps.append(reward_total) vel_eps.append(com_vel) dist_eps.append(distance_x) steps_eps.append(steps) power_total_eps.append(power_total) print(episodes, ' - ', reward_total, '; CoT: ', cost_of_transport, '; Vel: ', 20 * com_vel) #print("GRADS: ", manual_grads) np.save("grads_tvel1_" + str(experiment_nr) + ".npy", manual_grads) np.save("grads_tvel1_abs_" + str(experiment_nr) + ".npy", manual_grads_abs) return (reward_eps, steps_eps, dist_eps, power_total_eps, vel_eps, cot_eps)
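The per-dimension perturbation scheme in the rollout above is a finite-difference sensitivity estimate: each input channel is nudged by plus/minus a small step (0.1 times that channel's observation-filter standard deviation) and the resulting change in every action channel is accumulated. The snippet below is an illustrative sketch of that idea on a toy function; `f`, `x`, and the step sizes are assumptions for the example, and, like the code above, the raw difference is accumulated without dividing by the step width.

import numpy as np


def finite_difference_grads(f, x, step):
    # For each input dimension i, perturb x[i] by +/- step[i] and record how
    # every output channel of f changes (un-normalized central difference).
    n_in, n_out = x.shape[0], f(x).shape[0]
    grads = np.zeros((n_in, n_out))
    for i in range(n_in):
        delta = np.zeros_like(x)
        delta[i] = step[i]
        grads[i] = f(x + delta) - f(x - delta)
    return grads


x = np.array([0.5, -0.2, 1.0])
step = 0.1 * np.ones_like(x)  # e.g. 0.1 * per-channel std, as in the code above
f = lambda v: np.array([v.sum(), (v ** 2).sum()])
print(finite_difference_grads(f, x, step))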
def _initialize_loss_with_dummy_batch(self):
    # Dummy forward pass to initialize any policy attributes, etc.
    dummy_batch = {
        SampleBatch.CUR_OBS: np.array(
            [self.observation_space.sample()]),
        SampleBatch.NEXT_OBS: np.array(
            [self.observation_space.sample()]),
        SampleBatch.DONES: np.array([False], dtype=bool),
        SampleBatch.REWARDS: np.array([0], dtype=np.float32),
    }
    if isinstance(self.action_space, (Dict, Tuple)):
        dummy_batch[SampleBatch.ACTIONS] = [
            flatten_to_single_ndarray(self.action_space.sample())
        ]
    else:
        dummy_batch[SampleBatch.ACTIONS] = tf.nest.map_structure(
            lambda c: np.array([c]), self.action_space.sample())

    if obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: dummy_batch[SampleBatch.ACTIONS],
            SampleBatch.PREV_REWARDS: dummy_batch[SampleBatch.REWARDS],
        })
    for i, h in enumerate(self._state_in):
        dummy_batch["state_in_{}".format(i)] = h
        dummy_batch["state_out_{}".format(i)] = h
    if self._state_in:
        dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)

    # Convert everything to tensors.
    dummy_batch = tf.nest.map_structure(tf1.convert_to_tensor, dummy_batch)

    # For IMPALA, which expects a certain sample batch size.
    def tile_to(tensor, n):
        return tf.tile(tensor,
                       [n] + [1 for _ in tensor.shape.as_list()[1:]])

    if get_batch_divisibility_req:
        dummy_batch = tf.nest.map_structure(
            lambda c: tile_to(c, get_batch_divisibility_req(self)),
            dummy_batch)

    i = 0
    self._state_in = []
    while "state_in_{}".format(i) in dummy_batch:
        self._state_in.append(dummy_batch["state_in_{}".format(i)])
        i += 1

    # Execute a forward pass to get self.action_dist etc. initialized,
    # and also obtain the extra action fetches.
    _, _, fetches = self.compute_actions(
        dummy_batch[SampleBatch.CUR_OBS], self._state_in,
        dummy_batch.get(SampleBatch.PREV_ACTIONS),
        dummy_batch.get(SampleBatch.PREV_REWARDS),
        explore=False)
    dummy_batch.update(fetches)

    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    # Model forward pass for the loss (needed after postprocess to
    # overwrite any tensor state from that call).
    self.model.from_batch(dummy_batch)

    postprocessed_batch = tf.nest.map_structure(
        lambda c: tf.convert_to_tensor(c), postprocessed_batch.data)

    loss_fn(self, self.model, self.dist_class, postprocessed_batch)
    if stats_fn:
        stats_fn(self, postprocessed_batch)
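To make the dummy-batch trick above concrete, here is an illustrative (not RLlib-specific) example of what such a batch-of-one looks like for simple Box observation and action spaces; the spaces and key names below are assumptions chosen to mirror the keys used above.

import numpy as np
import gym

# Example spaces, made up for illustration.
obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
act_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)

# A minimal "dummy batch" with batch size 1, analogous to the dict built above.
dummy_batch = {
    "obs": np.array([obs_space.sample()]),      # shape (1, 4)
    "new_obs": np.array([obs_space.sample()]),  # shape (1, 4)
    "actions": np.array([act_space.sample()]),  # shape (1, 2)
    "rewards": np.array([0.0], dtype=np.float32),
    "dones": np.array([False], dtype=bool),
}
print({k: v.shape for k, v in dummy_batch.items()})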
def __init__(self, observation_space, action_space, config):
    assert tf.executing_eagerly()
    self.framework = "tfe"
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    if get_default_config:
        config = dict(get_default_config(), **config)

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config
    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )

    self.exploration = self._create_exploration()
    self._state_in = [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ]
    input_dict = {
        SampleBatch.CUR_OBS: tf.convert_to_tensor(
            np.array([observation_space.sample()])),
        SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
            [flatten_to_single_ndarray(action_space.sample())]),
        SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
    }

    if action_distribution_fn:
        dist_inputs, self.dist_class, _ = action_distribution_fn(
            self, self.model, input_dict[SampleBatch.CUR_OBS])
    else:
        self.model(input_dict, self._state_in,
                   tf.convert_to_tensor([1]))

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_with_dummy_batch()
    self._loss_initialized = True

    if optimizer_fn:
        self._optimizer = optimizer_fn(self, config)
    else:
        self._optimizer = tf.keras.optimizers.Adam(config["lr"])

    if after_init:
        after_init(self, observation_space, action_space, config)
def rollout_episodes(env, agent, num_episodes=1, num_steps=1000, render=True, save_images=None, explore_during_rollout=None, tvel=None, save_obs=None): """ Rollout an episode: step through an episode, using the - agent = trained policies (is a multiagent consisting of a dict of agents) - env = in the given environment for num_steps control steps and running num_episodes episodes. render: shows OpenGL window save_images: save individual frames (can be combined to video) tvel: set target velocity """ if tvel: env.target_velocity_list = [tvel] # Setting up the agent for running an episode. multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"]["policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } #if save_images: # viewer = mujoco_py.MjRenderContextOffscreen(env.env.sim, 0) # Collecting statistics over episodes. reward_eps = [] cot_eps = [] vel_eps = [] dist_eps = [] steps_eps = [] power_total_eps = [] if save_obs: obs_list = [] for episodes in range(0, num_episodes): # Reset all values for this episode. mapping_cache = {} # in case policy_agent_mapping is stochastic # saver.begin_rollout() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 power_total = 0.0 steps = 0 done = False env.env.create_new_random_hfield() obs = env.reset() start_pos = env.env.sim.data.qpos[0] # Control stepping: while not done and steps < num_steps: multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id, explore=explore_during_rollout) agent_states[agent_id] = p_state else: # Sample an action for the current observation # for one entry of the agent dict. a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] # Stepping the environment. 
next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if render: if save_images: #viewer.render(1280, 800, 0) if tvel: env.env.model.body_pos[14][0] += tvel * 0.05 img = env.env.sim.render(width=1280, height=800, camera_name="side_run") #data = np.asarray(viewer.read_pixels(800, 1280, depth=False)[::-1, :, :], dtype=np.uint8) #img_array = env.env.render('rgb_array') plt.imsave(save_images + str(steps).zfill(4) + '.png', img, origin='lower') else: env.render() #saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs if save_obs: obs_list.append(obs) # Calculated as torque (during last time step - or in this case sum of # proportional control signal (clipped to [-1,1], multiplied by 150 to torque) # multiplied by joint velocity for each joint. # Important: unfortunately there is a shift in the ctrl signals - therefore use roll # (control signals start with front right leg, front left leg starts at index 2) current_power = np.sum( np.abs( np.roll(env.env.sim.data.ctrl, -2) * env.env.sim.data.qvel[6:])) power_total += current_power #saver.end_rollout() distance_x = env.env.sim.data.qpos[0] - start_pos com_vel = distance_x / steps cost_of_transport = (power_total / steps) / ( mujoco_py.functions.mj_getTotalmass(env.env.model) * com_vel) # Weight is 8.78710174560547 #print(steps, " - ", power_total, " / ", power_total/steps, "; CoT: ", cost_of_transport) cot_eps.append(cost_of_transport) reward_eps.append(reward_total) vel_eps.append(com_vel) dist_eps.append(distance_x) steps_eps.append(steps) power_total_eps.append(power_total) #print(episodes, ' - ', reward_total, '; CoT: ', cost_of_transport, '; Vel: ', com_vel) # Return collected information from episode. if save_obs: np.save(str(save_obs + '/obs_list'), obs_list) return (reward_eps, steps_eps, dist_eps, power_total_eps, vel_eps, cot_eps)
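The episode statistics computed at the end of the rollout above follow a simple cost-of-transport formula: mean power per control step divided by the product of total model mass and mean center-of-mass velocity. The snippet below reproduces that arithmetic with made-up numbers; only the mass value is taken from the comment in the code above ("Weight is 8.78710174560547").

import numpy as np

# Made-up episode statistics for illustration.
power_total = 1200.0   # summed |ctrl * joint velocity| over the episode
steps = 400            # number of control steps in the episode
mass = 8.787           # total model mass, as reported in the comment above
distance_x = 12.0      # forward displacement of the center of mass

com_vel = distance_x / steps
cost_of_transport = (power_total / steps) / (mass * com_vel)
print(com_vel, cost_of_transport)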
def rollout(trainer, env_name, metrics_handler, num_steps, num_episodes, log_dir): """Reference: https://github.com/ray-project/ray/blob/master/rllib/rollout.py""" policy_agent_mapping = default_policy_agent_mapping assert hasattr(trainer, "workers") and isinstance(trainer.workers, WorkerSet) env = trainer.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if trainer.workers.local_worker().multiagent: policy_agent_mapping = trainer.config["multiagent"][ "policy_mapping_fn"] policy_map = trainer.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } for episode in range(num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.0) done = False reward_total = 0.0 step = 0 while not done and step < num_steps: multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = trainer.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id, ) agent_states[agent_id] = p_state else: a_action = trainer.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id, ) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) metrics_handler.log_step( episode=episode, observations=multi_obs, actions=action, rewards=reward, dones=done, infos=info, ) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward # filter dead agents if multiagent: next_obs = { agent_id: obs for agent_id, obs in next_obs.items() if not done[agent_id] } if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward step += 1 obs = next_obs logger.info("\nEpisode #{}: steps: {} reward: {}".format( episode, step, reward_total)) if done: episode += 1 metrics_handler.write_to_csv(csv_dir=log_dir)
def internal_rollout( worker, num_steps, policy_map=None, policy_agent_mapping=None, reset_env_before=True, num_episodes=0, last_obs=None, saver=None, no_render=True, video_dir=None, seed=None, explore=None, ): """ Can perform rollouts on the environment from inside a worker_rollout or from a policy. Can perform rollouts during the evaluation rollouts ran from an RLLib Trainer. :param worker: worker from an RLLib Trainer. The interal rollouts will be run inside this worker, using its policies. :param num_steps: number of maximum steps to perform in total :param policy_map: (optional) by default the policy_map of the provided worker will be used :param policy_agent_mapping: (optional) by default the policy_mapping_fn of the provided worker will be used :param reset_env_before: (optional) reset the environment from the worker before first using it :param num_episodes: (optional) number of maximum episodes to perform :param last_obs: (optional) if reset_env_before is False then you must provide the last observation :param saver: (optional) an instance of a RolloutManager :param no_render: (optional) option to call env.render() :param video_dir: (optional) :param seed: (optional) random seed to set for the environment by calling env.seed(seed) :return: an instance of a RolloutManager, which contains the data about the rollouts performed """ assert num_steps is not None or num_episodes is not None assert reset_env_before or last_obs is not None if saver is None: saver = RolloutManager() env = copy.deepcopy(worker.env) if hasattr(env, "seed") and callable(env.seed): env.seed(seed) multiagent = isinstance(env, MultiAgentEnv) if policy_agent_mapping is None: if worker.multiagent: policy_agent_mapping = worker.policy_config["multiagent"][ "policy_mapping_fn"] else: policy_agent_mapping = default_policy_agent_mapping if policy_map is None: policy_map = worker.policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } # If monitoring has been requested, manually wrap our environment with a # gym monitor, which is set to record every episode. if video_dir: env = gym_wrappers.Monitor(env=env, directory=video_dir, video_callable=lambda x: True, force=True) random_policy_id = list(policy_map.keys())[0] virtual_global_timestep = worker.get_policy( random_policy_id).global_timestep steps = 0 episodes = 0 while _keep_going(steps, num_steps, episodes, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic saver.begin_rollout() if reset_env_before or episodes > 0: obs = env.reset() else: obs = last_obs agent_states = DefaultMapping( lambda agent_id_: state_init[mapping_cache[agent_id_]]) prev_actions = DefaultMapping( lambda agent_id_: action_init[mapping_cache[agent_id_]]) prev_rewards = collections.defaultdict(lambda: 0.) 
done = False reward_total = 0.0 while not done and _keep_going(steps, num_steps, episodes, num_episodes): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} virtual_global_timestep += 1 for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] # print("rollout") if p_use_lstm: a_action, p_state, _ = _worker_compute_action( worker, timestep=virtual_global_timestep, observation=a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id, explore=explore) agent_states[agent_id] = p_state else: a_action = _worker_compute_action( worker, virtual_global_timestep, observation=a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id, explore=explore) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(r for r in reward.values() if r is not None) else: reward_total += reward if not no_render: env.render() saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs saver.end_rollout() if done: episodes += 1 return saver
def rollout(agent, env_name, num_steps, num_episodes=0, saver=None, no_render=True, video_dir=None): policy_agent_mapping = default_policy_agent_mapping if saver is None: saver = RolloutSaver() if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} else: env = gym.make(env_name) multiagent = False try: policy_map = {DEFAULT_POLICY_ID: agent.policy} except AttributeError: raise AttributeError( "Agent ({}) does not have a `policy` property! This is needed " "for performing (trained) agent rollouts.".format(agent)) use_lstm = {DEFAULT_POLICY_ID: False} action_init = { p: flatten_to_single_ndarray(m.action_space.sample()) for p, m in policy_map.items() } # If monitoring has been requested, manually wrap our environment with a # gym monitor, which is set to record every episode. if video_dir: env = gym.wrappers.Monitor(env=env, directory=video_dir, video_callable=lambda x: True, force=True) steps = 0 episodes = 0 while keep_going(steps, num_steps, episodes, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic saver.begin_rollout() obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 while not done and keep_going(steps, num_steps, episodes, num_episodes): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = flatten_to_single_ndarray(a_action) action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if not no_render: env.render() saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs saver.end_rollout() print("Episode #{}: reward: {}".format(episodes, reward_total)) if done: episodes += 1
def _process_observations(worker, base_env, policies, batch_builder_pool, active_episodes, unfiltered_obs, rewards, dones, infos, off_policy_actions, horizon, preprocessors, obs_filters, rollout_fragment_length, pack, callbacks, soft_horizon, no_done_at_end, observation_fn): """Record new data from the environment and prepare for policy evaluation. Returns: active_envs: set of non-terminated env ids to_eval: map of policy_id to list of agent PolicyEvalData outputs: list of metrics and samples to return from the sampler """ active_envs = set() to_eval = defaultdict(list) outputs = [] large_batch_threshold = max(1000, rollout_fragment_length * 10) if \ rollout_fragment_length != float("inf") else 5000 # For each environment for env_id, agent_obs in unfiltered_obs.items(): new_episode = env_id not in active_episodes episode = active_episodes[env_id] if not new_episode: episode.length += 1 episode.batch_builder.count += 1 episode._add_agent_rewards(rewards[env_id]) if (episode.batch_builder.total() > large_batch_threshold and log_once("large_batch_warning")): logger.warning( "More than {} observations for {} env steps ".format( episode.batch_builder.total(), episode.batch_builder.count) + "are buffered in " "the sampler. If this is more than you expected, check that " "that you set a horizon on your environment correctly and that" " it terminates at some point. " "Note: In multi-agent environments, `rollout_fragment_length` " "sets the batch size based on environment steps, not the " "steps of " "individual agents, which can result in unexpectedly large " "batches. Also, you may be in evaluation waiting for your Env " "to terminate (batch_mode=`complete_episodes`). Make sure it " "does at some point.") # Check episode termination conditions if dones[env_id]["__all__"] or episode.length >= horizon: hit_horizon = (episode.length >= horizon and not dones[env_id]["__all__"]) all_done = True atari_metrics = _fetch_atari_metrics(base_env) if atari_metrics is not None: for m in atari_metrics: outputs.append( m._replace(custom_metrics=episode.custom_metrics)) else: outputs.append( RolloutMetrics(episode.length, episode.total_reward, dict(episode.agent_rewards), episode.custom_metrics, {}, episode.hist_data)) else: hit_horizon = False all_done = False active_envs.add(env_id) # Custom observation function is applied before preprocessing. if observation_fn: agent_obs = observation_fn(agent_obs=agent_obs, worker=worker, base_env=base_env, policies=policies, episode=episode) if not isinstance(agent_obs, dict): raise ValueError( "observe() must return a dict of agent observations") # For each agent in the environment. 
for agent_id, raw_obs in agent_obs.items(): assert agent_id != "__all__" policy_id = episode.policy_for(agent_id) prep_obs = _get_or_raise(preprocessors, policy_id).transform(raw_obs) if log_once("prep_obs"): logger.info("Preprocessed obs: {}".format(summarize(prep_obs))) filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs) if log_once("filtered_obs"): logger.info("Filtered obs: {}".format(summarize(filtered_obs))) agent_done = bool(all_done or dones[env_id].get(agent_id)) if not agent_done: to_eval[policy_id].append( PolicyEvalData(env_id, agent_id, filtered_obs, infos[env_id].get(agent_id, {}), episode.rnn_state_for(agent_id), episode.last_action_for(agent_id), rewards[env_id][agent_id] or 0.0)) last_observation = episode.last_observation_for(agent_id) episode._set_last_observation(agent_id, filtered_obs) episode._set_last_raw_obs(agent_id, raw_obs) episode._set_last_info(agent_id, infos[env_id].get(agent_id, {})) # Record transition info if applicable if (last_observation is not None and infos[env_id].get( agent_id, {}).get("training_enabled", True)): episode.batch_builder.add_values( agent_id, policy_id, t=episode.length - 1, eps_id=episode.episode_id, agent_index=episode._agent_index(agent_id), obs=last_observation, actions=episode.last_action_for(agent_id), rewards=rewards[env_id][agent_id], prev_actions=episode.prev_action_for(agent_id), prev_rewards=episode.prev_reward_for(agent_id), dones=(False if (no_done_at_end or (hit_horizon and soft_horizon)) else agent_done), infos=infos[env_id].get(agent_id, {}), new_obs=filtered_obs, **episode.last_pi_info_for(agent_id)) # Invoke the step callback after the step is logged to the episode callbacks.on_episode_step(worker=worker, base_env=base_env, episode=episode) # Cut the batch if we're not packing multiple episodes into one, # or if we've exceeded the requested batch size. if episode.batch_builder.has_pending_agent_data(): if dones[env_id]["__all__"] and not no_done_at_end: episode.batch_builder.check_missing_dones() if (all_done and not pack) or \ episode.batch_builder.count >= rollout_fragment_length: outputs.append(episode.batch_builder.build_and_reset(episode)) elif all_done: # Make sure postprocessor stays within one episode episode.batch_builder.postprocess_batch_so_far(episode) if all_done: # Handle episode termination batch_builder_pool.append(episode.batch_builder) # Call each policy's Exploration.on_episode_end method. for p in policies.values(): if getattr(p, "exploration", None) is not None: p.exploration.on_episode_end(policy=p, environment=base_env, episode=episode, tf_sess=getattr( p, "_sess", None)) # Call custom on_episode_end callback. 
callbacks.on_episode_end(worker=worker, base_env=base_env, policies=policies, episode=episode) if hit_horizon and soft_horizon: episode.soft_reset() resetted_obs = agent_obs else: del active_episodes[env_id] resetted_obs = base_env.try_reset(env_id) if resetted_obs is None: # Reset not supported, drop this env from the ready list if horizon != float("inf"): raise ValueError( "Setting episode horizon requires reset() support " "from the environment.") elif resetted_obs != ASYNC_RESET_RETURN: # Creates a new episode if this is not async return # If reset is async, we will get its result in some future poll episode = active_episodes[env_id] if observation_fn: resetted_obs = observation_fn(agent_obs=resetted_obs, worker=worker, base_env=base_env, policies=policies, episode=episode) for agent_id, raw_obs in resetted_obs.items(): policy_id = episode.policy_for(agent_id) policy = _get_or_raise(policies, policy_id) prep_obs = _get_or_raise(preprocessors, policy_id).transform(raw_obs) filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs) episode._set_last_observation(agent_id, filtered_obs) to_eval[policy_id].append( PolicyEvalData( env_id, agent_id, filtered_obs, episode.last_info_for(agent_id) or {}, episode.rnn_state_for(agent_id), np.zeros_like( flatten_to_single_ndarray( policy.action_space.sample())), 0.0)) return active_envs, to_eval, outputs