def compute(self, observation, add_noise=False, update=True):
    # Preprocess and normalize the observation, adding a batch dimension.
    observation = self.preprocessor.transform(observation)
    observation = self.observation_filter(observation[None], update=update)
    # Run the TF sampler op to draw an action for this observation.
    action = self.sess.run(
        self.sampler, feed_dict={self.inputs: observation})
    action = _unbatch_tuple_actions(action)
    # Optionally perturb continuous actions with Gaussian exploration noise.
    if add_noise and isinstance(self.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * self.action_noise_std
    return action
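# The `add_noise` branch above implements simple additive Gaussian exploration
# for continuous (Box) action spaces. A minimal, self-contained sketch of the
# same idea, assuming a numpy action; `add_gaussian_noise` and `noise_std` are
# illustrative names, not part of the code above:
import numpy as np

def add_gaussian_noise(action, noise_std=0.01):
    """Perturb a continuous action with zero-mean Gaussian noise."""
    return action + np.random.randn(*action.shape) * noise_std

# e.g. a 2-D continuous action
noisy = add_gaussian_noise(np.array([0.5, -0.2]), noise_std=0.01)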
def _compute_actions(policy, obs_batch, add_noise=False, update=True):
    # Preprocess and normalize the observation, adding a batch dimension.
    observation = policy.preprocessor.transform(obs_batch)
    observation = policy.observation_filter(observation[None], update=update)
    observation = convert_to_torch_tensor(observation)
    # A forward pass through the model yields the action-distribution inputs.
    dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [], None)
    dist = policy.dist_class(dist_inputs, policy.model)
    action = dist.sample().detach().numpy()
    action = _unbatch_tuple_actions(action)
    # Optionally perturb continuous actions with Gaussian exploration noise.
    if add_noise and isinstance(policy.action_space, gym.spaces.Box):
        action += np.random.randn(*action.shape) * policy.action_noise_std
    return action
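# The Torch path above builds an action distribution from the model's outputs
# (`dist_inputs`) and samples from it. A minimal sketch of that step using
# plain torch.distributions instead of RLlib's `dist_class`; the logits below
# are a stand-in for real model outputs:
import torch

logits = torch.tensor([[0.1, 0.9, -0.5]])               # batch of dist_inputs
dist = torch.distributions.Categorical(logits=logits)   # discrete action dist
action = dist.sample().detach().numpy()                 # shape (1,), unbatch with [0]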
def rollout_loop(agent, env_name, num_steps, num_episodes,
                 no_render=True, fps=1000, frameskip=1):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"]["policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}
        # Defaults for the single-agent path; without them the DefaultMapping
        # factories below would raise a NameError on first access.
        state_init = {DEFAULT_POLICY_ID: []}
        action_init = {DEFAULT_POLICY_ID: env.action_space.sample()}

    steps = 0
    full_episodes = 0
    last_render_start = time.time()
    avg_reward = collections.deque([], maxlen=100)

    # num_steps == 0 (or None) means no step limit: only num_episodes stops the loop.
    while steps < (num_steps or steps + 1) and full_episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id_: state_init[mapping_cache[agent_id_]])
        prev_actions = DefaultMapping(
            lambda agent_id_: action_init[mapping_cache[agent_id_]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_episode = 0.0

        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    if isinstance(env.action_space, gym.spaces.Tuple):
                        a_action = TupleActions(a_action)
                        a_action = _unbatch_tuple_actions(a_action)[0]

                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action

            action = action_dict if multiagent else action_dict[_DUMMY_AGENT_ID]

            # Repeat the chosen action for `frameskip` frames, accumulating reward.
            rewards = None
            for frame in range(frameskip):
                next_obs, reward, done, _ = env.step(action)
                if multiagent:
                    # A multiagent env returns a done dict; a non-empty dict is
                    # always truthy, so collapse it to a flag before checking.
                    done = done['__all__']

                # Accumulate before the termination check so the terminal
                # step's reward is not dropped.
                if rewards is None:
                    rewards = reward
                elif multiagent:
                    for agent_id, r in reward.items():
                        rewards[agent_id] += r
                else:
                    rewards += reward

                if done:
                    log.info('Done at steps %d', steps)
                    break

                if not no_render:
                    # Throttle rendering to the requested FPS.
                    target_delay = 1.0 / fps if fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    # note: ASYNC_PLAYER mode actually makes this sleep redundant
                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

            steps += 1
            obs = next_obs

            if multiagent:
                for agent_id, r in rewards.items():
                    prev_rewards[agent_id] = r
                reward_episode += 0 if rewards is None else sum(rewards.values())
            else:
                prev_rewards[_DUMMY_AGENT_ID] = rewards
                reward_episode += 0 if rewards is None else rewards

        full_episodes += 1
        avg_reward.append(reward_episode)
        log.info('Reward episode: %.3f, avg_reward %.3f',
                 reward_episode, np.mean(avg_reward))

    env.reset()  # this guarantees that recordings are saved to disk
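# Usage sketch (hypothetical): evaluating a restored RLlib trainer with
# rendering throttled to 60 FPS. The trainer class, config, and checkpoint
# path are placeholders for whatever the surrounding project actually uses;
# with num_steps=0 the loop runs until num_episodes episodes complete.
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
agent = PPOTrainer(config={'num_workers': 0}, env='CartPole-v1')
# agent.restore('/path/to/checkpoint')  # load trained weights if available
rollout_loop(agent, 'CartPole-v1', num_steps=0, num_episodes=10,
             no_render=False, fps=60, frameskip=1)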