def _update_obs_estimate(self, obs):
    """Fold one observation into the running mean/variance estimates.

    Maintains exponential moving averages (weight ``self._obs_alpha``)
    of the flattened observation and of its squared deviation from the
    running mean.
    """
    flat_obs = flatten(self.env.observation_space, obs)
    alpha = self._obs_alpha
    # Update the mean first; the variance term below deliberately uses
    # the freshly-updated mean, matching the original estimator.
    self._obs_mean = (1 - alpha) * self._obs_mean + alpha * flat_obs
    deviation_sq = np.square(flat_obs - self._obs_mean)
    self._obs_var = (1 - alpha) * self._obs_var + alpha * deviation_sq
def _apply_normalize_obs(self, obs):
    """Return *obs* normalized by the running mean/std estimates.

    Folds *obs* into the running statistics first, then standardizes
    the flattened observation. When ``self._flatten_obs`` is False the
    result is unflattened back into the observation space's structure.
    """
    self._update_obs_estimate(obs)
    flat_obs = flatten(self.env.observation_space, obs)
    # The 1e-8 term guards against division by zero for tiny variances.
    scaled = (flat_obs - self._obs_mean) / (np.sqrt(self._obs_var) + 1e-8)
    if self._flatten_obs:
        return scaled
    return unflatten(self.env.observation_space, scaled)
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, always_return_paths=False):
    """Run one episode of *agent* in *env* and collect the trajectory.

    Steps the environment until termination or ``max_path_length``
    steps, recording flattened observations/actions plus rewards and
    per-step info dicts. If *animated*, frames are rendered and paced by
    *speedup*; in that case ``None`` is returned unless
    *always_return_paths* is set. Otherwise returns a dict of stacked
    tensors keyed by ``observations``/``actions``/``rewards``/
    ``agent_infos``/``env_infos``.
    """
    buffers = {
        'observations': [],
        'actions': [],
        'rewards': [],
        'agent_infos': [],
        'env_infos': [],
    }
    o = env.reset()
    agent.reset()
    steps_taken = 0
    if animated:
        env.render()
    while steps_taken < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        # Record the pre-step observation alongside the action taken from it.
        buffers['observations'].append(flatten(env.observation_space, o))
        buffers['rewards'].append(r)
        buffers['actions'].append(flatten(env.action_space, a))
        buffers['agent_infos'].append(agent_info)
        buffers['env_infos'].append(env_info)
        steps_taken += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            # Pace rendering at ~0.05 s per frame, sped up by *speedup*.
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return None
    return dict(
        observations=tensor_utils.stack_tensor_list(buffers['observations']),
        actions=tensor_utils.stack_tensor_list(buffers['actions']),
        rewards=tensor_utils.stack_tensor_list(buffers['rewards']),
        agent_infos=tensor_utils.stack_tensor_dict_list(buffers['agent_infos']),
        env_infos=tensor_utils.stack_tensor_dict_list(buffers['env_infos']),
    )
def test_unflatten():
    """Smoke-test normalize() with unflattened (structured) observations.

    Wraps Blackjack-v0 with observation/reward normalization but
    ``flatten_obs=False``, then checks that each structured observation
    still flattens to a vector of length ``flat_dim(observation_space)``.
    """
    env = normalize(gym.make('Blackjack-v0'),
                    normalize_reward=True,
                    normalize_obs=True,
                    flatten_obs=False)
    for i in range(10):
        env.reset()
        for e in range(100):
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            # BUG FIX: ``.shape`` is a tuple while ``flat_dim`` returns an
            # int, so the old ``shape == flat_dim(...)`` comparison was
            # always False. Compare against the 1-tuple instead.
            assert (flatten(env.observation_space, next_obs).shape
                    == (flat_dim(env.observation_space), ))
            if done:
                break
    env.close()
def get_action(self, observation):
    """Sample an action for *observation* from the policy distribution.

    Returns the sampled action and a dict carrying the full probability
    vector under the key ``'prob'``.
    """
    obs_vector = flatten(self.observation_space, observation)
    # _f_prob is batched: wrap in a singleton batch, take the first row.
    action_probs = self._f_prob([obs_vector])[0]
    sampled_action = weighted_sample(self.action_space, action_probs)
    return sampled_action, {'prob': action_probs}