import unittest

import gym

# Import paths assume the garage project layout; adjust to your checkout.
from garage.envs import normalize
from garage.theano.envs import TheanoEnv


class TestNormalizedGym(unittest.TestCase):
    def test_unflatten(self):
        env = TheanoEnv(
            normalize(
                gym.make('Blackjack-v0'),
                normalize_reward=True,
                normalize_obs=True,
                flatten_obs=False))
        for i in range(10):
            env.reset()
            for e in range(5):
                action = env.action_space.sample()
                next_obs, reward, done, info = env.step(action)
                # flatten() returns a 1-D array of length flat_dim, so its
                # shape must be compared against the tuple (flat_dim,).
                assert (env.observation_space.flatten(next_obs).shape ==
                        (env.observation_space.flat_dim,))  # yapf: disable
                if done:
                    break
        env.close()
    def test_flatten(self):
        env = TheanoEnv(
            normalize(
                gym.make('Pendulum-v0'),
                normalize_reward=True,
                normalize_obs=True,
                flatten_obs=True))
        for i in range(10):
            env.reset()
            for e in range(5):
                env.render()
                action = env.action_space.sample()
                next_obs, reward, done, info = env.step(action)
                assert next_obs.shape == env.observation_space.low.shape
                if done:
                    break
        env.close()
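# For intuition about what the two tests above exercise: with
# flatten_obs=False the wrapped env keeps Blackjack's tuple observation, and
# observation_space.flatten() concatenates a one-hot encoding of each
# Discrete component into a vector of length flat_dim. The sketch below
# illustrates that encoding in plain numpy; it is an illustration, not
# garage's actual implementation. SIZES is copied from Blackjack-v0's
# observation space, Tuple(Discrete(32), Discrete(11), Discrete(2)).

import numpy as np

SIZES = (32, 11, 2)
FLAT_DIM = sum(SIZES)  # 45, matching observation_space.flat_dim


def flatten_tuple_obs(obs, sizes=SIZES):
    # Concatenate one one-hot vector per Discrete component.
    parts = []
    for value, n in zip(obs, sizes):
        one_hot = np.zeros(n)
        one_hot[value] = 1.0
        parts.append(one_hot)
    return np.concatenate(parts)


flat = flatten_tuple_obs((14, 6, 1))
assert flat.shape == (FLAT_DIM,)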
# `env`, `policy`, and the horizon `T` come from the surrounding setup.
# Containers for the trajectory we are about to collect.
observations = []
actions = []
rewards = []

observation = env.reset()
for _ in range(T):
    # policy.get_action() returns a pair of values. The second is a
    # dictionary whose values contain sufficient statistics for the action
    # distribution. It should at least contain the entries that would be
    # returned by calling policy.dist_info(), the non-symbolic analog of
    # policy.dist_info_sym(). Storing these statistics is useful, e.g.,
    # when forming importance sampling ratios. In our case it is not needed.
    action, _ = policy.get_action(observation)
    # Recall that the last entry of the tuple stores diagnostic
    # information about the environment. In our case it is not needed.
    next_observation, reward, terminal, _ = env.step(action)
    observations.append(observation)
    actions.append(action)
    rewards.append(reward)
    observation = next_observation
    if terminal:
        # Finish the rollout if a terminal state is reached.
        break

# We need to compute the empirical return for each time step along the
# trajectory (see the sketch below).
path = dict(
    observations=np.array(observations),
    actions=np.array(actions),
    rewards=np.array(rewards),
)
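# The comment above refers to the discounted empirical return
# G_t = r_t + discount * r_{t+1} + discount^2 * r_{t+2} + ... .
# A minimal sketch of that computation; the helper name discount_cumsum and
# the discount value are assumptions for this example, not part of the
# snippet above.


def discount_cumsum(rewards, discount):
    # Walk the rewards backwards, accumulating the discounted running sum
    # so that returns[t] = rewards[t] + discount * returns[t + 1].
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns


path["returns"] = discount_cumsum(path["rewards"], discount=0.99)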