def test_unflatten(self):
    env = TheanoEnv(
        normalize(
            gym.make('Blackjack-v0'),
            normalize_reward=True,
            normalize_obs=True,
            flatten_obs=False))
    for i in range(10):
        env.reset()
        for e in range(5):
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            # flatten() returns a 1-D array, so its shape must be compared
            # against the tuple (flat_dim,), not the bare int flat_dim.
            assert (env.observation_space.flatten(next_obs).shape
                    == (env.observation_space.flat_dim,))  # yapf: disable
            if done:
                break
    env.close()
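# A minimal sketch (not part of the test suite) of the flattening convention
# the assertion above relies on. Blackjack-v0 observations live in
# Tuple(Discrete(32), Discrete(11), Discrete(2)); in rllab-style spaces each
# Discrete component flattens to a one-hot vector and the Tuple concatenates
# them, so flat_dim == 32 + 11 + 2 == 45. The helper names below are
# hypothetical and only illustrate that convention.
import numpy as np

def one_hot(index, n):
    # One-hot encode a single Discrete value of cardinality n.
    vec = np.zeros(n)
    vec[index] = 1.0
    return vec

def flatten_blackjack_obs(obs):
    # obs is a 3-tuple: (player_sum, dealer_card, usable_ace).
    sizes = (32, 11, 2)
    return np.concatenate(
        [one_hot(component, n) for component, n in zip(obs, sizes)])

assert flatten_blackjack_obs((14, 5, 1)).shape == (45,)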
def test_flatten(self):
    env = TheanoEnv(
        normalize(
            gym.make('Pendulum-v0'),
            normalize_reward=True,
            normalize_obs=True,
            flatten_obs=True))
    for i in range(10):
        env.reset()
        for e in range(5):
            env.render()
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            assert next_obs.shape == env.observation_space.low.shape
            if done:
                break
    env.close()
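# A minimal sketch (assumed behavior, not the actual rllab implementation) of
# what normalize(..., normalize_obs=True) does: keep exponentially-weighted
# running estimates of the observation mean and variance, and rescale every
# observation with them. The class and parameter names are hypothetical.
import numpy as np

class RunningObsNormalizer:
    def __init__(self, shape, alpha=0.001):
        self.alpha = alpha            # update rate of the running estimates
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)

    def update(self, obs):
        self.mean = (1 - self.alpha) * self.mean + self.alpha * obs
        self.var = (1 - self.alpha) * self.var \
            + self.alpha * np.square(obs - self.mean)

    def normalize(self, obs):
        self.update(obs)
        return (obs - self.mean) / (np.sqrt(self.var) + 1e-8)

# Pendulum-v0 observations are 3-dimensional: (cos(theta), sin(theta), theta_dot).
normalizer = RunningObsNormalizer(shape=(3,))
print(normalizer.normalize(np.array([0.5, -0.2, 1.0])))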
f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=None,
    updates=adam(grads, params, learning_rate=learning_rate),
    allow_input_downcast=True)

for _ in range(n_itr):
    paths = []

    for _ in range(N):
        observations = []
        actions = []
        rewards = []

        observation = env.reset()

        for _ in range(T):
            # policy.get_action() returns a pair of values: the action itself
            # and a dictionary whose values contain sufficient statistics for
            # the action distribution. It should at least contain the entries
            # that would be returned by calling policy.dist_info(), which is
            # the non-symbolic analog of policy.dist_info_sym(). Storing these
            # statistics is useful, e.g., when forming importance sampling
            # ratios. In our case it is not needed.
            action, _ = policy.get_action(observation)

            # Recall that the last entry of the tuple stores diagnostic
            # information about the environment. In our case it is not needed.
            next_observation, reward, terminal, _ = env.step(action)
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            observation = next_observation
            if terminal:
                # Finish the rollout if we reach a terminal state.
                break
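# A short, self-contained illustration (not from the tutorial) of the
# importance sampling ratios mentioned in the comment above. Suppose the
# stored agent_info dicts hold the mean and log_std of a diagonal Gaussian
# action distribution; the per-step ratio between a new policy and the
# behavior policy is then the ratio of their action likelihoods.
import numpy as np

def gaussian_log_likelihood(action, mean, log_std):
    # Log density of a diagonal Gaussian evaluated at the sampled action.
    var = np.exp(2 * log_std)
    return np.sum(-0.5 * np.log(2 * np.pi) - log_std
                  - np.square(action - mean) / (2 * var))

def importance_ratio(action, old_info, new_info):
    # exp(log pi_new(a|s) - log pi_old(a|s))
    return np.exp(
        gaussian_log_likelihood(action, new_info['mean'], new_info['log_std'])
        - gaussian_log_likelihood(action, old_info['mean'], old_info['log_std']))

# Example: the new policy shifted its mean slightly, so the action sampled
# under the old policy gets re-weighted accordingly.
a = np.array([0.3])
old = {'mean': np.array([0.0]), 'log_std': np.array([0.0])}
new = {'mean': np.array([0.1]), 'log_std': np.array([0.0])}
print(importance_ratio(a, old, new))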