Example #1
    def test_batching_scheme_does_not_restart(self):
        """Test if BatchEnv correctly handle environments that come back to life."""

        env = batch_env.BatchEnv([FakeEnvThatComesBackToLife(i)
                                  for i in range(BATCH_SIZE)])
        env.reset()
        policy = policies.NormalPolicyFixedStd(ACTION_SPACE, std=0.5)
        spec = env_spec.EnvSpec(env)
        trajectories = trajectory_collector.collect_trajectories(
            env, policy, max_steps=MAX_STEPS, env_spec=spec)
        _, _, masks, _ = trajectories
        checks = []
        for t in range(1, masks.shape[0]):
            # The logic here is:
            # (1) Find environments that were terminated (mask = 0) in the
            #     previous time step.
            # (2) Check that in the current time step they are still terminated.
            # At the end we check that this held for every pair of consecutive
            # time steps, i.e. that no trajectory came back to life.
            prev_time_step_end = np.where(masks[t - 1] == 0)
            checks.append(np.all(masks[t, prev_time_step_end] == 0))

        # Assert that no environment came back to life.
        self.assertTrue(np.all(checks))
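
FakeEnvThatComesBackToLife, BATCH_SIZE, and MAX_STEPS are defined elsewhere in the test module. A minimal sketch of what such a fake environment might look like, using the classic gym.Env interface (the class body and the constant values here are assumptions, not the project's actual code):

import gym
import numpy as np

BATCH_SIZE = 5  # Assumed module-level test constants.
MAX_STEPS = 100


class FakeEnvThatComesBackToLife(gym.Env):
    """Toy env that reports `done`, then keeps emitting transitions.

    This reproduces the misbehavior the test guards against: an
    environment whose episodes appear to "come back to life" after
    terminating.
    """
    observation_space = gym.spaces.Box(-1.0, 1.0, shape=(3,), dtype=np.float32)
    action_space = gym.spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32)

    def __init__(self, seed):
        self._rng = np.random.RandomState(seed)
        self._steps = 0

    def reset(self):
        self._steps = 0
        return self._rng.randn(3).astype(np.float32)

    def step(self, action):
        self._steps += 1
        obs = self._rng.randn(3).astype(np.float32)
        # Report termination every 10th step, then keep stepping: the
        # episode "revives" unless the caller masks it out.
        done = self._steps % 10 == 0
        return obs, 0.0, done, {}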
Example #2
def create_env():
    """Creates an environment to be used within a process."""
    env = BatchEnv([gym.make(FLAGS.env)
                    for _ in range(FLAGS.batch_size)])
    spec = env_spec.EnvSpec(env)
    return env, spec, {
        'max_steps_env': FLAGS.max_steps_env,
        'n_trajectories': FLAGS.n_trajectories
    }
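
create_env reads its configuration from absl flags. A plausible set of definitions for the flags it references, using the standard absl.flags API (the defaults and help strings are illustrative assumptions; only the flag names are taken from the snippet):

from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string('env', 'Pendulum-v0', 'Gym environment id to load.')
flags.DEFINE_integer('batch_size', 5, 'Number of environments in the batch.')
flags.DEFINE_integer('max_steps_env', 1500, 'Maximum steps per rollout.')
flags.DEFINE_integer('n_trajectories', 5, 'Trajectories to collect per call.')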
  def test_observation_conversion(self):
    """Checks if observations are converted correctly from gym to tf."""
    env = get_batched_environment('Pendulum-v0', batch_size=5)
    spec = env_spec.EnvSpec(env)

    # Check dtype conversion for observations from gym to tensorflow.
    obs_gym = env.reset()
    obs_tf = spec.convert_obs_gym_to_tf(obs_gym)
    self.assertIsInstance(obs_tf, tf.Tensor)
    self.assertEqual(obs_tf.dtype, tf.float32)
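
get_batched_environment is a test helper defined elsewhere. Given how it is called in these tests, a plausible sketch (assuming the batch_env module used throughout these examples; the real helper may differ):

import gym

def get_batched_environment(env_name, batch_size):
    """Builds a BatchEnv holding `batch_size` copies of the named gym env."""
    return batch_env.BatchEnv([gym.make(env_name) for _ in range(batch_size)])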
  def test_action_conversion(self):
    """Checks if actions are converted correctly from tf to gym."""
    env = get_batched_environment('Pendulum-v0', batch_size=5)
    spec = env_spec.EnvSpec(env)
    env.reset()

    # Simulate an action from tensorflow.
    act_tf = tf.random_normal((128, 1))
    act_gym = spec.convert_act_tf_to_gym(act_tf)
    self.assertEqual(act_gym.dtype, np.float32)
    self.assertIsNotNone(env.step(act_gym))  # Action can be executed.
  def test_detect_mujoco(self):
    """Checks if the spaces in a MuJoCo env are detected properly."""
    env = get_batched_environment('Pendulum-v0', batch_size=5)
    spec = env_spec.EnvSpec(env)

    # Check if the Types are detected properly.
    self.assertEqual(spec.obs_type, env_spec.SpaceEnum.box)
    self.assertEqual(spec.act_type, env_spec.SpaceEnum.box)

    # Check if the sizes are detected properly.
    self.assertEqual(spec.total_obs_dim, 3)
    self.assertEqual(spec.total_sampled_act_dim, 1)
  def test_detect_cartpole(self):
    """Checks if the spaces on CartPole are detected properly."""
    env = get_batched_environment('CartPole-v0', batch_size=5)
    spec = env_spec.EnvSpec(env)

    # Check if the Types are detected properly.
    self.assertEqual(spec.obs_type, env_spec.SpaceEnum.box)
    self.assertEqual(spec.act_type, env_spec.SpaceEnum.discrete)

    # Check if the sizes are detected properly.
    self.assertEqual(spec.total_obs_dim, 4)
    self.assertEqual(spec.total_sampled_act_dim, 2)
  def test_action_clipping_works(self):
    """Checks if actions are clipped correctly."""
    env = get_batched_environment('Pendulum-v0', batch_size=5)
    spec = env_spec.EnvSpec(env)
    env.reset()

    # This action should not be clipped.
    act_tf_should_not_clip = tf.zeros((128, 1)) + 0.2
    act_processed = spec.convert_act_tf_to_gym(act_tf_should_not_clip)
    self.assertTrue(np.allclose(act_processed, 0.2), '{}'.format(act_processed))
    self.assertIsNotNone(env.step(act_processed))

    # This action should be clipped.
    act_tf_should_clip = tf.ones((128, 1)) * 5000
    act_processed = spec.convert_act_tf_to_gym(act_tf_should_clip)
    self.assertTrue(np.allclose(act_processed, 2))
    self.assertIsNotNone(env.step(act_processed))
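
Pendulum-v0's action space is Box(low=-2.0, high=2.0, shape=(1,)), which is why 0.2 passes through unchanged while 5000 is clipped to 2. A minimal sketch of the clipping that EnvSpec.convert_act_tf_to_gym presumably performs for Box action spaces (the function name and exact behavior here are assumptions):

import numpy as np

def clip_to_action_space(act_tf, action_space):
    """Converts a TF action tensor to NumPy, clipped to the env's Box bounds."""
    act = act_tf.numpy().astype(np.float32)
    return np.clip(act, action_space.low, action_space.high)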
Example #8
    def test_collector_in_env(self, env_name, policy_fn, policy_args):
        """Does a rollout in the environment.

        The goal of this test is twofold:
        - trajectory collection can happen.
        - action clipping happens.

        Args:
          env_name: Name of the environment to load.
          policy_fn: A policies.* class whose instances execute actions in the
            environment.
          policy_args: The arguments needed to instantiate the policy.
        """
        env = batch_env.BatchEnv([gym.make(env_name)
                                  for _ in range(BATCH_SIZE)])
        env.reset()
        policy = policy_fn(env.action_space.shape[0], **policy_args)
        spec = env_spec.EnvSpec(env)
        trajectories = trajectory_collector.collect_trajectories(
            env, policy, max_steps=MAX_STEPS, env_spec=spec)
        self.assertIsNotNone(trajectories)
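
The (env_name, policy_fn, policy_args) signature suggests this test is driven by parameterized test machinery. One way it might be wired up, assuming absl.testing.parameterized (the concrete parameter sets below are illustrative, not the project's actual ones):

from absl.testing import parameterized

class TrajectoryCollectorTest(parameterized.TestCase):

    @parameterized.parameters(
        ('Pendulum-v0', policies.NormalPolicyFixedStd, {'std': 0.5}),
        ('MountainCarContinuous-v0', policies.NormalPolicyFixedStd, {'std': 0.5}),
    )
    def test_collector_in_env(self, env_name, policy_fn, policy_args):
        ...  # Body as in the example above.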
Example #9
    def test_repeated_trajectory_collector_has_gradients(self):
        """Make sure concatenating trajectories maintains gradient information."""
        env = batch_env.BatchEnv(
            [gym.make('Pendulum-v0') for _ in range(BATCH_SIZE)])
        env.reset()
        policy = policies.NormalPolicyFixedStd(env.action_space.shape[0],
                                               std=0.5)
        spec = env_spec.EnvSpec(env)
        objective = objectives.REINFORCE()
        with tf.GradientTape() as tape:
            (rewards, log_probs, masks,
             _) = trajectory_collector.repeat_collect_trajectories(
                 env,
                 policy,
                 n_trajectories=BATCH_SIZE * 5,
                 env_spec=spec,
                 max_steps=100)
            returns = rl_utils.compute_discounted_return(rewards, 0.99, masks)
            loss = objective(log_probs=log_probs, returns=returns, masks=masks)
            grads = tape.gradient(loss, policy.trainable_variables)

        self.assertTrue(len(grads))
        self.assertFalse(np.all([np.all(t.numpy() == 0) for t in grads]))
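
The gradient check works because the REINFORCE objective combines the collector's per-step log-probabilities (which carry gradient information) with discounted returns under the episode masks. A minimal sketch of the core computation, assuming a masked REINFORCE surrogate loss (the real objectives.REINFORCE may differ):

import tensorflow as tf

def reinforce_loss(log_probs, returns, masks):
    """Masked REINFORCE surrogate: -E[log pi(a|s) * G_t].

    Gradients flow only through `log_probs`, which is why concatenating
    trajectories must preserve them.
    """
    masked = log_probs * tf.stop_gradient(returns) * masks
    return -tf.reduce_sum(masked) / tf.reduce_sum(masks)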
Example #10
def create_env():
    """Creates environments and useful things to roll out trajectories."""
    env = batch_env.BatchEnv([gym.make(ENV_NAME) for _ in range(5)])
    spec = env_spec.EnvSpec(env)
    others = {'max_steps_env': 1500, 'n_trajectories': 5}
    return env, spec, others
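
How the returned triple would typically be consumed, piecing together the calls from the earlier examples (a sketch assuming ENV_NAME names a continuous-control env, so NormalPolicyFixedStd applies):

env, spec, others = create_env()
env.reset()
policy = policies.NormalPolicyFixedStd(env.action_space.shape[0], std=0.5)
trajectories = trajectory_collector.collect_trajectories(
    env, policy, max_steps=others['max_steps_env'], env_spec=spec)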