def test_density_reward(density_type, is_stationary):
  # Test on Pendulum rather than CartPole because episodes that terminate
  # early are not yet handled (see issue #40).
  env_name = 'Pendulum-v0'
  env = util.make_vec_env(env_name, 2)

  # construct density-based reward from expert rollouts
  with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
            "rb") as f:
    expert_trajectories_all = pickle.load(f)
  n_experts = len(expert_trajectories_all)
  expert_trajectories_train = expert_trajectories_all[:n_experts // 2]
  reward_fn = DensityReward(trajectories=expert_trajectories_train,
                            density_type=density_type,
                            kernel='gaussian',
                            obs_space=env.observation_space,
                            act_space=env.action_space,
                            is_stationary=is_stationary,
                            kernel_bandwidth=0.2,
                            standardise_inputs=True)

  # check that expert policy does better than a random policy under our reward
  # function
  random_policy = RandomPolicy(env.observation_space, env.action_space)
  sample_until = rollout.min_episodes(n_experts // 2)
  random_trajectories = rollout.generate_trajectories(random_policy,
                                                      env,
                                                      sample_until=sample_until)
  expert_trajectories_test = expert_trajectories_all[n_experts // 2:]
  random_score = score_trajectories(random_trajectories, reward_fn)
  expert_score = score_trajectories(expert_trajectories_test, reward_fn)
  assert expert_score > random_score
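
The `score_trajectories` helper used above is not shown in this snippet. A minimal sketch of what it might look like, assuming the reward function takes batched `(obs, act, next_obs, steps)` arrays and returns per-step rewards, and that trajectories expose `obs` and `acts` arrays (both assumptions, not the library's definition):

import numpy as np


def score_trajectories(trajectories, reward_fn):
  # Hypothetical helper: mean undiscounted return per trajectory under reward_fn.
  returns = []
  for traj in trajectories:
    obs, acts, next_obs = traj.obs[:-1], traj.acts, traj.obs[1:]
    steps = np.arange(len(acts))
    rewards = reward_fn(obs, acts, next_obs, steps)
    returns.append(np.sum(rewards))
  return np.mean(returns)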
Example #2
def sample_trajectories(env, expert, n_episodes=None, n_timesteps=None):
    if n_episodes is None and n_timesteps is None:
        n_episodes = 20

    expert_trajectories = generate_trajectories(
        expert,
        env,
        sample_until=make_sample_until(n_episodes=n_episodes,
                                       n_timesteps=n_timesteps),
    )
    return expert_trajectories
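
A usage sketch for this helper; the random policy below is just a stand-in (in practice `expert` would be a trained stable-baselines policy):

venv = util.make_vec_env('Pendulum-v0', 2)
placeholder_expert = RandomPolicy(venv.observation_space, venv.action_space)
trajs = sample_trajectories(venv, placeholder_expert, n_episodes=5)
assert len(trajs) >= 5  # at least five complete episodes are returned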
Example #3
    def test_policy(self, *, min_episodes: int = 10) -> dict:
        """Test current imitation policy on environment & give some rollout stats.

        Args:
            min_episodes: Minimum number of rolled-out episodes.

        Returns:
            Rollout statistics collected by `imitation.util.rollout.rollout_stats()`.
        """
        trajs = rollout.generate_trajectories(
            self.policy,
            self.env,
            sample_until=rollout.min_episodes(min_episodes))
        reward_stats = rollout.rollout_stats(trajs)
        return reward_stats
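
For reference, `rollout.min_episodes(n)` builds the `sample_until` termination condition passed to `generate_trajectories`. A minimal sketch of the idea (not the library's exact implementation):

def min_episodes(n: int):
    def sample_until(trajectories) -> bool:
        # Stop sampling once at least `n` complete trajectories have been collected.
        return len(trajectories) >= n
    return sample_until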
Example #4
def test_complete_trajectories():
    """Check that complete trajectories are returned by vecenv wrapper,
  including the terminal observation."""
    n_episodes = 13
    max_acts = 5
    num_envs = 4
    vec_env = DummyVecEnv([lambda: TerminalSentinelEnv(max_acts)] * num_envs)
    policy = RandomPolicy(vec_env.observation_space, vec_env.action_space)
    trajectories = rollout.generate_trajectories(policy,
                                                 vec_env,
                                                 n_episodes=n_episodes)
    assert len(trajectories) >= n_episodes
    expected_obs = np.array([[0]] * max_acts + [[1]])
    for trajectory in trajectories:
        obs = trajectory['obs']
        act = trajectory['act']
        assert len(obs) == len(act) + 1
        assert np.all(obs == expected_obs)
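
`TerminalSentinelEnv` is defined elsewhere in the test suite. A plausible minimal reconstruction, consistent with the assertions above (observation `[0]` on every step until the final one, which emits the sentinel `[1]` with `done=True`):

import gym
import numpy as np


class TerminalSentinelEnv(gym.Env):
    def __init__(self, max_acts):
        self.max_acts = max_acts
        self.current_step = 0
        self.action_space = gym.spaces.Discrete(1)
        self.observation_space = gym.spaces.Box(np.array([0.0]), np.array([1.0]))

    def reset(self):
        self.current_step = 0
        return np.array([0])

    def step(self, action):
        self.current_step += 1
        done = self.current_step >= self.max_acts
        observation = np.array([1 if done else 0])
        return observation, 0.0, done, {}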
Example #5
    def test_policy(self, *, n_trajectories=10, true_reward=True):
        """Test current imitation policy on environment & give some rollout
    stats.

    Args:
      n_trajectories (int): number of rolled-out trajectories.
      true_reward (bool): should this use ground truth reward from underlying
        environment (True), or imitation reward (False)?

    Returns:
      dict: rollout statistics collected by
        `imitation.utils.rollout.rollout_stats()`.
    """
        self.imitation_trainer.set_env(self.venv)
        trajs = rollout.generate_trajectories(
            self.imitation_trainer,
            self.venv if true_reward else self.wrapped_env,
            sample_until=rollout.min_episodes(n_trajectories),
        )
        reward_stats = rollout.rollout_stats(trajs)
        return reward_stats
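
A usage sketch comparing ground-truth and imitation reward for the same policy (`trainer` is a hypothetical instance of the class this method belongs to):

true_stats = trainer.test_policy(n_trajectories=10, true_reward=True)
imit_stats = trainer.test_policy(n_trajectories=10, true_reward=False)
print(true_stats)  # stats under the environment's own reward
print(imit_stats)  # stats under the learned imitation reward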
Example #6
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
        _seed: Random seed generated by Sacred.
        env_name: Gym environment identifier.
        eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
            exactly one of `eval_n_timesteps` and `eval_n_episodes`.
        eval_n_episodes: Minimum number of episodes to evaluate for. Set
            exactly one of `eval_n_timesteps` and `eval_n_episodes`.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
            use `DummyVecEnv`.
        render: If True, renders interactively to the screen.
        render_fps: Target frames per second for interactive rendering.
        log_dir: The directory to log intermediate output to. (As of 2019-07-19
            this is just episode-by-episode reward from `bench.Monitor`.)
        policy_type: A unique identifier for the saved policy, defined in
            POLICY_CLASSES.
        policy_path: A path to the serialized policy.
        reward_type: If specified, overrides the environment reward with a
            serialized reward of this type.
        reward_path: If `reward_type` is specified, the path to a serialized
            reward of `reward_type` to override the environment reward with.
        max_episode_steps: If not None, environments are wrapped by TimeLimit
            so that they have at most `max_episode_steps` steps per episode.

    Returns:
        Return value of `imitation.util.rollout.rollout_stats()`.
    """

    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    venv = VecNormalize(venv, training=False, norm_reward=False)
    venv = venv.load(policy_path + "/vec_normalize.pkl", venv)

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
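
The returned stats dict can be consumed directly. A hedged usage sketch; the exact keys produced by `rollout_stats` (aggregates of episode return and length) may vary across `imitation` versions:

def log_stats(stats: dict) -> None:
    # Print every statistic returned by rollout_stats in a stable order.
    for key, value in sorted(stats.items()):
        tf.logging.info("%s = %s", key, value)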