def test_density_reward(density_type, is_stationary):
  # test on Pendulum rather than Cartpole because I don't handle episodes that
  # terminate early yet (see issue #40)
  env_name = 'Pendulum-v0'
  env = util.make_vec_env(env_name, 2)

  # construct density-based reward from expert rollouts
  with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
            "rb") as f:
    expert_trajectories_all = pickle.load(f)
  n_experts = len(expert_trajectories_all)
  expert_trajectories_train = expert_trajectories_all[:n_experts // 2]
  reward_fn = DensityReward(trajectories=expert_trajectories_train,
                            density_type=density_type,
                            kernel='gaussian',
                            obs_space=env.observation_space,
                            act_space=env.action_space,
                            is_stationary=is_stationary,
                            kernel_bandwidth=0.2,
                            standardise_inputs=True)

  # check that expert policy does better than a random policy under our
  # reward function
  random_policy = RandomPolicy(env.observation_space, env.action_space)
  sample_until = rollout.min_episodes(n_experts // 2)
  random_trajectories = rollout.generate_trajectories(
      random_policy, env, sample_until=sample_until)
  expert_trajectories_test = expert_trajectories_all[n_experts // 2:]
  random_score = score_trajectories(random_trajectories, reward_fn)
  expert_score = score_trajectories(expert_trajectories_test, reward_fn)
  assert expert_score > random_score
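The test above relies on a `score_trajectories` helper that is not shown in
this section. A minimal sketch of what such a helper might look like,
assuming each trajectory is a dict with 'obs' (length T+1) and 'act'
(length T) arrays as in the tests here, and that `reward_fn` accepts batched
`(obs, act, next_obs, steps)` inputs; this is a hypothetical reconstruction,
not the repository's actual implementation:

import numpy as np


def score_trajectories(trajectories, reward_fn):
  """Return the mean per-step reward of `trajectories` under `reward_fn`.

  Hypothetical sketch: assumes `reward_fn(obs, act, next_obs, steps)` takes
  batched inputs and returns one scalar reward per transition.
  """
  total_reward = 0.0
  total_steps = 0
  for traj in trajectories:
    obs, act = traj['obs'], traj['act']
    # Time indices matter when the reward is non-stationary.
    steps = np.arange(len(act))
    rewards = reward_fn(obs[:-1], act, obs[1:], steps)
    total_reward += float(np.sum(rewards))
    total_steps += len(act)
  return total_reward / total_steps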
def sample_trajectories(env, expert, n_episodes=None, n_timesteps=None):
  """Roll out `expert` in `env` until the requested sample size is reached.

  Defaults to 20 episodes if neither `n_episodes` nor `n_timesteps` is set.
  """
  if n_episodes is None and n_timesteps is None:
    n_episodes = 20
  expert_trajectories = generate_trajectories(
      expert,
      env,
      sample_until=make_sample_until(n_episodes=n_episodes,
                                     n_timesteps=n_timesteps),
  )
  return expert_trajectories
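A possible call site, assuming `venv` is a vectorized environment and
`expert_policy` a trained policy; both names are illustrative:

# Sample at least 20 complete episodes (the default when neither
# n_episodes nor n_timesteps is given).
expert_trajectories = sample_trajectories(venv, expert_policy)

# Or bound the collection by total timesteps instead of episode count.
short_trajectories = sample_trajectories(venv, expert_policy,
                                         n_timesteps=1000)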
def test_policy(self, *, min_episodes: int = 10) -> dict:
  """Test current imitation policy on environment & give some rollout stats.

  Args:
    min_episodes: Minimum number of rolled-out episodes.

  Returns:
    Rollout statistics collected by `imitation.util.rollout.rollout_stats()`.
  """
  trajs = rollout.generate_trajectories(
      self.policy,
      self.env,
      sample_until=rollout.min_episodes(min_episodes))
  reward_stats = rollout.rollout_stats(trajs)
  return reward_stats
def test_complete_trajectories():
  """Check that complete trajectories are returned by the vecenv wrapper,
  including the terminal observation."""
  n_episodes = 13
  max_acts = 5
  num_envs = 4
  vec_env = DummyVecEnv([lambda: TerminalSentinelEnv(max_acts)] * num_envs)
  policy = RandomPolicy(vec_env.observation_space, vec_env.action_space)
  trajectories = rollout.generate_trajectories(
      policy, vec_env, sample_until=rollout.min_episodes(n_episodes))
  assert len(trajectories) >= n_episodes
  expected_obs = np.array([[0]] * max_acts + [[1]])
  for trajectory in trajectories:
    obs = trajectory['obs']
    act = trajectory['act']
    assert len(obs) == len(act) + 1
    assert np.all(obs == expected_obs)
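`TerminalSentinelEnv` is a test fixture defined elsewhere. From the
assertions above (`max_acts` observations of 0 followed by a terminal
observation of 1), it behaves roughly like the following sketch, assuming the
usual `gym.Env` interface; this is a reconstruction for illustration, not
necessarily the fixture's exact code:

import gym
import numpy as np
from gym import spaces


class TerminalSentinelEnv(gym.Env):
  """Emits observation [0] for `max_acts` steps, then terminal obs [1]."""

  def __init__(self, max_acts):
    self.max_acts = max_acts
    self.current_step = 0
    self.action_space = spaces.Discrete(1)
    self.observation_space = spaces.Box(low=0, high=1, shape=(1,))

  def reset(self):
    self.current_step = 0
    return np.array([0])

  def step(self, action):
    self.current_step += 1
    done = self.current_step >= self.max_acts
    # The sentinel value 1 only ever appears as the terminal observation,
    # which is what `expected_obs` in the test checks for.
    obs = np.array([1 if done else 0])
    return obs, 0.0, done, {}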
def test_policy(self, *, n_trajectories=10, true_reward=True):
  """Test current imitation policy on environment & give some rollout stats.

  Args:
    n_trajectories (int): Minimum number of rolled-out trajectories.
    true_reward (bool): Should this use the ground-truth reward from the
      underlying environment (True), or the imitation reward (False)?

  Returns:
    dict: Rollout statistics collected by
      `imitation.util.rollout.rollout_stats()`.
  """
  self.imitation_trainer.set_env(self.venv)
  trajs = rollout.generate_trajectories(
      self.imitation_trainer,
      self.venv if true_reward else self.wrapped_env,
      sample_until=rollout.min_episodes(n_trajectories),
  )
  reward_stats = rollout.rollout_stats(trajs)
  return reward_stats
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
  """Rolls a policy out in an environment, collecting statistics.

  Args:
    _seed: Generated by Sacred.
    env_name: Gym environment identifier.
    eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
      exactly one of `eval_n_episodes` and `eval_n_timesteps`.
    eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
      one of `eval_n_episodes` and `eval_n_timesteps`.
    num_vec: Number of environments to run simultaneously.
    parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
      uses `DummyVecEnv`.
    render: If True, renders interactively to the screen.
    render_fps: Target number of frames rendered per second when `render` is
      True.
    log_dir: The directory to log intermediate output to. (As of 2019-07-19
      this is just episode-by-episode reward from bench.Monitor.)
    policy_type: A unique identifier for the saved policy, defined in
      POLICY_CLASSES.
    policy_path: A path to the serialized policy.
    reward_type: If specified, overrides the environment reward with a reward
      of this type.
    reward_path: If reward_type is specified, the path to a serialized reward
      of `reward_type` to override the environment reward with.
    max_episode_steps: If not None, then environments are wrapped by
      TimeLimit so that they have at most `max_episode_steps` steps per
      episode.

  Returns:
    Return value of `imitation.util.rollout.rollout_stats()`.
  """
  os.makedirs(log_dir, exist_ok=True)
  sacred_util.build_sacred_symlink(log_dir, _run)
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.logging.info('Logging to %s', log_dir)

  sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
  venv = util.make_vec_env(env_name, num_vec, seed=_seed, parallel=parallel,
                           log_dir=log_dir,
                           max_episode_steps=max_episode_steps)
  venv = VecNormalize(venv, training=False, norm_reward=False)
  venv = venv.load(policy_path + "/vec_normalize.pkl", venv)

  if render:
    venv = InteractiveRender(venv, render_fps)
  # TODO(adam): add support for videos using VideoRecorder?

  with contextlib.ExitStack() as stack:
    if reward_type is not None:
      reward_fn_ctx = load_reward(reward_type, reward_path, venv)
      reward_fn = stack.enter_context(reward_fn_ctx)
      venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
      tf.logging.info(
          f"Wrapped env in reward {reward_type} from {reward_path}.")

    with serialize.load_policy(policy_type, policy_path, venv) as policy:
      trajs = rollout.generate_trajectories(policy, venv, sample_until)
      return rollout.rollout_stats(trajs)
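`rollout.make_sample_until` converts the `eval_n_timesteps` /
`eval_n_episodes` arguments into a termination predicate that
`generate_trajectories` consults after each batch of completed episodes. A
minimal sketch of that behavior, assuming the predicate receives the list of
trajectories collected so far; this is illustrative and may differ from the
library's exact implementation:

def make_sample_until(n_timesteps=None, n_episodes=None):
  """Build a termination predicate for `generate_trajectories` (sketch)."""
  if n_timesteps is None and n_episodes is None:
    raise ValueError("Set at least one of n_timesteps and n_episodes.")
  conditions = []
  if n_episodes is not None:
    # Stop once enough complete episodes have been collected.
    conditions.append(lambda trajs: len(trajs) >= n_episodes)
  if n_timesteps is not None:
    # Stop once the total number of transitions reaches the threshold.
    conditions.append(
        lambda trajs: sum(len(t['act']) for t in trajs) >= n_timesteps)
  return lambda trajs: all(cond(trajs) for cond in conditions)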