def make_PPO2(env_name, num_vec):
    env = util.make_vec_env(env_name, num_vec)
    # TODO(adam): add support for wrapping env with VecNormalize
    # (This is non-trivial since we'd need to make sure it's also applied
    # when the policy is re-loaded to generate rollouts.)
    policy = util.make_blank_policy(env, verbose=1, init_tensorboard=True)
    return policy
def make_trainer():
    env_name = 'CartPole-v1'
    env = util.make_vec_env(env_name, 2)
    with open(ROLLOUT_PATH, "rb") as f:
        rollouts = pickle.load(f)
    rollouts = util.rollout.flatten_trajectories(rollouts)
    return bc.BCTrainer(env, expert_demos=rollouts)
def test_reward_overwrite():
    """Test that reward wrapper actually overwrites base rewards."""
    env_id = 'Pendulum-v0'
    num_envs = 3
    env = util.make_vec_env(env_id, num_envs)
    reward_fn = FunkyReward()
    wrapped_env = util.reward_wrapper.RewardVecEnvWrapper(env, reward_fn)
    policy = RandomPolicy(env.observation_space, env.action_space)
    default_stats = util.rollout.rollout_stats(policy, env, n_episodes=10)
    wrapped_stats = util.rollout.rollout_stats(policy, wrapped_env,
                                               n_episodes=10)
    # Pendulum-v0 always has negative rewards.
    assert default_stats['return_max'] < 0
    # Ours gives between 1 * traj_len and num_envs * traj_len reward
    # (trajectories all have a constant length of 200 in Pendulum).
    steps = wrapped_stats['len_mean']
    assert wrapped_stats['return_min'] == 1 * steps
    assert wrapped_stats['return_max'] == num_envs * steps

    # Check that the overwritten rewards are non-negative, while the original
    # reward recorded in `infos` is negative (all Pendulum rewards are
    # negative).
    rand_act, _, _, _ = policy.step(wrapped_env.reset())
    _, rew, _, infos = wrapped_env.step(rand_act)
    assert np.all(rew >= 0)
    assert np.all([info_dict['wrapped_env_rew'] < 0 for info_dict in infos])
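`FunkyReward` is defined elsewhere in the test module. A minimal sketch consistent with the assertions above (each of the `num_envs` parallel environments receives a constant per-step reward equal to its 1-based index; the exact call signature is an assumption) could be:

class FunkyReward:
    """Hypothetical reward fn: the i-th env (1-indexed) always gets reward i."""

    def __call__(self, obs, act, next_obs, steps=None):
        # One reward per parallel environment: 1, 2, ..., num_envs.
        return np.arange(len(obs)) + 1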
def test_density_reward(density_type, is_stationary):
    # Test on Pendulum rather than CartPole because I don't handle episodes
    # that terminate early yet (see issue #40).
    env_name = 'Pendulum-v0'
    env = util.make_vec_env(env_name, 2)

    # Construct density-based reward from expert rollouts.
    with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
              "rb") as f:
        expert_trajectories_all = pickle.load(f)
    n_experts = len(expert_trajectories_all)
    expert_trajectories_train = expert_trajectories_all[:n_experts // 2]
    reward_fn = DensityReward(trajectories=expert_trajectories_train,
                              density_type=density_type,
                              kernel='gaussian',
                              obs_space=env.observation_space,
                              act_space=env.action_space,
                              is_stationary=is_stationary,
                              kernel_bandwidth=0.2,
                              standardise_inputs=True)

    # Check that the expert policy does better than a random policy under
    # our reward function.
    random_policy = RandomPolicy(env.observation_space, env.action_space)
    sample_until = rollout.min_episodes(n_experts // 2)
    random_trajectories = rollout.generate_trajectories(
        random_policy, env, sample_until=sample_until)
    expert_trajectories_test = expert_trajectories_all[n_experts // 2:]
    random_score = score_trajectories(random_trajectories, reward_fn)
    expert_score = score_trajectories(expert_trajectories_test, reward_fn)
    assert expert_score > random_score
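`score_trajectories` is a helper from the same test module. A plausible sketch, assuming the reward function maps batched `(obs, act, next_obs, steps)` arrays to per-step rewards and that trajectories store one more `obs` entry than `acts` (the real helper may aggregate differently), is:

def score_trajectories(trajectories, reward_fn):
    # Mean return per trajectory under the learned reward (a sketch).
    returns = []
    for traj in trajectories:
        steps = np.arange(len(traj.acts))
        rewards = reward_fn(traj.obs[:-1], traj.acts, traj.obs[1:], steps)
        returns.append(np.sum(rewards))
    return np.mean(returns)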
def test_bc():
    env_id = 'CartPole-v1'
    env = util.make_vec_env(env_id, 2)
    rollouts = util.rollout.load_trajectories(
        "tests/data/rollouts/CartPole-v1*.pkl")
    rollouts = util.rollout.flatten_trajectories(rollouts)
    bc_trainer = bc.BCTrainer(env, expert_rollouts=rollouts)
    novice_stats = bc_trainer.test_policy()
    bc_trainer.train(n_epochs=40)
    good_stats = bc_trainer.test_policy()
    # Novice is bad.
    assert novice_stats["return_mean"] < 100.0
    # BC is okay but isn't perfect (for the purpose of this test).
    assert good_stats["return_mean"] > 200.0
def test_bc():
    env_name = 'CartPole-v1'
    env = util.make_vec_env(env_name, 2)
    with open("tests/data/expert_models/cartpole_0/rollouts/final.pkl",
              "rb") as f:
        rollouts = pickle.load(f)
    rollouts = util.rollout.flatten_trajectories(rollouts)
    bc_trainer = bc.BCTrainer(env, expert_demos=rollouts)
    novice_stats = bc_trainer.test_policy()
    bc_trainer.train(n_epochs=40)
    good_stats = bc_trainer.test_policy(min_episodes=25)
    # Novice is bad.
    assert novice_stats["return_mean"] < 80.0
    # BC is okay but isn't perfect (for the purpose of this test).
    assert good_stats["return_mean"] > 350.0
def rollouts_from_policy(
    _seed: int,
    *,
    num_vec: int,
    rollout_save_n_timesteps: int,
    rollout_save_n_episodes: int,
    log_dir: str,
    policy_path: str,
    policy_type: str = "ppo2",
    env_name: str = "CartPole-v1",
    parallel: bool = True,
    rollout_save_dir: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
) -> None:
    """Loads a saved policy and generates rollouts.

    The default save path is f"{log_dir}/rollouts/{env_name}.pkl". Change it
    to f"{rollout_save_dir}/{env_name}.pkl" by setting the `rollout_save_dir`
    param. Unlisted arguments are the same as in `rollouts_and_policy()`.

    Args:
        policy_type: Argument to `imitation.policies.serialize.load_policy`.
        policy_path: Argument to `imitation.policies.serialize.load_policy`.
            If not provided, then defaults to f"expert_models/{env_name}".
        rollout_save_dir: Rollout pickle is saved in this directory as
            f"{env_name}.pkl".
    """
    if rollout_save_dir is None:
        rollout_save_dir = osp.join(log_dir, "rollouts")

    venv = util.make_vec_env(env_name, num_vec, seed=_seed, parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    with serialize.load_policy(policy_type, policy_path, venv) as policy:
        os.makedirs(rollout_save_dir, exist_ok=True)
        util.rollout.save(
            rollout_save_dir, policy, venv,
            basename=env_name,
            n_timesteps=rollout_save_n_timesteps,
            n_episodes=rollout_save_n_episodes,
        )
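For illustration, a direct call might look as follows. In the actual scripts these arguments are typically injected from a config, and the values below are hypothetical:

rollouts_from_policy(
    _seed=0,
    num_vec=8,
    rollout_save_n_timesteps=2000,
    rollout_save_n_episodes=None,  # exactly one of the two criteria is set
    log_dir="output/expert_demos",
    policy_path="expert_models/CartPole-v1",
)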
def test_bc():
    env_id = 'CartPole-v1'
    policy_dir = gin.query_parameter('init_trainer.policy_dir')
    env = util.make_vec_env(env_id, 2)
    expert_algos = util.load_policy(env, basedir=policy_dir)
    if not expert_algos:
        raise ValueError(env)
    bc_trainer = bc.BCTrainer(env, expert_trainers=expert_algos,
                              n_expert_timesteps=2000)
    novice_stats = bc_trainer.test_policy()
    bc_trainer.train(n_epochs=40)
    good_stats = bc_trainer.test_policy()
    # Novice is bad.
    assert novice_stats["return_mean"] < 100.0
    # BC is okay but isn't perfect (for the purpose of this test).
    assert good_stats["return_mean"] > 200.0
def test_density_trainer(density_type, is_stationary):
    env_id = 'Pendulum-v0'
    rollouts = rollout.load_trajectories(
        f"tests/data/rollouts/{env_id}_*.pkl")
    env = util.make_vec_env(env_id, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(env,
                                     rollouts=rollouts,
                                     imitation_trainer=imitation_trainer,
                                     density_type=density_type,
                                     is_stationary=is_stationary,
                                     kernel='gaussian')
    novice_stats = density_trainer.test_policy()
    density_trainer.train_policy(2000)
    good_stats = density_trainer.test_policy()
    # Novice is bad.
    assert novice_stats["return_mean"] < -500
    # Density is also pretty bad, but shouldn't make things more than 50%
    # worse. It would be nice to have a less flaky/more meaningful test here.
    assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
def init_trainer(env_id, policy_dir, use_gail, use_random_expert=True,
                 num_vec=8, discrim_scale=False, discrim_kwargs={},
                 reward_kwargs={}, trainer_kwargs={}):
    """Builds a Trainer, ready to be trained on a vectorized environment
    and either expert rollout data or random rollout data.

    Args:
        env_id (str): The string id of a gym environment.
        use_gail (bool): If True, then train using GAIL. If False, then train
            using AIRL.
        policy_dir (str): The directory containing the pickled experts for
            generating rollouts. Only applicable if `use_random_expert` is
            False.
        use_random_expert (bool): If True, then use a blank (random) policy
            to generate rollouts. If False, then load an expert policy. Will
            crash if there is no expert policy in `policy_dir`.
        trainer_kwargs (dict): Arguments for the Trainer constructor.
        reward_kwargs (dict): Arguments for the `*RewardNet` constructor.
        discrim_kwargs (dict): Arguments for the `DiscrimNet*` constructor.
    """
    env = util.make_vec_env(env_id, num_vec)
    gen_policy = util.make_blank_policy(env, verbose=1)

    if use_random_expert:
        expert_policies = [gen_policy]
    else:
        expert_policies = util.load_policy(env, basedir=policy_dir)
        if expert_policies is None:
            raise ValueError(env)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=discrim_scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=discrim_scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(rn, **discrim_kwargs)

    trainer = Trainer(env, gen_policy, discrim,
                      expert_policies=expert_policies, **trainer_kwargs)
    return trainer
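A usage sketch for this version of `init_trainer`, mirroring the integration test `test_trained_policy_better_than_random` further down (the `train` call and `gen_policy` attribute are taken from that test; the directory name is hypothetical):

trainer = init_trainer("CartPole-v1", policy_dir="expert_models",
                       use_gail=True, use_random_expert=False)
trainer.train(n_epochs=200)
imitation_policy = trainer.gen_policy  # roll this out to evaluate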
def rollouts_from_policy(
    _run,
    _seed: int,
    *,
    num_vec: int,
    rollout_save_n_timesteps: int,
    rollout_save_n_episodes: int,
    log_dir: str,
    policy_path: str,
    policy_type: str,
    env_name: str,
    parallel: bool,
    rollout_save_path: str,
    max_episode_steps: Optional[int],
    dac: bool,
) -> None:
    """Loads a saved policy and generates rollouts.

    Unlisted arguments are the same as in `rollouts_and_policy()`.

    Args:
        policy_type: Argument to `imitation.policies.serialize.load_policy`.
        policy_path: Argument to `imitation.policies.serialize.load_policy`.
        rollout_save_path: Rollout pickle is saved to this path.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)

    venv = util.make_vec_env(env_name, num_vec, seed=_seed, parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps, dac=dac)

    with serialize.load_policy(policy_type, policy_path, venv) as policy:
        util.rollout.save(rollout_save_path, policy, venv, sample_until)
def test_density_trainer(density_type, is_stationary):
    env_name = 'Pendulum-v0'
    with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
              "rb") as f:
        rollouts = pickle.load(f)
    env = util.make_vec_env(env_name, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(env,
                                     rollouts=rollouts,
                                     imitation_trainer=imitation_trainer,
                                     density_type=density_type,
                                     is_stationary=is_stationary,
                                     kernel='gaussian')
    novice_stats = density_trainer.test_policy()
    density_trainer.train_policy(2000)
    good_stats = density_trainer.test_policy()
    # Novice is bad.
    assert novice_stats["return_mean"] < -500
    # Density is also pretty bad, but shouldn't make things more than 50%
    # worse. It would be nice to have a less flaky/more meaningful test here.
    assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = util.rollout.generate_transitions(random, venv,
                                                    n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update({
            net.obs_ph: transitions.obs,
            net.act_ph: transitions.acts,
            net.next_obs_ph: transitions.next_obs,
            net.labels_gen_is_one_ph: labels,
            net.log_policy_act_prob_ph: log_prob,
        })
        outputs['train'].append(net.policy_train_reward)
        outputs['test'].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
def test_trained_policy_better_than_random(use_gail, env='CartPole-v1',
                                           n_episodes=50):
    """
    Make sure that a generator policy trained to mimic expert demonstrations
    achieves higher reward than a random policy. In other words, perform a
    basic check on the imitation learning capabilities of AIRL and GAIL.
    """
    env = util.make_vec_env(env, 32)
    trainer = init_trainer(env, use_expert_rollouts=True, use_gail=use_gail)
    expert_policy = util.load_policy(env, basedir="expert_models")
    random_policy = util.make_blank_policy(env)
    if expert_policy is None:
        pytest.fail("Couldn't load expert_policy!")

    trainer.train(n_epochs=200)

    # Idea: Plot n_epochs vs generator reward.
    for _ in range(4):
        expert_ret = rollout.mean_return(expert_policy, env,
                                         n_episodes=n_episodes)
        gen_ret = rollout.mean_return(trainer.gen_policy, env,
                                      n_episodes=n_episodes)
        random_ret = rollout.mean_return(random_policy, env,
                                         n_episodes=n_episodes)
        print("expert return:", expert_ret)
        print("generator return:", gen_ret)
        print("random return:", random_ret)
        assert expert_ret > random_ret
        assert gen_ret > random_ret
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str = None,
    num_vec: int = 8,
    parallel: bool = False,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    At applicable training steps `step` (where step is either an integer or
    "final"):

        - Policies are saved to `{log_dir}/policies/{step}.pkl`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        normalize: If True, then rescale observations and reward.
        make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training
            is finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in
            every file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of training updates between saves.
            Has the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training
            is finished.
    """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir)
        vec_normalize = None
        if normalize:
            venv = vec_normalize = VecNormalize(venv)

        policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

        # Make callback to save intermediate artifacts during training.
        step = 0
        rollout_ok = rollout_save_interval > 0
        policy_ok = policy_save_interval > 0

        def callback(locals_: dict, _) -> bool:
            nonlocal step
            step += 1
            policy = locals_['self']

            if rollout_ok and step % rollout_save_interval == 0:
                util.rollout.save(
                    rollout_dir, policy, venv, step,
                    n_timesteps=rollout_save_n_timesteps,
                    n_episodes=rollout_save_n_episodes)
            if policy_ok and step % policy_save_interval == 0:
                output_dir = os.path.join(policy_dir, f'{step:05d}')
                serialize.save_stable_model(output_dir, policy, vec_normalize)
            return True

        policy.learn(total_timesteps, callback=callback)

        # Save final artifacts after training is complete.
        if rollout_save_final:
            util.rollout.save(
                rollout_dir, policy, venv, "final",
                n_timesteps=rollout_save_n_timesteps,
                n_episodes=rollout_save_n_episodes)
        if policy_save_final:
            output_dir = os.path.join(policy_dir, "final")
            serialize.save_stable_model(output_dir, policy, vec_normalize)
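The callback above follows the Stable Baselines `learn()` convention: it is invoked once per training update with the training loop's local and global namespaces, and returning False halts training. A minimal standalone sketch of that contract (the helper name is hypothetical):

def every_n_updates(n, fn):
    """Build a learn() callback that calls `fn(model)` every `n` updates."""
    step = 0

    def callback(locals_: dict, _globals: dict) -> bool:
        nonlocal step
        step += 1
        if step % n == 0:
            fn(locals_['self'])  # 'self' is the model being trained
        return True  # continue training

    return callback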
def train(_run, _seed: int, env_name: str, rollout_path: str,
          normalize: bool, normalize_kwargs: dict,
          n_expert_demos: Optional[int], log_dir: str,
          init_trainer_kwargs: dict, total_timesteps: int,
          n_episodes_eval: int, init_tensorboard: bool,
          checkpoint_interval: int, dac: bool,
          rollout_save_n_timesteps: int, rollout_save_n_episodes: int,
          num_vec: int, parallel: bool,
          max_episode_steps: Optional[int]) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
        - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/",
            where step is either the training epoch or "final".
        - Generator policies are saved to
            f"{log_dir}/checkpoints/{step}/gen_policy/".

    Args:
        _seed: Random seed.
        env_name: The environment to train in.
        rollout_path: Path to pickle containing list of Trajectories. Used as
            expert demonstrations.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them from `rollout_path`. If None, then use all
            available trajectories. If `n_expert_demos` is an `int`, then use
            exactly `n_expert_demos` trajectories, erroring if there aren't
            enough trajectories. If there are surplus trajectories, then use
            the first `n_expert_demos` trajectories and drop the rest.
        log_dir: Directory to save models and other logging to.
        init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
            used to initialize the trainer.
        total_timesteps: The number of transitions to sample from the
            environment during training.
        n_episodes_eval: The number of episodes to average over when
            calculating the average episode reward of the imitation policy
            for return.
        init_tensorboard: If True, then write tensorboard logs to
            `{log_dir}/sb_tb`.
        checkpoint_interval: Save the discriminator and generator models
            every `checkpoint_interval` epochs and after training is
            complete. If 0, then only save weights after training is
            complete. If <0, then don't save weights at all.

    Returns:
        A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts from the test-reward-wrapped
        environment, using the final policy (remember that the ground-truth
        reward can be recovered from the "monitor_return" key).
        "expert_stats" gives the return value of `rollout_stats()` on the
        expert demonstrations loaded from `rollout_path`.
    """
    total_timesteps = int(total_timesteps)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)

    # try:
    #     sacred_util.build_sacred_symlink(log_dir, _run)
    # except Exception:
    #     print("didn't build symlink")

    # Calculate stats for expert rollouts. Used for plot and return value.
    # with open(rollout_path, "rb") as f:
    #     expert_trajs = pickle.load(f)
    # if n_expert_demos is not None:
    #     assert len(expert_trajs) >= n_expert_demos
    #     expert_trajs = expert_trajs[:n_expert_demos]
    # expert_stats = util.rollout.rollout_stats(expert_trajs)

    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)

    with util.make_session():
        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir,
                                 max_episode_steps=max_episode_steps, dac=dac)
        vec_normalize = None
        venv = vec_normalize = VecNormalize(venv)

        gen_policy_path = os.path.join(log_dir, "checkpoints", "final",
                                       "gen_policy")
        with serialize.load_policy('ppo2', gen_policy_path, venv) as policy:
            util.rollout.save(gen_policy_path, policy, venv, sample_until)
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[rollout.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on a vectorized
    environment and expert demonstrations.

    Args:
        env_name: The string id of a gym environment.
        expert_trajectories: Demonstrations from expert.
        seed: Random seed.
        log_dir: Directory for logging output. Will generate a unique
            sub-directory within this directory for all output.
        use_gail: If True, then train using GAIL. If False, then train
            using AIRL.
        num_vec: The number of vectorized environments.
        parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
        max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper
            with this episode length before returning.
        scale: If True, then scale input Tensors to the interval [0, 1].
        airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
            argument of `DiscrimNetAIRL.__init__`.
        trainer_kwargs: Arguments for the Trainer constructor.
        reward_kwargs: Arguments for the `*RewardNet` constructor.
        discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
        init_rl_kwargs: Keyword arguments passed to `init_rl`, used to
            initialize the RL algorithm.
    """
    util.logger.configure(folder=log_dir,
                          format_strs=['tensorboard', 'stdout'])
    env = util.make_vec_env(env_name, num_vec, seed=seed, parallel=parallel,
                            log_dir=log_dir,
                            max_episode_steps=max_episode_steps)
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = util.rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env, gen_policy, discrim, expert_demos,
                                 log_dir=log_dir, **trainer_kwargs)
    return trainer
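A sketch of how this variant might be driven, loading demonstrations from a pickle as in the tests above (the pickle path is reused from those tests; the `train` call is an assumption based on `test_trained_policy_better_than_random`, and the log directory is hypothetical):

with open("tests/data/expert_models/cartpole_0/rollouts/final.pkl",
          "rb") as f:
    expert_trajectories = pickle.load(f)
trainer = init_trainer("CartPole-v1", expert_trajectories,
                       log_dir="output/airl", use_gail=False)
trainer.train(n_epochs=200)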
def init_trainer(
    env_id: str,
    rollout_glob: str,
    *,
    n_expert_demos: Optional[int] = None,
    seed: int = 0,
    log_dir: str = None,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_n_files: int = 1,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    make_blank_policy_kwargs: dict = {},
):
    """Builds a Trainer, ready to be trained on a vectorized environment
    and expert demonstrations.

    Args:
        env_id: The string id of a gym environment.
        rollout_glob: Argument for
            `imitation.util.rollout.load_trajectories`.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them via `load_trajectories`. If None, then use all
            available trajectories. If `n_expert_demos` is an `int`, then use
            exactly `n_expert_demos` trajectories, erroring if there aren't
            enough trajectories. If there are surplus trajectories, then use
            the first `n_expert_demos` trajectories and drop the rest.
        seed: Random seed.
        log_dir: Directory for logging output.
        use_gail: If True, then train using GAIL. If False, then train
            using AIRL.
        num_vec: The number of vectorized environments.
        parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
        max_n_files: If provided, then only load the most recent
            `max_n_files` files, as sorted by modification times.
        scale: If True, then scale input Tensors to the interval [0, 1].
        airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
            argument of `DiscrimNetAIRL.__init__`.
        trainer_kwargs: Arguments for the Trainer constructor.
        reward_kwargs: Arguments for the `*RewardNet` constructor.
        discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
        make_blank_policy_kwargs: Keyword arguments passed to
            `make_blank_policy`, used to initialize the generator policy.
    """
    env = util.make_vec_env(env_id, num_vec, seed=seed, parallel=parallel,
                            log_dir=log_dir)
    gen_policy = util.init_rl(env, verbose=1, **make_blank_policy_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = util.rollout.load_trajectories(rollout_glob,
                                                  max_n_files=max_n_files)
    if n_expert_demos is not None:
        assert len(expert_demos) >= n_expert_demos
        expert_demos = expert_demos[:n_expert_demos]

    expert_rollouts = util.rollout.flatten_trajectories(expert_demos)[:3]
    trainer = Trainer(env, gen_policy, discrim, expert_rollouts,
                      **trainer_kwargs)
    return trainer
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    At applicable training steps `step` (where step is either an integer or
    "final"):

        - Policies are saved to `{log_dir}/policies/{step}.pkl`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
        reward_type: If provided, then load the serialized reward of this
            type, wrapping the environment in this reward. This is useful to
            test whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.
        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training
            is finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in
            every file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of training updates between saves.
            Has the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training
            is finished.
    """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} "
                    f"from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv)

            policy = util.init_rl(venv, verbose=1,
                                  **make_blank_policy_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if (rollout_save_interval > 0
                        and step % rollout_save_interval == 0):
                    util.rollout.save(rollout_dir, policy, venv, step,
                                      n_timesteps=rollout_save_n_timesteps,
                                      n_episodes=rollout_save_n_episodes)
                if (policy_save_interval > 0
                        and step % policy_save_interval == 0):
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                return True  # Continue training.

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                util.rollout.save(rollout_dir, policy, venv, "final",
                                  n_timesteps=rollout_save_n_timesteps,
                                  n_episodes=rollout_save_n_episodes)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy,
                                            vec_normalize)
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_interval: int,
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
        At applicable training steps `step` (where step is either an integer
        or "final"):

        - Policies are saved to `{log_dir}/policies/{step}/`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.
        n_episodes_eval: The number of episodes to average over when
            calculating the average ground truth reward return of the final
            policy.
        reward_type: If provided, then load the serialized reward of this
            type, wrapping the environment in this reward. This is useful to
            test whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.
        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training
            is finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in
            every file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of training updates between saves.
            Has the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training
            is finished.
        init_tensorboard: If True, then write tensorboard logs to
            {log_dir}/sb_tb and "output/summary/...".

    Returns:
        The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)
    eval_sample_until = util.rollout.min_episodes(n_episodes_eval)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} "
                    f"from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

            policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if (rollout_save_interval > 0
                        and step % rollout_save_interval == 0):
                    save_path = osp.join(rollout_dir, f"{step}.pkl")
                    util.rollout.save(save_path, policy, venv, sample_until)
                if (policy_save_interval > 0
                        and step % policy_save_interval == 0):
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                return True

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                save_path = osp.join(rollout_dir, "final.pkl")
                util.rollout.save(save_path, policy, venv, sample_until)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy,
                                            vec_normalize)

            # Final evaluation of expert policy.
            trajs = util.rollout.generate_trajectories(
                policy, venv, eval_sample_until)
            stats = util.rollout.rollout_stats(trajs)

    return stats
def test_discrim_net_no_crash(session, env_name, discrim_net_cls):
    # If parallel=True, codecov sometimes acts up.
    venv = util.make_vec_env(env_name, parallel=False)
    DISCRIM_NET_SETUPS[discrim_net_cls](venv)