def main():
    """
    Continue training a saved TRPO agent on ``UR5Gripper-v0`` and replay it.

    The function wraps the environment in a ``Monitor`` that logs to
    ``/tmp/gym/trpo_mpi/``, restores the model stored in ``trpo.pkl`` inside
    that directory, trains it for another 1e5 timesteps, writes it back to the
    same file and finally renders up to 100 episodes of at most 200 steps each.
    """
    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    checkpoint = model_path + "trpo.pkl"

    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)

    # Pass the environment to ``load`` so the restored model can be trained and
    # queried; the verbosity and tensorboard directory that used to be given to
    # a throw-away TRPO instance are forwarded as load-time overrides instead.
    # NOTE(review): assumes the checkpoint already exists - confirm for first runs.
    model = TRPO.load(checkpoint, env=env, verbose=1, tensorboard_log=model_path)
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(checkpoint)

    # Enjoy trained agent
    for _ in range(100):
        obs = env.reset()
        env.render()
        for _ in range(200):
            action, _states = model.predict(obs)
            obs, _reward, done, _info = env.step(action)
            env.render()
            if done:
                # A finished episode has to be reset before it is stepped again.
                break
class GAIL(ActorCriticRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    Thin wrapper around an internal :class:`TRPO` model: the GAIL specific
    settings are stored as attributes on that model and training, prediction,
    saving and loading are forwarded to it.

    .. warning::

        Images are not yet handled properly by the current implementation

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param expert_dataset: (ExpertDataset) the dataset manager
    :param hidden_size_adversary: (int) stored on the wrapped TRPO model under the same name
        (presumably the hidden dimension of the adversary network - confirm in TRPO)
    :param adversary_entcoeff: (float) stored on the wrapped TRPO model under the same name
        (presumably the entropy weight of the adversary loss - confirm in TRPO)
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param kwargs: extra keyword arguments handed unchanged to the TRPO constructor. The options
        documented for this class are:

        - gamma: (float) the discount value
        - timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
        - max_kl: (float) the Kullback-Leibler loss threshold
        - cg_iters: (int) the number of iterations for the conjugate gradient calculation
        - lam: (float) GAE factor
        - entcoeff: (float) the weight for the entropy loss
        - cg_damping: (float) the compute gradient dampening factor
        - vf_stepsize: (float) the value function stepsize
        - vf_iters: (int) the value function's number iterations for learning
        - full_tensorboard_log: (bool) enable additional logging when using tensorboard
          WARNING: this logging can take a lot of space quickly
    """

    def __init__(self, policy, env, expert_dataset=None,
                 hidden_size_adversary=100, adversary_entcoeff=1e-3,
                 g_step=3, d_step=1, d_stepsize=3e-4, verbose=0,
                 _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        # The wrapped model is never built here; setup_model() does that on demand.
        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)

        # GAIL specific configuration lives on the wrapped TRPO instance.
        gail_settings = {
            "using_gail": True,
            "expert_dataset": expert_dataset,
            "g_step": g_step,
            "d_step": d_step,
            "d_stepsize": d_stepsize,
            "hidden_size_adversary": hidden_size_adversary,
            "adversary_entcoeff": adversary_entcoeff,
        }
        for attr_name, attr_value in gail_settings.items():
            setattr(self.trpo, attr_name, attr_value)

        # Expose the environment held by the wrapped model on the wrapper too.
        self.env = self.trpo.env

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        """Intentionally empty: returns None, pretraining is delegated to the wrapped TRPO model."""

    def pretrain(self, dataset, n_epochs=10, learning_rate=1e-4,
                 adam_epsilon=1e-8, val_interval=None):
        """
        Forward the pretraining request to the wrapped TRPO model.

        All arguments are passed on by keyword without modification.

        :return: (GAIL) this instance
        """
        self.trpo.pretrain(
            dataset,
            n_epochs=n_epochs,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            val_interval=val_interval,
        )
        return self

    def set_env(self, env):
        """
        Set the environment on the wrapped TRPO model and mirror its ``env`` attribute here.

        :param env: the environment handed to ``TRPO.set_env``
        """
        wrapped = self.trpo
        wrapped.set_env(env)
        self.env = wrapped.env

    def setup_model(self):
        """Check that the policy is an ActorCriticPolicy subclass, then build the wrapped TRPO model."""
        assert issubclass(self.policy, ActorCriticPolicy), (
            "Error: the input policy for the GAIL model must be an "
            "instance of common.policies.ActorCriticPolicy."
        )
        self.trpo.setup_model()

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100,
              tb_log_name="GAIL", reset_num_timesteps=True):
        """
        Train through the wrapped TRPO model; an expert dataset must have been provided.

        The arguments are handed positionally, in the order of this signature, to ``TRPO.learn``.

        :return: (GAIL) this instance
        """
        assert self.trpo.expert_dataset is not None, "You must pass an expert dataset to GAIL for training"
        self.trpo.learn(total_timesteps, callback, seed, log_interval,
                        tb_log_name, reset_num_timesteps)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        """Return whatever the wrapped TRPO model's ``predict`` returns for the same arguments."""
        prediction = self.trpo.predict(observation, state=state, mask=mask,
                                       deterministic=deterministic)
        return prediction

    def action_probability(self, observation, state=None, mask=None, actions=None):
        """Return whatever the wrapped TRPO model's ``action_probability`` returns for the same arguments."""
        probabilities = self.trpo.action_probability(observation, state=state, mask=mask,
                                                     actions=actions)
        return probabilities

    def save(self, save_path):
        """
        Save the model by delegating to the wrapped TRPO model.

        :param save_path: the location passed to ``TRPO.save``
        """
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        """
        Rebuild a GAIL model from a saved file.

        The stored attributes, then ``kwargs``, are written into the wrapped TRPO
        model's ``__dict__`` before the environment is set and the model is built;
        the stored parameter values are assigned afterwards.

        :param load_path: the location passed to ``_load_from_file``
        :param env: the environment to attach to the loaded model (can be None)
        :param kwargs: attribute overrides applied to the wrapped TRPO model
        :return: (GAIL) the restored model
        """
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        trpo_attributes = model.trpo.__dict__
        trpo_attributes.update(data)
        trpo_attributes.update(kwargs)  # explicit overrides win over stored values
        model.set_env(env)
        model.setup_model()

        assign_ops = [variable.assign(saved_value)
                      for variable, saved_value in zip(model.trpo.params, params)]
        model.trpo.sess.run(assign_ops)

        return model
class GAIL(ActorCriticRLModel):
    """
    Generative Adversarial Imitation Learning (GAIL)

    Thin wrapper around an internal :class:`TRPO` model: the GAIL specific
    settings are stored as attributes on that model and training, prediction,
    saving and loading are forwarded to it.

    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param pretrained_weight: (str) the save location for the pretrained weights (defaults to False)
    :param hidden_size_adversary: (int) stored on the wrapped TRPO model under the same name
        (presumably the hidden dimension of the adversary network - confirm in TRPO)
    :param adversary_entcoeff: (float) stored on the wrapped TRPO model under the same name
        (presumably the entropy weight of the adversary loss - confirm in TRPO)
    :param expert_dataset: (Dset) the dataset manager
    :param save_per_iter: (int) the number of iterations before saving
    :param checkpoint_dir: (str) the location for saving checkpoints
    :param g_step: (int) number of steps to train policy in each epoch
    :param d_step: (int) number of steps to train discriminator in each epoch
    :param task_name: (str) the name of the task (can be None)
    :param d_stepsize: (float) the reward giver stepsize
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param kwargs: extra keyword arguments handed unchanged to the TRPO constructor. The options
        documented for this class are:

        - gamma: (float) the discount value
        - timesteps_per_batch: (int) the number of timesteps to run per batch (horizon)
        - max_kl: (float) the Kullback-Leibler loss threshold
        - cg_iters: (int) the number of iterations for the conjugate gradient calculation
        - lam: (float) GAE factor
        - entcoeff: (float) the weight for the entropy loss
        - cg_damping: (float) the compute gradient dampening factor
        - vf_stepsize: (float) the value function stepsize
        - vf_iters: (int) the value function's number iterations for learning
    """

    def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100,
                 adversary_entcoeff=1e-3, expert_dataset=None, save_per_iter=1,
                 checkpoint_dir="/tmp/gail/ckpt/", g_step=1, d_step=1, task_name="task_name",
                 d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs):
        super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False,
                         _init_setup_model=_init_setup_model)

        # The wrapped model is never built here; setup_model() does that on demand.
        self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs)

        # GAIL specific configuration lives on the wrapped TRPO instance.
        gail_settings = {
            "using_gail": True,
            "pretrained_weight": pretrained_weight,
            "expert_dataset": expert_dataset,
            "save_per_iter": save_per_iter,
            "checkpoint_dir": checkpoint_dir,
            "g_step": g_step,
            "d_step": d_step,
            "task_name": task_name,
            "d_stepsize": d_stepsize,
            "hidden_size_adversary": hidden_size_adversary,
            "adversary_entcoeff": adversary_entcoeff,
        }
        for attr_name, attr_value in gail_settings.items():
            setattr(self.trpo, attr_name, attr_value)

        if _init_setup_model:
            self.setup_model()

    def set_env(self, env):
        """
        Set the environment on this wrapper (through the parent class) and on the wrapped TRPO model.

        :param env: the environment handed to both ``set_env`` implementations
        """
        super().set_env(env)
        self.trpo.set_env(env)

    def setup_model(self):
        """
        Validate the configuration, then build the wrapped TRPO model.

        The policy has to be an ActorCriticPolicy subclass and this wrapper's
        action space has to be a ``gym.spaces.Box``.
        """
        assert issubclass(self.policy, ActorCriticPolicy), (
            "Error: the input policy for the GAIL model must be an "
            "instance of common.policies.ActorCriticPolicy."
        )
        has_box_actions = isinstance(self.action_space, gym.spaces.Box)
        assert has_box_actions, "Error: GAIL requires a continuous action space."

        self.trpo.setup_model()

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL"):
        """
        Train through the wrapped TRPO model.

        The arguments are handed positionally, in the order of this signature, to ``TRPO.learn``.

        :return: (GAIL) this instance
        """
        self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name)
        return self

    def predict(self, observation, state=None, mask=None, deterministic=False):
        """Return whatever the wrapped TRPO model's ``predict`` returns for the same arguments."""
        prediction = self.trpo.predict(observation, state, mask, deterministic=deterministic)
        return prediction

    def action_probability(self, observation, state=None, mask=None):
        """Return whatever the wrapped TRPO model's ``action_probability`` returns for the same arguments."""
        probabilities = self.trpo.action_probability(observation, state, mask)
        return probabilities

    def save(self, save_path):
        """
        Save the model by delegating to the wrapped TRPO model.

        :param save_path: the location passed to ``TRPO.save``
        """
        self.trpo.save(save_path)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        """
        Rebuild a GAIL model from a saved file.

        The stored attributes, then ``kwargs``, are written into the wrapped TRPO
        model's ``__dict__`` before the environment is set and the model is built;
        the stored parameter values are assigned afterwards.

        :param load_path: the location passed to ``_load_from_file``
        :param env: the environment to attach to the loaded model (can be None)
        :param kwargs: attribute overrides applied to the wrapped TRPO model
        :return: (GAIL) the restored model
        """
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=None, _init_setup_model=False)
        trpo_attributes = model.trpo.__dict__
        trpo_attributes.update(data)
        trpo_attributes.update(kwargs)  # explicit overrides win over stored values
        model.set_env(env)
        model.setup_model()

        assign_ops = [variable.assign(saved_value)
                      for variable, saved_value in zip(model.trpo.params, params)]
        model.trpo.sess.run(assign_ops)

        return model