def train(logdir, env, expert_path, seed, batch_size, lr, traj_limitation):
    # Behavior cloning: the log directory encodes trajectory limit, learning rate,
    # batch size and seed so runs are easy to tell apart.
    env_id = env
    logdir = logdir + '/bc/' + env_id + '/s-{}/l-{}-b-{}/seed-{}'.format(
        traj_limitation, lr, batch_size, seed)
    print(logdir, env, expert_path, seed)
    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    # Expert demonstrations, filtered by return and truncated to traj_limitation trajectories.
    expert = MADataSet(expert_path, ret_threshold=-10, traj_limitation=traj_limitation)

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    # Single-worker vectorized multi-agent environment.
    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed, int(2e7), batch_size=batch_size, lr=lr)
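
# A minimal usage sketch of the behavior-cloning entry point above. The scenario
# name, paths and hyperparameters are illustrative assumptions, not the
# repository's defaults.
if __name__ == '__main__':
    train(logdir='./logs',
          env='simple_spread',
          expert_path='./expert_trajs/simple_spread.pkl',
          seed=1,
          batch_size=1000,
          lr=1e-4,
          traj_limitation=200)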
def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv

    # Pin TensorFlow to a single intra-/inter-op thread per process.
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env(rank):
        env = gym.make('RoboSumo-Ant-vs-Ant-v0')
        env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        return env

    # Bind the rank as a default argument so each worker keeps its own value;
    # a bare `lambda: _make_env(i)` would capture the loop variable by reference
    # and give every worker the final rank.
    env = SubprocVecEnv([lambda rank=i: _make_env(rank) for i in range(num_cpu)],
                        is_multi_agent=True)
    env = MAVecNormalize(env)
    set_global_seeds(seed)
    policy = MlpPolicy
    expert = MADataSet('/atlas/u/tsong/Projects/imitation/ant-vs-ant.pkl')
    ppo2.learn(policy=policy, env=env, nsteps=batch // num_cpu, nminibatches=160,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1, ent_coef=0.0,
               lr=lr, cliprange=0.2, total_timesteps=num_timesteps,
               expert=expert, clone_iters=1000)
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu,
          expert_path, traj_limitation, ret_threshold, dis_lr,
          disc_type='decentralized', bc_iters=500, l2=0.1, d_iters=1, rew_scale=0.1):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    set_global_seeds(seed)
    # One environment per worker process.
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    print(num_cpu)

    policy_fn = CategoricalPolicy
    # Expert demonstrations, filtered by return threshold and truncated in size.
    expert = MADataSet(expert_path, ret_threshold=ret_threshold,
                       traj_limitation=traj_limitation, nobs_flag=True)
    learn(policy_fn, expert, env, env_id, seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, nsteps=timesteps_per_batch // num_cpu,
          lr=lr, ent_coef=0.0, dis_lr=dis_lr, disc_type=disc_type,
          bc_iters=bc_iters, identical=make_env.get_identical(env_id),
          l2=l2, d_iters=d_iters, rew_scale=rew_scale)
    env.close()
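
# A minimal usage sketch of the adversarial imitation-learning entry point above.
# Scenario name, paths and hyperparameters are illustrative assumptions, not the
# repository's defaults; omitted keyword arguments fall back to the function's defaults
# (disc_type='decentralized', bc_iters=500, l2=0.1, d_iters=1, rew_scale=0.1).
if __name__ == '__main__':
    train(logdir='./logs/gail',
          env_id='simple_spread',
          num_timesteps=int(5e7),
          lr=1e-4,
          timesteps_per_batch=1000,
          seed=1,
          num_cpu=8,
          expert_path='./expert_trajs/simple_spread.pkl',
          traj_limitation=200,
          ret_threshold=-10,
          dis_lr=1e-4)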
def train(logdir, env, expert_path, seed, max_episode_len):
    print(logdir, env, expert_path, seed, max_episode_len)
    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    # Expert demonstrations, filtered by return and capped at 200 trajectories.
    expert = MADataSet(expert_path, ret_threshold=-10, traj_limitation=200)
    env_id = env

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    # Single-worker vectorized multi-agent environment with a fixed episode length.
    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed)