def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    """Train a multi-agent PPO policy on ``env_id``.

    Builds ``num_cpu`` monitored, multi-agent-wrapped copies of the gym
    environment inside a ``SubprocVecEnv``, normalizes observations, and runs
    ``ppo2.learn`` for ``num_timesteps`` steps.

    Args:
        env_id: Gym environment id passed to ``gym.make``.
        num_timesteps: Total environment steps to train for.
        seed: Global RNG seed (numpy / random / TF via ``set_global_seeds``).
        num_cpu: Number of parallel worker processes (and env copies).
        batch: Total batch size per update; each worker collects
            ``batch // num_cpu`` steps.
        lr: Learning rate for PPO.
    """
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv
    # NOTE(review): `bench` and `logger` were used below but never imported
    # (NameError in the workers). baselines-style package layout assumed —
    # confirm the actual module path in this project.
    from rl import bench, logger
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf

    # Each worker process gets its own single-threaded TF session.
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env():
        # Wrap for multi-agent interface, then attach an episode monitor.
        env = gym.make(env_id)
        env = MAWrapper(env)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = SubprocVecEnv([_make_env for _ in range(num_cpu)],
                        is_multi_agent=True)
    env = MAVecNormalize(env)
    set_global_seeds(seed)

    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env,
               nsteps=batch // num_cpu,
               nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=lr, cliprange=0.2,
               total_timesteps=num_timesteps)
def train(env_id, num_timesteps, seed, num_cpu, batch, lr,
          expert_path='/atlas/u/tsong/Projects/imitation/ant-vs-ant.pkl'):
    """Train multi-agent PPO with an expert dataset (behavior-cloning warmup).

    NOTE(review): this redefines ``train`` and shadows the earlier definition
    in this module — consider renaming one of them.

    Args:
        env_id: Gym environment id passed to ``gym.make``. (The original
            ignored this and hard-coded 'RoboSumo-Ant-vs-Ant-v0'; it is now
            honored, so callers must pass the id they actually want.)
        num_timesteps: Total environment steps to train for.
        seed: Global RNG seed.
        num_cpu: Number of parallel worker processes (and env copies).
        batch: Total batch size per update (``batch // num_cpu`` per worker).
        lr: Learning rate for PPO.
        expert_path: Pickle file with expert trajectories for ``MADataSet``.
            Defaults to the original hard-coded path for compatibility.
    """
    import os
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv
    # NOTE(review): `bench`, `logger`, and `MADataSet` were used below but
    # never imported (NameError). baselines-style layout assumed for
    # bench/logger; confirm MADataSet's home module in this project.
    from rl import bench, logger
    from rl.common.ma_dataset import MADataSet
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf

    # Each worker process gets its own single-threaded TF session.
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def _make_env(rank):
        # Per-rank Monitor file so parallel workers don't clobber each other.
        env = gym.make(env_id)
        env = bench.Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        return env

    # BUG FIX: the original used `lambda: _make_env(i)`, a late-binding
    # closure — every worker captured the final value of `i`, so all ranks
    # were num_cpu - 1 and their Monitor logs collided. Binding `i` as a
    # default argument freezes the rank per lambda.
    env = SubprocVecEnv([lambda rank=i: _make_env(rank)
                         for i in range(num_cpu)],
                        is_multi_agent=True)
    env = MAVecNormalize(env)
    set_global_seeds(seed)

    policy = MlpPolicy
    expert = MADataSet(expert_path)
    ppo2.learn(policy=policy, env=env,
               nsteps=batch // num_cpu,
               nminibatches=160,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=lr, cliprange=0.2,
               total_timesteps=num_timesteps,
               expert=expert, clone_iters=1000)