Example #1
def train(logdir, env, expert_path, seed, batch_size, lr, traj_limitation):
    env_id = env
    logdir = logdir + '/bc/' + env_id + '/s-{}/l-{}-b-{}/seed-{}'.format(
        traj_limitation, lr, batch_size, seed)
    print(logdir, env, expert_path, seed)
    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])
    expert = MADataSet(expert_path,
                       ret_threshold=-10,
                       traj_limitation=traj_limitation)

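    # create_env returns a thunk so that each SubprocVecEnv worker constructs
    # and seeds its own environment inside its own subprocess.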
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)

    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed, int(2e7), batch_size=batch_size, lr=lr)
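
A minimal invocation sketch for Example #1, assuming the module-level names it relies on (logger, bench, make_env, MADataSet, SubprocVecEnv, CategoricalPolicy, learn) are imported in the surrounding file; the scenario name, paths, and hyperparameters below are hypothetical placeholders.

if __name__ == '__main__':
    # Hypothetical arguments for a behavioral-cloning run on a particle-world scenario.
    train(logdir='./results', env='simple_spread',
          expert_path='./experts/simple_spread.pkl',
          seed=1, batch_size=1000, lr=1e-4, traj_limitation=200)
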
Example #2
def train(env_id, num_timesteps, seed, num_cpu, batch, lr):
    from rl.common import set_global_seeds
    from rl.common.vec_env.vec_normalize import MAVecNormalize
    from rl.common.ma_wrappers import MAWrapper
    from sandbox.mppo import ppo2
    from sandbox.mppo.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from rl.common.vec_env.subproc_vec_env import SubprocVecEnv
    # bench, logger, os, and MADataSet are assumed to be imported at module level.
    ncpu = 1  # threads for the TensorFlow session; rollout parallelism comes from SubprocVecEnv
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()  # enter a default session for the rest of training

    def _make_env(rank):
        env = gym.make('RoboSumo-Ant-vs-Ant-v0')
        env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        return env

    # Bind the loop variable as a default argument so each worker gets its own rank
    # (a bare `lambda: _make_env(i)` would capture `i` late and give every worker the same rank).
    env = SubprocVecEnv([lambda rank=i: _make_env(rank) for i in range(num_cpu)],
                        is_multi_agent=True)
    env = MAVecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    expert = MADataSet('/atlas/u/tsong/Projects/imitation/ant-vs-ant.pkl')
    ppo2.learn(policy=policy, env=env, nsteps=batch // num_cpu, nminibatches=160,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0, lr=lr, cliprange=0.2,
               total_timesteps=num_timesteps, expert=expert, clone_iters=1000)
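
A hedged usage sketch for Example #2 with hypothetical hyperparameters; note that the function ignores the env_id argument and always builds 'RoboSumo-Ant-vs-Ant-v0', and that the expert pickle path is hard-coded inside the function.

if __name__ == '__main__':
    # Hypothetical settings; batch is the total number of timesteps collected per update.
    train(env_id='RoboSumo-Ant-vs-Ant-v0', num_timesteps=int(1e7), seed=0,
          num_cpu=8, batch=2048, lr=3e-4)
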
Example #3
def train(logdir,
          env_id,
          num_timesteps,
          lr,
          timesteps_per_batch,
          seed,
          num_cpu,
          expert_path,
          traj_limitation,
          ret_threshold,
          dis_lr,
          disc_type='decentralized',
          bc_iters=500,
          l2=0.1,
          d_iters=1,
          rew_scale=0.1):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])

    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)],
                        is_multi_agent=True)
    print(num_cpu)
    policy_fn = CategoricalPolicy
    expert = MADataSet(expert_path,
                       ret_threshold=ret_threshold,
                       traj_limitation=traj_limitation,
                       nobs_flag=True)
    learn(policy_fn,
          expert,
          env,
          env_id,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu,
          lr=lr,
          ent_coef=0.0,
          dis_lr=dis_lr,
          disc_type=disc_type,
          bc_iters=bc_iters,
          identical=make_env.get_identical(env_id),
          l2=l2,
          d_iters=d_iters,
          rew_scale=rew_scale)
    env.close()
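
A usage sketch for Example #3 with hypothetical values; the keyword arguments with defaults in the signature (disc_type, bc_iters, l2, d_iters, rew_scale) are left unchanged, and the scenario name and paths are placeholders.

if __name__ == '__main__':
    # Hypothetical settings for a multi-agent adversarial imitation run on a particle-world scenario.
    train(logdir='./results/gail', env_id='simple_spread', num_timesteps=int(5e6),
          lr=1e-4, timesteps_per_batch=1000, seed=1, num_cpu=8,
          expert_path='./experts/simple_spread.pkl', traj_limitation=200,
          ret_threshold=-10, dis_lr=1e-4)
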
Example #4
def train(logdir, env, expert_path, seed, max_episode_len):
    print(logdir, env, expert_path, seed, max_episode_len)
    logger.configure(logdir,
                     format_strs=['stdout', 'log', 'json', 'tensorboard'])
    expert = MADataSet(expert_path, ret_threshold=-10, traj_limitation=200)
    env_id = env

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir()
                                and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)

    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed)
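
A final sketch for Example #4, which fixes traj_limitation at 200 and ret_threshold at -10 inside the function; max_episode_len caps the episode length passed to make_env. The scenario name and paths below are hypothetical.

if __name__ == '__main__':
    train(logdir='./results/bc', env='simple_spread',
          expert_path='./experts/simple_spread.pkl',
          seed=1, max_episode_len=25)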