Example No. 1
    def train(self, num_frames, seed, policy, lrschedule, num_cpu):
        def make_env(rank):
            def _thunk():
                env = helpers.get_env_wrapper(render=FLAGS.render)
                env.seed(seed + rank)
                env = bench.Monitor(
                    env,
                    logger.get_dir() and os.path.join(
                        logger.get_dir(), "{}.monitor.json".format(rank)))
                gym.logger.setLevel(logging.WARN)
                return env

            return _thunk

        set_global_seeds(seed)
        env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
        if policy == 'cnn':
            policy_fn = CnnPolicy
        elif policy == 'lstm':
            policy_fn = LstmPolicy
        elif policy == 'lnlstm':
            policy_fn = LnLstmPolicy
        self.learn(policy_fn,
                   env,
                   seed,
                   total_timesteps=num_frames,
                   lrschedule=lrschedule,
                   nsteps=(1 if FLAGS.render else 5),
                   lr=FLAGS.lr)
        env.close()
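All of the examples here share the same shape: a make_env(rank) factory returns a zero-argument thunk that builds, seeds, and monitors one environment, and a list of those thunks is handed to SubprocVecEnv, which steps each copy in its own subprocess. Since the snippets omit their imports, the following is a minimal, self-contained sketch of that shared pattern, assuming classic gym and OpenAI baselines; the helper name build_vec_env is illustrative and does not come from any of the examples.

import logging
import os

import gym
from baselines import bench, logger
from baselines.common import set_global_seeds
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def build_vec_env(env_id, num_env, seed):
    """Return a SubprocVecEnv of num_env monitored, seeded copies of env_id."""
    def make_env(rank):
        def _thunk():
            # Each worker process constructs its own environment with a distinct seed.
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i) for i in range(num_env)])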
Example No. 2
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):

  def make_env(rank):

    def _thunk():
      if env_id == "TestEnv":
        env = TestEnv(renderer=renderer)  #gym.make(env_id)
      else:
        env = gym.make(env_id)
      env.seed(seed + rank)
      env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
      gym.logger.setLevel(logging.WARN)
      # only clip rewards when not evaluating
      return env

    return _thunk

  set_global_seeds(seed)
  env = SubprocVecEnv([make_env(i) for i in range(num_env)])

  env.reset()
  start = time.time()
  for i in range(num_timesteps):
    action = [env.action_space.sample() for _ in range(num_env)]
    env.step(action)
  stop = time.time()
  duration = stop - start
  fps = num_timesteps / duration if duration else 0
  env.close()
  return num_env, fps
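Because the function above returns (num_env, fps), a natural use is a small throughput sweep; the driver below is a hypothetical sketch that assumes the example's imports and its TestEnv are available, and it keeps the __main__ guard so the SubprocVecEnv worker processes can be spawned safely.

# Hypothetical throughput sweep over the train() helper above; assumes the
# example's imports and its TestEnv are importable from this module.
if __name__ == '__main__':
  for n in (1, 2, 4, 8):
    num_env, fps = train("TestEnv", num_timesteps=300, seed=0, num_env=n)
    print("num_env={:d}: {:.1f} steps/s".format(num_env, fps))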
Example No. 3
def train(env_id, save_name, num_timesteps, seed, policy, lrschedule,
          sil_update, sil_beta, num_env):
    policy_fn = CnnPolicy_grid
    # env_args = {'episode_life': False, 'clip_rewards': False}
    env = gym.make(env_id)
    obs = env.reset()
    ob_space = obs["image"].shape
    ac_space = env.action_space
    env.close()
    # print(env.observation_space)
    print('ob_space:', ob_space)
    print('num act:', ac_space)
    envs = [make_env(env_id, seed, i) for i in range(num_env)]
    if num_env > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    # obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])
    # print('obs_shape_stack:',obs_shape)
    learn(policy_fn,
          envs,
          seed,
          ob_space,
          ac_space,
          save_name=save_name,
          nsteps=5,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          lr=7e-4)

    envs.close()
Example No. 4
def show(env_id, num_timesteps, seed, policy):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            #env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            #gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(1)], render=True)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'autoencoder':
        policy_fn = AutoencoderPolicy

    if policy == 'autoencoder':
        auto_a2c.enjoy(policy_fn, policy, env, seed, num_timesteps)
    else:
        a2c.enjoy(policy_fn, policy, env, seed, num_timesteps)

    env.close()
Example No. 5
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    # divide by 4 due to frameskip, then do a little extra so episodes end
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          lrschedule=lrschedule)
    env.close()
Example No. 6
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return
    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
Example No. 7
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif policy == 'autoencoder':
        policy_fn = AutoencoderPolicy

    if policy == 'autoencoder':
        auto_a2c.learn(policy_fn, policy, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)
    else:
        a2c.learn(policy_fn, policy, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule)

    env.close()
Example No. 8
def train(env_id, num_frames, seed, load_path, num_cpu):
    num_timesteps = int(num_frames // 4)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          load_path=load_path,
          nprocs=num_cpu)
    env.close()
Example No. 9
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env = gym.wrappers.Monitor(
                env,
                directory='/home/vasu/Desktop/acktr_json',
                force=True,
                video_callable=False,
                write_upon_reset=True)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
Example No. 10
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):
    def make_env(rank):
        def _thunk():
            if env_id == "TestEnv":
                env = TestEnv(renderer=renderer)  #gym.make(env_id)
            else:
                env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            # only clip rewards when not evaluating
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_env)])

    env.reset()
    start = time.time()
    for i in range(num_timesteps):
        action = [env.action_space.sample() for _ in range(num_env)]
        env.step(action)
    stop = time.time()
    duration = stop - start
    fps = num_timesteps / duration if duration else 0
    env.close()
    return num_env, fps
Example No. 11
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu, perform, use_expert, save_networks, learn_time, expert_buffer_size):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return

    network_saving_dir = os.path.join('./saved_networks', env_id)+'/'
    if not os.path.exists(network_saving_dir):
        os.makedirs(network_saving_dir)

    learn(policy_fn, env, seed, env_id, learn_time, expert_buffer_size, perform, use_expert, save_networks, network_saving_dir, int(num_timesteps * 1.1), lrschedule=lrschedule)
    env.close()
Example No. 12
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'radlstm':
        policy_fn = RadLstmPolicy

    learn(policy_fn,
          env,
          seed,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule)
    env.close()
Example No. 13
def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu):
    def make_env(rank):
        def env_fn():
            print(rank)
            if num_cpu == 1:
                env = MarioEnv(num_steering_dir=0)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env

        return env_fn

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = AcerCnnPolicy
    elif policy == 'lstm':
        policy_fn = AcerLstmPolicy
    else:
        print("Policy {} not implemented".format(policy))
        return
    learn(policy_fn,
          env,
          seed,
          nsteps=50,
          total_timesteps=int(num_timesteps * 1.1),
          lrschedule=lrschedule,
          buffer_size=15000,
          gamma=0.95)
    env.close()
Example No. 14
def train(num_timesteps, seed, num_cpu):
    # TODO: this is ugly; handle it more cleanly
    def make_env(rank):
        def _thunk():
            print(rank)
            if num_cpu == 1:
                env = MarioEnv(num_steering_dir=11)
            else:
                env = MarioEnv(num_steering_dir=11, num_env=rank)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = OurAcktrPolicy
    learn(policy_fn,
          env,
          seed,
          nsteps=4,
          total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu,
          save_interval=10)
    env.close()
Example No. 15
def train(make_env,
          num_timesteps,
          seed,
          policy,
          lrschedule,
          num_cpu,
          vf_coef=0.5,
          ent_coef=0.01):
    def _make_env(rank):
        def _thunk():
            env = make_env()
            env.seed(seed + rank)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([_make_env(i) for i in range(num_cpu)])

    learn(policy,
          env,
          seed,
          nstack=1,
          total_timesteps=num_timesteps,
          lrschedule=lrschedule,
          vf_coef=vf_coef,
          ent_coef=ent_coef)
    env.close()
Example No. 16
def train():
    env_args = dict(map_name=FLAGS.map_name,
                    step_mul=FLAGS.step_mul,
                    game_steps_per_episode=0,
                    screen_size_px=(FLAGS.resolution, ) * 2,
                    minimap_size_px=(FLAGS.resolution, ) * 2,
                    visualize=FLAGS.visualize)

    envs = SubprocVecEnv(
        [partial(make_sc2env, id=i, **env_args) for i in range(FLAGS.n_envs)])
    policy_fn = FullyConvPolicy
    try:
        learn(
            policy_fn,
            envs,
            seed=1,
            total_timesteps=int(1e6) * FLAGS.frames,
            lrschedule=FLAGS.lrschedule,
            nstack=1,  #must be 1 for FullyConvPolicy above
            ent_coef=FLAGS.entropy_weight,
            vf_coef=FLAGS.value_weight,
            max_grad_norm=1.0,
            lr=FLAGS.learning_rate)
    except KeyboardInterrupt:
        pass

    envs.close()
Example No. 17
def train(env_id, num_frames, seed, num_cpu, save_interval, ckpt_dir):
    num_timesteps = int(num_frames / 4 * 1.1)

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            # check to ensure full action space is used
            assert env.action_space.n == 18, (
                "action space has {} actions, not the full 18-action set".format(
                    env.action_space.n))
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          nprocs=num_cpu,
          save_interval=save_interval,
          ckpt_dir=ckpt_dir)
    env.close()
Example No. 18
def train(env_id,
          num_frames,
          seed,
          nsteps,
          policy,
          lrschedule,
          num_cpu,
          model_path,
          lr=7e-4,
          pg_coef=1.0,
          ent_coef=0.01,
          vf_coef=0.5):
    num_timesteps = int(num_frames / 4)

    # divide by 4 due to frameskip
    def make_env(rank, isTraining=True):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)),
                allow_early_resets=(not isTraining))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env,
                                 episode_life=isTraining,
                                 clip_rewards=isTraining)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i, isTraining=True) for i in range(num_cpu)])
    eval_env = SubprocVecEnv(
        [make_env(num_cpu + i, isTraining=False) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn,
          env,
          eval_env,
          seed,
          nsteps=nsteps,
          total_timesteps=num_timesteps,
          lr=lr,
          pg_coef=pg_coef,
          ent_coef=ent_coef,
          vf_coef=vf_coef,
          lrschedule=lrschedule,
          model_path=model_path)
    eval_env.close()
    env.close()
Example No. 19
def train(env_id, policy_fn, num_timesteps, seed, num_cpu):
    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu)
    env.close()
Example No. 20
def train(model_name, num_processes, max_grad_norm, num_env_steps, log_dir, epoch, env_name, save_dir, use_linear_clip_decay):
  records = []
  envs = [make_env(rank = i) for i in range(num_processes)]
  replaybuffer = Buffer()
  if len(envs) > 1:
    envs = SubprocVecEnv(envs)
  else:
    envs = DummyVecEnv(envs)
  try:
    state_shape = envs.observation_space.shape[0]
    action_shape = envs.action_space.shape[0]
    model = model_dict[model_name](state_shape, action_shape)
    compute_loss = loss_dict[model_name]
    optimizer = torch.optim.Adam(model.parameters())
    state = envs.reset()
    returns = 0
    for t in range(num_env_steps//num_processes):
      action, log_prob = model.act(state)
      next_state, reward, done, info = envs.step(to_np(action))
      returns += reward
      replaybuffer.store(zip(state, to_np(action), to_np(log_prob), reward, next_state, 1 - done))
      for i, d in enumerate(done):
        if d:
          records.append((t * num_processes + i, returns[i]))
          if i==0:
            print(returns[0])
          returns[i] = 0
      state = next_state

      # update the model roughly every 500 environment steps
      if t % (500 // num_processes) == (500 // num_processes - 1):
        for _ in range(epoch):
          optimizer.zero_grad()
          loss = compute_loss(replaybuffer.sample(), model)
          loss.backward()
          nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
          optimizer.step()
        if model_name == 'PPO' or model_name == 'DPPO':
          replaybuffer.clear()

      if t % (num_env_steps//num_processes//10) == 0:
        i = t//(num_env_steps//num_processes//10)
        torch.save(model.state_dict(), os.path.join(save_dir, model_name,env_name, model_name+str(i)+'.pt'))
      if use_linear_clip_decay:
        update_linear_schedule(optimizer, t * num_processes)
    torch.save(model.state_dict(), os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt'))
    timesteps, sumofrewards = zip(*records)
    savemat(os.path.join(save_dir, model_name, env_name, 'returns.mat'),
            {'timesteps': timesteps, 'returns': sumofrewards})
  except Exception as e:
    traceback.print_exc()
  finally:
    envs.close()
Example No. 21
File: train.py Project: lhm3561/OSS
def train_acktr(env_id, num_timesteps, seed, num_cpu):
    """Train a acktr model.

    Parameters
    -------
    env_id: environment to train on
    num_timesteps: int
        number of env steps to optimizer for
    seed: int
        number of random seed
    num_cpu: int
        number of parallel agents

    """
    num_timesteps //= 4

    def make_env(rank):
        def _thunk():
            # 1. Create gym environment
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(
                    env,
                    os.path.join(logger.get_dir(),
                                 "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            # 2. Apply action space wrapper
            env = MarioActionSpaceWrapper(env)
            # 3. Apply observation space wrapper to reduce input size
            env = ProcessFrame84(env)

            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    policy_fn = CnnPolicy
    acktr_disc.learn(policy_fn,
                     env,
                     seed,
                     total_timesteps=num_timesteps,
                     nprocs=num_cpu,
                     save_interval=True,
                     lr=FLAGS.lr,
                     callback=acktr_callback)
    env.close()
Example No. 22
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
Example No. 24
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
Example No. 25
def train():
    # Fetch the requested environment set in flags.
    env_class = attrgetter(FLAGS.env)(sc2g.env)

    env_args = dict(
        map_name=FLAGS.map_name,
        feature_screen_size=FLAGS.screen_size,
        feature_minimap_size=FLAGS.minimap_size,
        visualize=FLAGS.visualize,
        save_replay_episodes=FLAGS.save_replay_episodes,
        replay_dir=FLAGS.replay_dir,
    )

    envs = SubprocVecEnv([
        partial(env_class.make_env, id=i, **env_args)
        for i in range(FLAGS.envs)
    ])

    policy_fn = CnnPolicy
    if FLAGS.policy == 'cnn':
        policy_fn = CnnPolicy
    elif FLAGS.policy == 'lstm':
        policy_fn = LstmPolicy
    elif FLAGS.policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    elif FLAGS.policy == 'fullyconv':
        policy_fn = FullyConvPolicy
    else:
        print("Invalid policy function! Defaulting to {}.".format(policy_fn))

    try:
        learn(policy_fn,
              envs,
              seed=1,
              total_timesteps=int(1e6 * FLAGS.max_timesteps),
              lrschedule=FLAGS.lrschedule,
              ent_coef=FLAGS.entropy_weight,
              vf_coef=FLAGS.value_weight,
              max_grad_norm=1.0,
              lr=FLAGS.learning_rate)
    except KeyboardInterrupt:
        pass

    print("Closing environment...")
    envs.close()
Example No. 26
class EnvironmentContext:
    def __init__(self,
                 *,
                 env_name=None,
                 make_env=None,
                 seed,
                 n_envs=1,
                 env_modifiers=list(),
                 vec_env_modifiers=list()):
        self.env_name = env_name
        if make_env is None:
            make_env = lambda: gym.make(self.env_name)
        self.make_env = make_env
        self.n_envs = n_envs
        self.env_modifiers = env_modifiers
        self.vec_env_modifiers = vec_env_modifiers
        self.seed = seed

    def __enter__(self):
        def make_env(i):
            def _thunk():
                env = self.make_env()
                env.seed(i)
                for fn in self.env_modifiers:
                    env = fn(env)
                env = bench.Monitor(env,
                                    logger.get_dir(),
                                    allow_early_resets=True)
                return env

            return _thunk

        set_global_seeds(self.seed)
        self.base_vec_env = SubprocVecEnv(
            [make_env(i + self.seed) for i in range(self.n_envs)])
        self.environments = self.base_vec_env
        for fn in self.vec_env_modifiers:
            self.environments = fn(self.environments)

        return self

    def __exit__(self, *args):
        self.base_vec_env.close()
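A hypothetical usage sketch for the context manager above; the environment name, seed, and number of workers are illustrative, and it assumes the same gym/baselines imports the class itself relies on.

# Illustrative only: wrap two CartPole instances and take one random step in each.
with EnvironmentContext(env_name='CartPole-v1', seed=0, n_envs=2) as ctx:
    obs = ctx.environments.reset()
    actions = [ctx.environments.action_space.sample() for _ in range(ctx.n_envs)]
    obs, rewards, dones, infos = ctx.environments.step(actions)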
Example No. 27
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    def make_env(rank):
        def _thunk():
            env_spec = gym.spec('ppaquette/DoomBasic-v0')
            env_spec.id = 'DoomBasic-v0'
            env = env_spec.make()
            env.seed(seed + rank)
            env = PreprocessImage((SkipWrapper(4)(ToDiscrete("minimal")(env))))
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return ScaleRewardEnv(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu, nstack=1)
    env.close()
Example No. 28
def train(env_id, num_timesteps, seed, num_cpu, save_interval=None, animate_interval=None):
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            gym.logger.setLevel(logging.WARN)
            #env = NoopResetEnv(env, noop_max=30)
            #env = MaxAndSkipEnv(env, skip=4)
            #if 'FIRE' in env.unwrapped.get_action_meanings():
            #    env = FireResetEnv(env)
            #env = WarpFrame(env)
            env = ClipRewardEnv(env)
            return env
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = FcnPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, save_interval=save_interval, animate_interval=animate_interval, env_id=env_id)
    env.close()
Example No. 29
def test_env_after_learn(algo):
    def make_env():
        env = gym.make('PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)
    network = cnn(one_dim_bias=True)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network=network,
          env=env,
          total_timesteps=0,
          load_path=None,
          seed=None)

    env.reset()
    env.close()
Example No. 30
def train(num_timesteps, seed, lrschedule, num_cpu):
    def make_env(rank):
        def _thunk():
            env = LearnToRunEnv(difficulty=(seed + rank) % 3)
            env.seed(seed + rank)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return env

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = MlpPolicy
    learn(policy_fn, env, seed, nsteps=5, nstack=2, total_timesteps=num_timesteps,
          gamma=0.995, vf_coef=0.5, ent_coef=0.0001, max_grad_norm=args.max_grad_norm,
          lr=args.lr, lrschedule='linear')  # constant lrschedule does much worse
    env.close()
Example No. 31
def main():
    """
    Example program using SubprocVecEnv
    """
    num_envs = 2
    env_name = 'BreakoutNoFrameskip-v4'

    env = SubprocVecEnv([
        lambda: env_instantiate_fn(env_name, seed) for seed in range(num_envs)
    ])
    obs = env.reset()

    print("After reset:")
    print(obs.shape)

    obs, rews, dones, infos = env.step([0, 0])

    print("After first action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([1, 0])

    print("After second action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    obs, rews, dones, infos = env.step([0, 1])

    print("After third action:")
    print(obs.shape)
    print(rews)
    print(dones)
    print(infos)

    env.close()
Example No. 32
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    # divide by 4 due to frameskip, then do a little extra so episodes end
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and 
                os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.close()
Example No. 33
def test(num_env_steps, num_processes, log_dir, env_name, model_name, save_dir):
  records = []
  epoch = 0
  envs = [make_env(rank = i) for i in range(num_processes)]
  if len(envs) > 1:
    envs = SubprocVecEnv(envs)
  else:
    envs = DummyVecEnv(envs)
  try:
    state_shape = envs.observation_space.shape[0]
    action_shape = envs.action_space.shape[0]
    model = model_dict[model_name](state_shape, action_shape)
    state_dict = torch.load(os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt'))
    model.load_state_dict(state_dict)
    state = envs.reset()
    returns = 0
    for t in range(num_env_steps//num_processes):
      action, log_prob = model.act(state)
      next_state, reward, done, info = envs.step(to_np(action))
      returns += reward
      for i, d in enumerate(done):
        if d:
          records.append(returns[i])
          returns[i] = 0
          epoch += 1
      if epoch >= 100:
        break
      state = next_state
    records = np.array(records)
    print("# of epoch: {0}".format(epoch))
    print("mean: {0}".format(np.mean(records)))
    print("std: {0}".format(np.std(records)))
    print("max: {0}".format(np.max(records)))
    print("min: {0}".format(np.min(records)))
    print("median: {0}".format(np.median(records)))
  except Exception as e:
    traceback.print_exc()
  finally:
    envs.close()
Example No. 34
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    # divide by 4 due to frameskip, then do a little extra so episodes end
    def make_env(rank):
        def _thunk():
            env_spec = gym.spec('ppaquette/DoomBasic-v0')
            env_spec.id = 'DoomBasic-v0'
            env = env_spec.make()
            env.seed(seed + rank)
            env = PreprocessImage((SkipWrapper(4)(ToDiscrete("minimal")(env))))
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return ScaleRewardEnv(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          lrschedule=lrschedule,
          lr=1e-4,
          nsteps=10,
          nstack=1)
    env.close()