Example #1
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
Example #2
def train(env_id, num_frames, seed, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            if logger.get_dir():
                env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    policy_fn = CnnPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu)
    env.close()
Example #3
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'):

  def make_env(rank):

    def _thunk():
      if env_id == "TestEnv":
        env = TestEnv(renderer=renderer)  #gym.make(env_id)
      else:
        env = gym.make(env_id)
      env.seed(seed + rank)
      env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
      gym.logger.setLevel(logging.WARN)
      # only clip rewards when not evaluating
      return env

    return _thunk

  set_global_seeds(seed)
  env = SubprocVecEnv([make_env(i) for i in range(num_env)])

  env.reset()
  start = time.time()
  for i in range(num_timesteps):
    action = [env.action_space.sample() for _ in range(num_env)]
    env.step(action)
  stop = time.time()
  duration = (stop - start)
  if (duration):
    fps = num_timesteps / duration
  else:
    fps = 0
  env.close()
  return num_env, fps
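
A possible driver for this benchmark, sweeping the number of parallel workers, might look like the following (a sketch; 'CartPole-v1' is only an illustrative env id):

if __name__ == '__main__':
  # measure how throughput scales with the number of parallel worker processes
  for n in (1, 2, 4, 8):
    n_env, fps = train('CartPole-v1', num_timesteps=300, seed=0,
                       num_env=n, renderer='tiny')
    print('{} envs -> {:.1f} fps'.format(n_env, fps))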
Example #4
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1) 
    # divide by 4 due to frameskip, then add a little extra so episodes end
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env, logger.get_dir() and 
                os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)
        return _thunk
    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule)
    env.close()
Example #5
def register_and_create_Envs(id_tmp_dir, seed, environment, rl_setting):
    """
    Register environment, create vector of n_e environments and return it.

    Args:
        id_tmp_dir (str): Working directory.
        All other args are automatically provided by sacred by passing the equally named
        configuration variables that are either defined in the yaml files or the command line.

    """
    if environment['entry_point']:
        try:
            register(
                id=environment['name'],
                entry_point=environment['entry_point'],
                kwargs=environment['config'],
                max_episode_steps=environment['max_episode_steps']
            )
        except Exception:
            pass

    envs = [make_env(environment['name'], seed, i, id_tmp_dir,
                     frameskips_cases=environment['frameskips_cases'])
            for i in range(rl_setting['num_processes'])]

    # Vectorise envs
    if rl_setting['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Normalise rewards. Unnecessary for Atari, unwanted for Mountain Hike.
    # Probably useful for MuJoCo?
    # if len(envs.observation_space.shape) == 1:
    if environment['vec_norm']:
        envs = VecNormalize(envs)

    return envs
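
For illustration, the environment and rl_setting dictionaries this function expects might look like this (hypothetical values; only the keys accessed above are shown, and the entry_point module path is made up):

environment = {
    'name': 'MountainHike-v0',                            # id registered and passed to make_env
    'entry_point': 'envs.mountain_hike:MountainHikeEnv',  # hypothetical module path
    'config': {},                                         # kwargs forwarded to the env constructor
    'max_episode_steps': 200,
    'frameskips_cases': [1],
    'vec_norm': False,                                    # True would wrap the result in VecNormalize
}
rl_setting = {
    'num_processes': 4,                                   # >1 selects SubprocVecEnv, else DummyVecEnv
}
envs = register_and_create_Envs('/tmp/experiment', 0, environment, rl_setting)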
Example #6
def make_doom_env(num_env, seed, name):
    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            if name == 'shoot':
                env = ShootEnv()
                env.seed(rank)
            elif name == 'navi':
                env = NaviEnv()
                env.seed(rank)
            elif name == 'consnavi':
                env = ConservativeNaviEnv()
                env.seed(rank)
            elif name == 'mixed':
                env = MixedEnv()
                env.seed(rank)
            elif name == 'dodge':
                env = DodgeEnv()
                env.seed(rank)
            elif name == 'upfloor':
                env = UpFloorEnv()
                env.seed(seed + rank)
            elif name == 'finddoor':
                env = FindDoorEnv()
                env.seed(rank)
            elif name == 'gather':
                env = GatherEnv()
                env.seed(rank)
            else:
                print('Invalid env name')
            #For finddoor env

            #env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i) for i in range(num_env)])
Example #7
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 env_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 initializer=None,
                 force_dummy=False):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()
    def make_thunk(rank, initializer=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            env_kwargs=env_kwargs,
            logger_dir=logger_dir,
            initializer=initializer
        )

    set_global_seeds(seed)
    if not force_dummy and num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index, initializer=initializer) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(i + start_index, initializer=None) for i in range(num_env)])
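
A minimal call could look like this (a sketch; it assumes this is the make_vec_env from baselines.common.cmd_util and that the Atari dependencies are installed):

from baselines.common.cmd_util import make_vec_env

# 8 parallel Atari workers; each subprocess derives its seed from seed and its rank
venv = make_vec_env('BreakoutNoFrameskip-v4', 'atari', num_env=8, seed=42)
obs = venv.reset()
venv.close()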
Example #8
def main():
    """Run PPO until the environment throws an exception."""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env_fns, env_names = create_envs()
    with tf.Session(config=config):
        # Take more timesteps than we need to be sure that
        # we stop due to an exception.
        ppo2.learn(policy=policies.CnnPolicy,
                   env=SubprocVecEnv(env_fns),
                   nsteps=4096,
                   nminibatches=8,
                   lam=0.95,
                   gamma=0.99,
                   noptepochs=3,
                   log_interval=1,
                   ent_coef=0.01,
                   lr=lambda _: 2e-4,
                   cliprange=lambda _: 0.1,
                   total_timesteps=int(1e9),
                   save_interval=10,
                   save_path='./checkpoints_joint_ppo2',
                   load_path=None)
Example #9
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + 10000 * mpi_rank +
                     rank if seed is not None else None)
            env = Monitor(
                env,
                logger.get_dir()
                and os.path.join(logger.get_dir(),
                                 str(mpi_rank) + '.' + str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example #10
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            # Monitor wraps the gym env and mainly adds recording of episode information (e.g. reward and length) when an episode ends.
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)

        return _thunk

    set_global_seeds(seed)

    # SubprocVecEnv runs each of the thunks (_thunk) created above in its own subprocess
    # (i + start_index) passes a different seed to each environment
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example #11
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            subrank = rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            wrapper_kwargs=wrapper_kwargs
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
Example #12
def make_cartpole_env(env_id,
                      num_env,
                      seed,
                      wrapper_kwargs=None,
                      start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for CartPole.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
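
The recurring pattern across these examples is a factory that captures rank and returns a zero-argument thunk, which SubprocVecEnv then runs in its own worker process. A self-contained sketch of that pattern (assuming gym and baselines are importable; the import path can differ between baselines versions):

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(rank, env_id='CartPole-v1', seed=0):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)  # each worker gets a distinct seed
        return env
    return _thunk

if __name__ == '__main__':  # guard needed when worker processes are spawned
    venv = SubprocVecEnv([make_env(i) for i in range(4)])
    obs = venv.reset()  # one observation per worker
    actions = [venv.action_space.sample() for _ in range(4)]
    obs, rewards, dones, infos = venv.step(actions)
    venv.close()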
Example #13
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep,
                  device, allow_early_resets):
    envs = [
        make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets)
        for i in range(num_processes)
    ]
    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
Example #14
def make_obstacle_tower(num, seed=0, show=False):
    assert ObstacleTowerEnv is not None,\
        'install https://github.com/Unity-Technologies/obstacle-tower-env'

    def make_env(rank):
        def _thunk():
            env = ObstacleTowerEnv('../ObstacleTower/obstacletower',
                                   retro=True,
                                   worker_id=rank,
                                   realtime_mode=show,
                                   config={'total-floors': 20})
            env.seed(seed + rank % 8)
            env = bench.Monitor(env, None, allow_early_resets=True)
            env = OTWrapper(env)
            env = FrameStack(env, 4)
            return env

        return _thunk

    envs = [make_env(i) for i in range(num)]
    envs = SubprocVecEnv(envs, context='fork')
    envs = VecPyTorch(envs)
    return envs
Example #15
def make_neyboy_env(env_id,
                    num_env,
                    seed,
                    wrapper_kwargs=None,
                    start_index=0,
                    allow_early_resets=False,
                    frame_skip=4,
                    save_video=False):
    """
    Create a wrapped, monitored SubprocVecEnv for Neyboy.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_neyboy_environment(env_id,
                                          seed,
                                          rank,
                                          allow_early_resets,
                                          frame_skip=frame_skip,
                                          save_video=save_video)
            # env = Cropper(env)
            env = WarpFrame(env)
            return env

        return _thunk

    set_global_seeds(seed)

    envs = [make_env(i + start_index) for i in range(num_env)]
    if num_env > 1:
        env = SubprocVecEnv(envs)
    else:
        env = DummyVecEnv(envs)

    return env
Example #16
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(
                env_id)
            env.seed(seed + 10000 * mpi_rank +
                     rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(),
                                           str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)

            if env_type == 'atari': return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1: return RewardScaler(env, reward_scale)
            else: return env

        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
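
RewardScaler above is a thin reward wrapper applied to non-Atari tasks; a sketch of what such a wrapper typically looks like (not necessarily the exact class imported in this example):

import gym

class RewardScaler(gym.RewardWrapper):
    """Scale every reward by a constant factor (often useful for PPO on continuous control)."""

    def __init__(self, env, scale=0.01):
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        return reward * self.scale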
Example #17
def make_dm_control(domain_name,
                    task_name,
                    num_env,
                    seed,
                    frame_stack,
                    vis_reward=False,
                    wrapper_kwargs={},
                    start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for dm_control.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}

    def wrap_env(seed):

        env = suite.load(domain_name,
                         task_name,
                         task_kwargs={'random': seed},
                         visualize_reward=vis_reward)
        env = pixels.Wrapper(env, pixels_only=False)
        env = MakeGym(env)
        env = WarpFrame(env, keep_obs=True)
        env = FrameStack(env, frame_stack, keep_obs=True)
        return env

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = wrap_env(seed + rank)
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example #18
def train(_):
    """Trains a PPO2 policy."""

    num_envs = 8  # number to run in parallel

    vec_env = SubprocVecEnv([(lambda _i=i: create_multiagent_env(_i))
                             for i in range(num_envs)],
                            context=None)

    # Import tensorflow after we create environments. TF is not fork-safe, and
    # we could be using TF as part of the environment if one of the players is
    # controlled by an already trained model.
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    ppo2.learn(
        network='gfootball_impala_cnn',
        total_timesteps=1e6 + 1,
        env=vec_env,
        seed=0,
        nsteps=128,
        nminibatches=8,
        noptepochs=2,
        max_grad_norm=0.64,
        gamma=0.993,
        ent_coef=0.003,
        lr=0.000343,
        log_interval=10,
        save_interval=10,
        cliprange=0.8,
        load_path=
        '/Users/stephen/Documents/football/checkpoints/11_vs_11_easy_stochastic_v2'
    )
Example #19
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  add_timestep,
                  device,
                  allow_early_resets,
                  num_frame_stack=None,
                  new_wrapper=False,
                  clip_rewards=False,
                  primitive_reward=False):
    envs = [
        make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets,
                 new_wrapper, clip_rewards, primitive_reward)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
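
A typical call from a PyTorch training script might look like this (a sketch; the argument values are illustrative, and make_env, VecPyTorch and the other wrappers are the project-specific helpers used above):

import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
envs = make_vec_envs('PongNoFrameskip-v4', seed=1, num_processes=8,
                     gamma=0.99, log_dir='/tmp/ppo_logs', add_timestep=False,
                     device=device, allow_early_resets=False)
obs = envs.reset()  # VecPyTorch returns observations as torch tensors on the chosen device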
Example #20
def test(num_env_steps, num_processes, log_dir, env_name, model_name, save_dir):
  records = []
  epoch = 0
  envs = [make_env(rank = i) for i in range(num_processes)]
  if len(envs) > 1:
    envs = SubprocVecEnv(envs)
  else:
    envs = DummyVecEnv(envs)
  try:
    state_shape = envs.observation_space.shape[0]
    action_shape = envs.action_space.shape[0]
    model = model_dict[model_name](state_shape, action_shape)
    state_dict = torch.load(os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt'))
    model.load_state_dict(state_dict)
    state = envs.reset()
    returns = 0
    for t in range(num_env_steps//num_processes):
      action, log_prob = model.act(state)
      next_state, reward, done, info = envs.step(to_np(action))
      returns += reward
      for i, d in enumerate(done):
        if d:
          records.append(returns[i])
          returns[i] = 0
          epoch += 1
      if epoch >= 100:
        break
      state = next_state
    records = np.array(records)
    print("# of epoch: {0}".format(epoch))
    print("mean: {0}".format(np.mean(records)))
    print("std: {0}".format(np.std(records)))
    print("max: {0}".format(np.max(records)))
    print("min: {0}".format(np.min(records)))
    print("median: {0}".format(np.median(records)))
  except Exception as e:
    traceback.print_exc()
  finally:
    envs.close()
Example #21
def my_make_vec_env(env_id,
                    env_type,
                    num_env,
                    seed,
                    wrapper_kwargs=None,
                    start_index=0,
                    reward_scale=1.0):
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = ProstheticsEnv(visualize=False)
            env.seed(seed + 10000 * mpi_rank +
                     rank if seed is not None else None)
            env = ForceDictObservation(env)
            env = DictToListFull(env)
            env = JSONable(env)

            env = Monitor(
                env,
                logger.get_dir()
                and os.path.join(logger.get_dir(),
                                 str(mpi_rank) + '.' + str(rank)))

            if reward_scale != 1: return RewardScaler(env, reward_scale)
            else: return env

        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
Example #22
def main():
    config = tf.ConfigProto()
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config.gpu_options.allow_growth = True

    with tf.Session(config=config):
        model.learn(
            policy=policies.A2CPolicy,
            env=SubprocVecEnv([
                env.make_train_0, env.make_train_1, env.make_train_2,
                env.make_train_3, env.make_train_4, env.make_train_5,
                env.make_train_6, env.make_train_7, env.make_train_8,
                env.make_train_9, env.make_train_10, env.make_train_11,
                env.make_train_12
            ]),
            nsteps=2048,  # Steps per environment
            total_timesteps=10000000,
            gamma=0.99,
            lam=0.95,
            vf_coef=0.5,
            ent_coef=0.01,
            lr=2e-4,
            max_grad_norm=0.5,
            log_interval=10)
Example #23
    def create_env_vec(self, env_id, seed, num_workers):

        # divide by 4 due to frameskip, then add a little extra so episodes end
        def make_env(rank):
            def _thunk():
                env = gym.make(env_id)
                env.seed(seed + rank)
                env = bench.Monitor(
                    env, self.save_path and os.path.join(
                        self.save_path, "{}.monitor.json".format(rank)))
                if env_id.startswith('CartPole') or env_id.startswith(
                        'Acrobot'):
                    env = NumpyWrapper(env)
                elif env_id.startswith('MountainCar'):
                    env = MountainCarNumpyWrapper(env)
                elif 'NoFrameskip' in env.spec.id:
                    env = wrap_deepmind(env)
                return env

            return _thunk

        set_global_seeds(seed)
        env = SubprocVecEnv([make_env(i) for i in range(num_workers)])
        return env
Example #24
    def make_env(self,
                 env_id,
                 seed,
                 id=None,
                 num_processes=None,
                 force_new=True):
        if id in self.envs:
            '''
            if force_new or env_id != self.env_id or self.num_envs != num_processes:
                self.close()
            else:
                print('env existed, use created env')
                return True
            '''
            self.close(id)
        if num_processes is None:
            num_processes = cpu_count()
        self.envs[id] = SubprocVecEnv([
            make_env(env_id, seed, rank=i, log_dir=None, visualize=False)
            for i in range(num_processes)
        ])
        print('Started! env_id:{}, seed:{}, num_processes:{}, id:{}'.format(
            env_id, seed, num_processes, id))
        return True
Example #25
def make_vec_env(env_id, env_type, num_env, seed,
                 prioritize,
                 n_active_envs,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 ):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs
        )

    set_global_seeds(seed)
    if num_env > 1:
        if prioritize:
            return ModifiedSubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)], n_active_envs=n_active_envs)
        else:
            return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
Example #26
def make_aai_env(env_directory, num_env, arenas_configurations, start_index=0):
    """
    Create a wrapped, monitored Unity environment.
    """
    def make_env(rank, arena_configuration):  # pylint: disable=C0111
        def _thunk():
            env = AnimalAIGym(
                environment_filename=env_directory,
                worker_id=rank,
                flatten_branched=True,
                arenas_configurations=arena_configuration,
                uint8_visual=True,
            )
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return env

        return _thunk

    return SubprocVecEnv([
        make_env(i + start_index, arenas_configurations)
        for i in range(num_env)
    ])
Example #27
def test_env_after_learn(algo):
    def make_env():
        env = gym.make('PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)
    network = cnn(one_dim_bias=True)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network=network,
          env=env,
          total_timesteps=0,
          load_path=None,
          seed=None)

    env.reset()
    env.close()
Example #28
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu):
    num_timesteps = int(num_frames / 4 * 1.1)

    # divide by 4 due to frameskip, then add a little extra so episodes end
    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = gym.wrappers.Monitor(env,
                                       directory='/home/vasu/Desktop/a2c_json',
                                       force=True)
            env = bench.Monitor(
                env,
                logger.get_dir() and os.path.join(
                    logger.get_dir(), "{}.monitor.json".format(rank)))
            env.reset()
            env.render()
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env)

        return _thunk

    set_global_seeds(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = LstmPolicy
    elif policy == 'lnlstm':
        policy_fn = LnLstmPolicy
    learn(policy_fn,
          env,
          seed,
          total_timesteps=num_timesteps,
          lrschedule=lrschedule)
    env.reset()
    env.close()
Example #29
    def train(self):
        #my laptop only has 8 cores and I generally use 8 actors for stuff, so make sure that the multiprocessing module doesn't try to give each actor multiple threads and make them fight
        os.environ['OMP_NUM_THREADS'] = '1'

        #make the environments and set them to run in parallel
        #thank you OpenAI for doing the multiprocessing stuff for me
        envs = [self.make_env(self.env_name, 42, n) for n in range(self.N)]
        envs = SubprocVecEnv(envs)

        obs_shape = envs.observation_space.shape

        #create policy network and set it to training mode
        entry_obs_shape = (obs_shape[0] * self.num_stack, *obs_shape[1:])
        self.policy = Policy(entry_obs_shape, envs.action_space)
        self.policy.train()

        #create storage for past actions
        rollouts = RolloutStorage()

        #set optimizer for updating the weights of our network
        optimizer = optim.Adam(self.policy.parameters(), lr=self.lr, eps=self.eps)

        #load saved weights if you can
        if os.path.isfile(self.filename):
            print("loading saved params")
            self.policy.load_state_dict(torch.load(self.filename))

        #init some variables to track how much reward we're getting
        episode_rewards = torch.zeros([self.N, 1])
        final_rewards = torch.zeros([self.N, 1])

        #init the stack
        #with most things we won't stack inputs, but having a 'num_stack' works the same as not having a stack at all so we good
        stacked_s = torch.zeros(self.N, self.num_stack * obs_shape[0], *obs_shape[1:])
        s = envs.reset()
        stacked_s = update_stacked_s(stacked_s, s, obs_shape)

        #start the training
        for iter in range(self.iters):

            #go through some timesteps
            for step in range(self.T):

                #get the predicted action and how sure the network is of taking that action
                #get the predicted value of our current state too
                with torch.no_grad():
                    a, log_p, v = self.policy(stacked_s)

                #transform the action so it's only 1 dimension
                a_np = a.squeeze(1).cpu().numpy()

                #step through the environment and observe what happens
                s2, r, done, _ = envs.step(a_np)
                #reshape the rewards so they're all in separate rows
                #each actor has its own row
                r = torch.from_numpy(r).view(-1, 1).float()
                episode_rewards += r

                #set a mask for this state
                #we'll use this to calculate returns and update the stack
                #if we're done, the mask is 0 -> this'll make returns stop accumulating at this point and it'll clear past actions from the stack so those past actions don't confuse the network
                #we should apply the mask to the stack after we've stored it (so we don't mess up the data we're currently using), so we don't do it just yet
                #I struggled with that last part for a bit, so imagine you're playing pong with frame stacking. Once the env resets, the last frames of the previous game don't affect you at all so they shouldn't be used to predict what comes next
                mask = torch.FloatTensor([[0.0] if d else [1.0] for d in done])

                #store the data from this state
                #since stacked_s is declared at a higher scope, changing its value in the training loop will change all the stored stacked_s values unless you store a copy of it instead
                rollouts.add(deepcopy(stacked_s), log_p, v, a, r, mask)

                #clears the stack if the env is done
                #there's no point in resetting the stack if there's only 1 value in it. the value will get reset in a few lines anyway so why do unnecessary math
                if self.num_stack > 1:
                    stacked_s *= mask

                #keep track of those rewards
                final_rewards *= mask
                final_rewards += (1 - mask) * episode_rewards
                episode_rewards *= mask

                #update stacked_s
                s = s2
                stacked_s = update_stacked_s(stacked_s, s, obs_shape)

            #predict one more value so we can calculate returns and advantages
            with torch.no_grad():
                next_v = self.policy.get_value(stacked_s)
            rollouts.compute_adv_and_returns(next_v, self.gamma, self.tau, self.eps)

            #optimization epochs
            for epoch in range(self.epochs):

                #get the minibatches
                data = rollouts.get_mb(self.num_mb, self.N, self.T)

                #loop through the minibatches
                for sample in data:
                    s_mb, log_p_old_mb, a_mb, returns_mb, adv_mb = sample
                    log_p_mb, v_mb, entropy = self.policy.eval_a(s_mb, a_mb)

                    #calculate the surrogate function
                    #https://arxiv.org/pdf/1707.06347.pdf
                    ratio = torch.exp(log_p_mb - log_p_old_mb)
                    f1 = ratio * adv_mb
                    f2 = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) * adv_mb

                    #calculate the loss
                    #policy loss is based on the surrogate
                    policy_loss = -torch.min(f1, f2).mean()
                    #value loss is mean squared error of the returns and the predicted values
                    value_loss = torch.pow(returns_mb - v_mb, 2).mean() * self.value_loss_coef
                    #entropy loss isn't really loss -> it subtracts from the loss to promote exploration
                    entropy_loss = (entropy * self.entropy_coef)
                    loss = policy_loss + value_loss - entropy_loss

                    #backprop and update weights
                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
                    optimizer.step()

            #clear storage
            rollouts.reset()

            #update plots
            total_num_steps = (iter + 1) * self.N * self.T

            if iter % self.vis_iter == self.vis_iter - 1:
                xs.append(total_num_steps)

                graph_rewards = final_rewards.view(1, -1)
                mean_r = graph_rewards.mean().item()
                median_r = graph_rewards.median().item()
                min_r = torch.min(graph_rewards).item()
                max_r = torch.max(graph_rewards).item()
                std_r = graph_rewards.std().item()

                medians.append(median_r)
                first_quartiles.append(np.percentile(graph_rewards.numpy(), 25))
                third_quartiles.append(np.percentile(graph_rewards.numpy(), 75))
                mins.append(min_r)
                maxes.append(max_r)
                means.append(mean_r)
                stds.append(std_r)

                losses.append(loss.item())

                self.visualizer.update_viz_median(xs, medians, first_quartiles, third_quartiles, mins, maxes, self.graph_colors, self.env_name, self.win_name)
                self.visualizer.update_viz_mean(xs, means, stds, self.graph_colors[1:], self.env_name, self.win_name)
                self.visualizer.update_viz_loss(xs, losses, self.graph_colors[2], self.env_name, self.win_name)

            #log the current data
            if iter % self.log_iter == self.log_iter - 1:
                print("iter: %d, steps: %d -> mean: %.1f, median: %.1f / min: %.1f, max: %.1f / policy loss: %.3f, value loss: %.1f, entropy loss: %.3f" % (iter + 1, total_num_steps, mean_r, median_r, min_r, max_r, policy_loss, value_loss, entropy_loss))

            #save current weights
            if iter % self.save_iter == self.save_iter - 1:
                torch.save(self.policy.state_dict(), self.filename)
                print("params saved")

        #save current weights when we're all done
        torch.save(self.policy.state_dict(), self.filename)
        print("params saved")
Example #30
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'





    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)





    # Create environment
    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # action_shape = action_shape


    # shape_dim0 = envs.observation_space.shape[0]

    # if args.cuda:
    #     dtype = torch.cuda.FloatTensor
    # else:
    #     dtype = torch.FloatTensor

    hparams = {'cuda':args.cuda,
                'num_steps':args.num_steps,
                'num_processes':args.num_processes, 
                'obs_shape':obs_shape,
                'lr':args.lr,
                'eps':args.eps, 
                'alpha':args.alpha,
                'use_gae':args.use_gae, 
                'gamma':args.gamma, 
                'tau':args.tau,
                'value_loss_coef':args.value_loss_coef, 
                'entropy_coef':args.entropy_coef}




    # Create agent
    # agent = a2c(envs, hparams)


    # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward

    if args.cuda:
        actor_critic.cuda()
        # rollouts.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])







    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)



    # Init state

    current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype)
    def update_current_state(state):#, shape_dim0):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state
        # return current_state


    state = envs.reset()

    update_current_state(state)#, shape_dim0) 
    # agent.insert_first_state(current_state)
    rollouts.states[0].copy_(current_state)
        #set the first state to current state


    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()#type(dtype)
        # if args.cuda:
        rollouts.cuda()
    #Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):

            # Act
            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width]

            # Record rewards
            # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward
            # If done then clean the history of observations.
            # these final rewards are only used for printing. but the mask is used in the storage, dont know why yet
            # oh its just clearing the env that finished, and resetting its episode_reward
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks
            # return reward, masks, final_rewards, episode_rewards, current_state




            # Update state
            update_current_state(state)#, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)



        #Optimize agent
        # agent.update()
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        # use last state to make prediction of next value



        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        #not sure what this is




        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        # this computes R =  r + r+ ...+ V(t)  for each step



        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                                                    Variable(rollouts.states[:-1].view(-1, *obs_shape)), 
                                                    Variable(rollouts.actions.view(-1, action_shape)))
        # I think this aciton log prob could have been computed and stored earlier 
        # and didnt we already store the value prediction???

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

        optimizer.step()




        rollouts.states[0].copy_(rollouts.states[-1])
        # the first state is now the last state of the previous 









        # #Save model
        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        #Print updates
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            final_rewards.mean(),
            #            final_rewards.median(),
            #            final_rewards.min(),
            #            final_rewards.max(),
            #            end - start))#, -dist_entropy.data[0],
            #            # value_loss.data[0], action_loss.data[0]))

            # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}".
            #     format(j, total_num_steps,
            #            final_rewards.min(),
            #            final_rewards.median(),
            #            final_rewards.mean(),
            #            final_rewards.max(),
            #            int(total_num_steps / (end - start)),
            #            end - start))

            if j % (args.log_interval*30) == 0:
                print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))
Example #31
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    lmdb_idx = 0
    try:
        os.makedirs(os.path.join(args.lmdb_path, args.env_name))
        os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test'))
    except:
        print('Directory already exists.')

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            # obs, reward, done, info = envs.step(cpu_actions)
            '''unwrapped obs, reward'''
            obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions)
            # sample images
            # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2)
            for img, rwd in zip(wr_obs, wr_reward):
                if rwd > 0:
                    lmdb_idx += 1
                    convert_to_lmdb(
                        img, rwd, os.path.join(args.lmdb_path, args.env_name),
                        lmdb_idx)

            # Evaluate unwrapped rewards
            # model = Model()
            # model.load(args.digit_checkpoint)
            # model.cuda()
            # accuracy = digit_eval(image, length_labels, digits_labels, model)
            # img.show()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Example #32
    def load_new_screen(self):
        self.screen = SubprocVecEnv([make_env(*self.screen_name)])
Example #33
class AtariRAMEnvironment(RawEnvironment):
    '''
    generates the necessary components from the atari environment, including the object dictionary and other components
    '''
    def __init__(self, env_id, seed, rank, log_dir):
        try:
            os.makedirs(log_dir)
        except OSError:
            pass
        self.screen_name = (env_id, seed, rank, log_dir)
        self.screen = SubprocVecEnv([make_env(env_id, seed, rank, log_dir)])
        self.num_actions = self.screen.action_space.n
        self.itr = 0
        self.save_path = ""
        self.factor_state = None
        self.reward = 0
        self.current_raw = np.squeeze(self.screen.reset())
        self.current_action = 0
        # self.focus_model.cuda()

    def load_new_screen(self):
        self.screen = SubprocVecEnv([make_env(*self.screen_name)])

    def set_save(self, itr, save_dir, recycle):
        self.save_path = save_dir
        self.itr = itr
        self.recycle = recycle
        try:
            os.makedirs(save_dir)
        except OSError:
            pass

    def step(self, action):
        # TODO: action is tensor, might not be safe assumption
        # t = time.time()
        uaction = pytorch_model.unwrap(action.long())
        raw_state, reward, done, info = self.screen.step([uaction])
        # a = time.time()
        # print("screen step", a - t)
        raw_state = np.squeeze(raw_state)
        # raw_state[:10,:] = 0.0
        self.current_raw = raw_state
        raw_factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]}
        self.current_action = action
        self.reward = reward[0]
        self.factor_state = raw_factor_state
        self.last_action = uaction

        # logging
        if len(self.save_path) > 0:
            if self.recycle > 0:
                state_path = os.path.join(
                    self.save_path, str((self.itr % self.recycle) // 2000))
                count = self.itr % self.recycle
            else:
                state_path = os.path.join(self.save_path,
                                          str(self.itr // 2000))
                count = self.itr
            try:
                os.makedirs(state_path)
            except OSError:
                pass
            if self.itr != 0:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"), 'a')
            else:
                object_dumps = open(
                    os.path.join(self.save_path, "focus_dumps.txt"),
                    'w')  # create file if it does not exist
            for key in self.factor_state.keys():
                writeable = list(self.factor_state[key][0]) + list(
                    self.factor_state[key][1])
                object_dumps.write(
                    key + ":" + " ".join([str(fs) for fs in writeable]) +
                    "\t")  # TODO: attributes are limited to single floats
            object_dumps.write(
                "\n")  # TODO: recycling does not stop object dumping

            # imio.imsave(os.path.join(state_path, "state" + str(count % 2000) + ".png"), self.current_raw)
            self.itr += 1
        # print("elapsed ", time.time() - t)
        return raw_state, self.factor_state, done

    def getState(self):
        raw_state = self.current_raw
        raw_factor_state = {'Action': self.current_action}
        if self.factor_state is None:
            factor_state = dict()
            factor_state['Action'] = raw_factor_state['Action']
            self.factor_state = factor_state
        factor_state = self.factor_state
        return raw_state, factor_state
Example #34
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

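    # run environments in worker subprocesses only when there is more than one;
    # a single environment stays in-process to avoid the subprocess overhead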
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

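    # normalize observations only for low-dimensional (non-image) state vectors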
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

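    # stack num_stack consecutive observations along the channel dimension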
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

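    # large 3D observations are treated as images (CNN policy); everything else uses the MLP policy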
    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

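    # roll the frame stack: shift old frames out and write the newest observation into the last slot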
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
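            # where an episode just ended, record its accumulated return in final_rewards and reset the running sum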
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

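            # zero the stacked observations of environments whose episode just ended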
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

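        # bootstrap from the value of the last state and compute discounted returns (with GAE if enabled)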
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
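            # normalize advantages over the whole rollout before running the PPO epochs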
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

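            # save the policy together with the VecNormalize observation statistics, if present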
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass