Example #1
def main():
  env_id = 'PongNoFrameskip-v4'
  # env_id = 'MsPacmanNoFrameskip-v4'
  # env_id = 'BreakoutNoFrameskip-v4'
  num_env = 16
  num_steps = 5
  num_batch = num_env * num_steps

  seed = 0
  env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False,
              'transpose_image': True}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

  network = ConvVAE([84, 84], 2048)

  observs = []
  actions = []
  next_observs = []

  observ = env.reset()
  observ = observ.transpose(0, 3, 2, 1)  # channel-last (N, H, W, C) -> channel-first; H == W == 84, so the H/W swap is harmless
  observ = tensor(observ)  # wrap as a tensor for the ConvVAE forward pass
  print(observ.shape)
  out = network(observ)[0]
  print(out.shape)
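VecFrameStack with four frames yields channel-last observations of shape (num_env, 84, 84, 4); the transpose above reorders them to channel-first before the ConvVAE forward pass. A standalone shape check that does not depend on the environment or on ConvVAE, as a sketch:

import numpy as np

fake_batch = np.zeros((16, 84, 84, 4), dtype=np.uint8)  # (num_env, H, W, stacked frames)
channel_first = fake_batch.transpose(0, 3, 2, 1)        # -> (16, 4, 84, 84); H == W, so the H/W swap is harmless
print(channel_first.shape)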
Example #2
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train an A2C model on an Atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
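A minimal way this helper might be invoked; the argument values are illustrative assumptions, not settings from the original:

if __name__ == '__main__':
    # all values below are placeholders
    train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lr_schedule='constant', num_env=16)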
Example #3
def play(env_id, num_timesteps, seed, policy, lr_schedule, num_env, sil_update,
         sil_beta, load_path):
    policy_fn = CnnPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': True}
    env = make_video_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args)
    env = VecFrameStack(env, 4)

    model = SelfImitationA2C.load(load_path, env=env)
    print(model.params)
    return_ = np.zeros((env.num_envs, ))
    terminals_ = np.zeros((env.num_envs, ), dtype=bool)
    print(model.env)
    observ = env.reset()
    while True:
        actions, values, states, _ = model.step(observ, None, None)
        next_observ, rewards, terminals, _ = env.step(actions)
        print(rewards)
        return_ += rewards
        terminals_ |= terminals
        # print('terminals', terminals_)
        done = True
        for terminal in terminals_.tolist():
            done &= terminal
        if done:
            break

    for mp4_file in Path('/tmp/video').glob('*.mp4'):
        if int(mp4_file.stat().st_size) < 100:
            mp4_file.unlink()
    print(return_)
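The accumulate-and-AND loop over terminals_ simply waits until every vectorized environment has terminated at least once; an equivalent one-liner, shown as a small sketch:

import numpy as np

# Sketch only: terminals_ is the running OR of per-env terminal flags, as in the loop above.
terminals_ = np.zeros((16,), dtype=bool)
done = bool(terminals_.all())  # equivalent to the explicit for-loop with `done &= terminal`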
Example #4
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env,
          sil_update, sil_beta,
          tensorboard_log, tb_log_name):
  """
  Train an A2C model on an Atari environment, for testing purposes

  :param env_id: (str) Environment ID
  :param num_timesteps: (int) The total number of samples
  :param seed: (int) The initial seed for training
  :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
  :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                               'double_linear_con', 'middle_drop' or 'double_middle_drop')
  :param num_env: (int) The number of environments
  """
  policy_fn = None
  if policy == 'cnn':
    policy_fn = CnnPolicy
  elif policy == 'lstm':
    policy_fn = CnnLstmPolicy
  elif policy == 'lnlstm':
    policy_fn = CnnLnLstmPolicy
  if policy_fn is None:
    raise ValueError("Error: policy {} not implemented".format(policy))

  env_args = {'episode_life': False, 'clip_rewards': False, 'scale': True}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

  model = SelfImitationA2C(policy_fn, env, lr_schedule=lr_schedule, tensorboard_log=tensorboard_log,
                           verbose=1, sil_update=sil_update, sil_beta=sil_beta)
  model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed, tb_log_name=tb_log_name)
  env.close()
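A sketch of how this train() could be called with the self-imitation settings; every value below is an illustrative assumption, including the TensorBoard log path:

if __name__ == '__main__':
    # all values below are placeholders
    train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e7), seed=0,
          policy='cnn', lr_schedule='constant', num_env=16,
          sil_update=4, sil_beta=0.1,
          tensorboard_log='/tmp/tb_sil_a2c', tb_log_name='sil_a2c')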
Example #5
def main():
  env_id = 'BreakoutNoFrameskip-v4'
  num_env = 5
  seed = 0
  env_args = {'episode_life': False, 'clip_rewards': False}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)
  graph = tf.Graph()
  with graph.as_default():
    sess = tf_util.make_session(graph=graph)
    with tf.variable_scope('input', reuse=False):
      input_x, process_x = observation_input(env.observation_space, num_env)
      print(env.action_space.shape)
      pdtype = make_proba_dist_type(env.action_space)
      actions_ph = pdtype.sample_placeholder([num_env], name="action_ph")
      one_hot_actions = tf.one_hot(actions_ph, env.action_space.n)
      
    print(input_x, process_x)
    print('action', actions_ph, one_hot_actions)

    beta = 0.1
    mu, sigma_sq, recons_x = build_network(process_x, one_hot_actions)
    print(mu)
    print(sigma_sq)
    print(recons_x)

    with tf.name_scope('losses'):
      recons_loss = tf.losses.mean_squared_error(input_x, recons_x, scope='recons_loss')
      kl_divergence = -tf.reduce_mean(0.5 * (tf.add(1., sigma_sq) - tf.pow(mu, 2) - tf.exp(sigma_sq)),
                                      name='kl_divergence')
      loss = tf.add(recons_loss,
                    tf.multiply(
                      kl_divergence,
                      beta), name='objective')
      print(loss)
    summary = utility.summary({recons_loss: 'recons_loss',
                               kl_divergence: 'kl_divergence',
                               mu: 'phi_mu',
                               sigma_sq: 'sigma_sq',
                               recons_x: 'recons_x',
                               input_x: 'input_x',
                               }, env.observation_space.shape)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5)
    train_op = optimizer.minimize(loss)

    for event_file in LOG_DIR.glob('event*'):
      event_file.unlink()
    writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
    sess.run(tf.global_variables_initializer())

    observ = env.reset()
    actions = [env.action_space.sample() for _ in range(num_env)]
    print(env.observation_space)
    print(observ.shape)

    recons_image, summary_ = sess.run([recons_x, summary],
                                      feed_dict={input_x: observ,
                                                 actions_ph: actions})
    writer.add_summary(summary_, 0)
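The objective above is reconstruction error plus beta times the diagonal-Gaussian KL to a unit prior, with sigma_sq playing the role of the log-variance (as the exp() in the formula suggests). The same arithmetic in plain NumPy, as a sketch for sanity-checking the TensorFlow graph:

import numpy as np

def beta_vae_loss(x, recons_x, mu, log_sigma_sq, beta=0.1):
    # reconstruction term: mean squared error between input and reconstruction
    recons = np.mean((x - recons_x) ** 2)
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior, averaged over the batch
    kl = -np.mean(0.5 * (1.0 + log_sigma_sq - np.square(mu) - np.exp(log_sigma_sq)))
    return recons + beta * kl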
Example #6
def train(env_id, num_timesteps, seed, num_cpu):
    """
    Train an ACKTR model on Atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
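An illustrative call with assumed values:

train(env_id='BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0, num_cpu=4)  # placeholder values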
Example #7
File: enjoy_pad.py  Project: drwxyh/pad
def test(env_id, seed, policy):
    """
    Run a trained PPO2 model on the PAD environment and render it, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Need for lstm
    # else:
    #     env = PadEnvRender()

    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0

        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
            if done:
                print('Episode reward:', episode_rew)
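env_id, seed and policy are accepted but not used to build the environment, which comes straight from PadEnvRender; an illustrative call with assumed values that replays the saved ./pad_5combo_ppo2.pkl checkpoint:

test(env_id='pad', seed=0, policy='cnn')  # argument values are placeholders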
Example #8
def train(env_id, num_timesteps, seed, policy):
    """
    Train a PPO2 model on an Atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = VecFrameStack(make_atari_env(env_id, 8, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=128,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
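learning_rate and cliprange are passed as callables of the remaining-training fraction, which stable-baselines anneals from 1.0 at the start toward 0.0 at the end, so both decay linearly to zero. A tiny sketch of the resulting values (the fractions below are illustrative):

# Assumption consistent with the stable-baselines schedule convention: f is the remaining-training fraction.
learning_rate = lambda f: f * 2.5e-4
cliprange = lambda f: f * 0.1
for f in (1.0, 0.5, 0.0):
    print(learning_rate(f), cliprange(f))  # both decay linearly to zero over training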
Example #9
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        if "num_population" in args.__dict__:
            args.num_cpu = args.num_population * 2

        assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
            "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(args.num_cpu, args.env,
                                             env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        envs = [
            makeEnv(args.env,
                    args.seed,
                    i,
                    args.log_dir,
                    allow_early_resets=True,
                    env_kwargs=env_kwargs) for i in range(args.num_cpu)
        ]
        envs = SubprocVecEnv(envs)
        envs = VecFrameStack(envs, args.num_stack)
        if args.srl_model != "raw_pixels" and args.algo_type == "v2":
            envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
            envs = loadRunningAverage(envs,
                                      load_path_normalise=load_path_normalise)
        return envs
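makeEnv layers subprocess workers, frame stacking, and (for some configurations) observation normalization; the sketch below shows that generic stable-baselines wrapper stack on a toy Gym environment, leaving out the project-specific SRL plumbing. Names and values here are illustrative, not taken from the original:

import gym
from stable_baselines.common.vec_env import SubprocVecEnv, VecFrameStack, VecNormalize

def make_env(env_id, seed, rank):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init

if __name__ == '__main__':
    envs = SubprocVecEnv([make_env('CartPole-v1', seed=0, rank=i) for i in range(4)])
    envs = VecFrameStack(envs, n_stack=4)
    envs = VecNormalize(envs, norm_obs=True, norm_reward=False)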
Example #10
def train(env_id,
          num_timesteps,
          seed,
          policy,
          n_envs=8,
          nminibatches=4,
          n_steps=128):
    """
    Train a PPO2 model on an Atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    """

    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    model.learn(total_timesteps=num_timesteps)
    del model
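As the docstring notes, recurrent policies need the number of parallel environments to be divisible by nminibatches; an example call that respects that constraint (all values are assumptions for illustration):

n_envs, nminibatches = 8, 4  # placeholder values
assert n_envs % nminibatches == 0, 'recurrent policies need n_envs to be a multiple of nminibatches'
train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
      policy='lstm', n_envs=n_envs, nminibatches=nminibatches)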
Example #11
File: run_pad.py  Project: drwxyh/pad
def train(env_id, num_timesteps, seed, policy):
    """
    Train a PPO2 model on the PAD environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    env = Monitor(PadEnv(), './logs', allow_early_resets=True)
    # NOTE: every lambda below closes over the same Monitor-wrapped instance
    env = DummyVecEnv([lambda: env for _ in range(16)])
    env = VecFrameStack(env, 8)
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=256,
                 nminibatches=4,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # model = model.load('./pad_4combo_ppo2.pkl', env)
    try:
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print('Keyboard Interrupted')

    model.save('./pad_5combo_ppo2.pkl')
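The environment is built directly from PadEnv, so env_id and seed are effectively unused here; an illustrative call with assumed values:

# env_id and seed are placeholders; the environment itself comes from PadEnv above
train(env_id='pad', num_timesteps=int(1e7), seed=0, policy='cnn')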
Example #12
def main():
    beta = 0

    env_id = 'MsPacmanNoFrameskip-v4'
    num_env = 16
    num_steps = 5
    num_batch = num_env * num_steps

    seed = 0
    env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False}
    env = VecFrameStack(
        make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf_util.make_session(graph=graph)
        policy = ReconstructionModule(sess, env.observation_space,
                                      env.action_space, num_batch)

        def save(save_path: Path, params):
            data = {
                'policy': ReconstructionModule,
            }
            params = sess.run(params)
            _save_to_file(save_path, data=data, params=params)

        print(policy.mu)
        print(policy.log_sigma_sq)
        print(policy.recons_x)

        params = find_trainable_variables('model')
        tf.global_variables_initializer().run(session=sess)

        def load(load_path: Path):
            _data, load_params = _load_from_file(load_path)
            restores = []
            for param, load_param in zip(params, load_params):
                restores.append(param.assign(load_param))
            sess.run(restores)

        with tf.name_scope('losses'):
            recons_losses = tf.squared_difference(policy.next_process_x,
                                                  policy.recons_x)
            recons_loss = tf.reduce_mean(recons_losses,
                                         name='reconstruction_loss')

        summary = utility.summary(
            {
                policy.capacity_ph: 'capacity',
                recons_losses: 'recons_losses',
                policy.process_x: 'process_x',
                policy.next_process_x: 'next_process_x',
                policy.recons_x: 'recons_x',
            },
            env.observation_space.shape,
            ignore=['recons_x', 'recons_losses', 'process_x'])
        # optimizer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=0.99, epsilon=1e-5)
        optimizer = tf.train.AdamOptimizer(5e-4)
        train_op = optimizer.minimize(recons_loss)

        for event_file in LOG_DIR.glob('event*'):
            event_file.unlink()
        writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
        sess.run(tf.global_variables_initializer())

        observs = []
        actions = []
        next_observs = []

        observ = env.reset()
        global_step = 0
        while True:
            if global_step > 100_000:
                break
            print('\rGlobal Step {}/{}'.format(global_step, 100_000 + 1),
                  end='',
                  flush=True)
            action = [env.action_space.sample() for _ in range(num_env)]
            next_observ, rewards, terminals, _ = env.step(action)

            observs.extend(observ)
            actions.extend(action)
            next_observs.extend(next_observ)

            observ = next_observ
            global_step += num_env

            if len(observs) == num_batch:
                feed_dict = {
                    policy.input_x: np.asarray(observs),
                    policy.next_input_x: np.asarray(next_observs),
                    policy.actions_ph: np.asarray(actions),
                    policy.capacity_ph: _calculate_encoding_capacity(global_step),
                }
                if global_step % (5 * num_batch) == 0:
                    summary_, _ = sess.run([summary, train_op],
                                           feed_dict=feed_dict)

                    writer.add_summary(summary_, global_step)
                else:
                    _ = sess.run([train_op], feed_dict=feed_dict)

                observs = []
                actions = []
                next_observs = []

        save(LOG_DIR, params)
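_calculate_encoding_capacity is not shown in this example; in capacity-annealed beta-VAE training it is commonly a linear ramp of the target KL capacity over training steps. A hypothetical implementation, purely as a sketch:

def _calculate_encoding_capacity(step, max_capacity=25.0, anneal_steps=100_000):
    # Hypothetical: linearly raise the target KL capacity from 0 to max_capacity, then hold it constant.
    return min(max_capacity, max_capacity * step / anneal_steps)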
Example #13
def main():
  beta = 750

  # env_id = 'PongNoFrameskip-v4'
  env_id = 'MsPacmanNoFrameskip-v4'
  # env_id = 'BreakoutNoFrameskip-v4'
  num_env = 16
  num_steps = 5
  num_batch = num_env * num_steps

  seed = 0
  env_args = {'episode_life': False, 'clip_rewards': False, 'scale': False}
  env = VecFrameStack(make_atari_env(env_id, num_env, seed, wrapper_kwargs=env_args), 4)

  graph = tf.Graph()
  with graph.as_default():
    sess = tf_util.make_session(graph=graph)
    policy = ReconstructionModule(sess,
                                  env.observation_space,
                                  env.action_space,
                                  num_batch, use_batch_norm=True)

    def save(save_path: Path, params):
      data = {
        'policy': ReconstructionModule,
      }
      params = sess.run(params)
      _save_to_file(save_path, data=data, params=params)

    print(policy.mu)
    print(policy.log_sigma_sq)
    print(policy.recons_x)

    params = find_trainable_variables('model')
    tf.global_variables_initializer().run(session=sess)

    def load(load_path: Path):
      _data, load_params = _load_from_file(load_path)
      restores = []
      for param, load_param in zip(params, load_params):
        restores.append(param.assign(load_param))
      sess.run(restores)

    with tf.name_scope('losses'):
      recons_losses = tf.squared_difference(policy.next_process_x,
                                            policy.recons_x)
      recons_loss = tf.reduce_mean(recons_losses, name='reconstruction_loss')
      kl_divergences = -0.5 * (tf.add(1., policy.log_sigma_sq) - tf.square(policy.mu) - tf.exp(policy.log_sigma_sq))
      kl_divergence = tf.reduce_mean(kl_divergences, name='kl_divergence')
      coefed_kl = tf.multiply(tf.abs(kl_divergence - policy.capacity_ph), beta)
      loss = tf.add(recons_loss, coefed_kl, name='objective')
      # loss = tf.add(recons_loss,
      #               tf.multiply(
      #                 kl_divergence, beta), name='objective')

    summary = utility.summary({
      loss: 'loss',
      policy.capacity_ph: 'capacity',
      kl_divergences: 'kl_divergences',
      recons_losses: 'recons_losses',
      policy.process_x: 'process_x',
      policy.next_process_x: 'next_process_x',
      policy.mu: 'phi_mu',
      policy.log_sigma_sq: 'log_sigma_sq',
      policy.recons_x: 'recons_x',
      coefed_kl: 'coefed_KL',

    }, env.observation_space.shape,
      ignore=['recons_x', 'recons_losses', 'process_x'])
    optimizer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=0.99, epsilon=1e-5)
    # optimizer = tf.train.AdamOptimizer(5e-5)
    train_op = optimizer.minimize(loss)

    for event_file in LOG_DIR.glob('event*'):
      event_file.unlink()
    writer = tf.summary.FileWriter(LOG_DIR.as_posix(), sess.graph)
    sess.run(tf.global_variables_initializer())

    observs = []
    actions = []
    next_observs = []

    observ = env.reset()
    for step in tqdm(range(1, 2_000_000 + 1, num_env)):
      action = [env.action_space.sample() for _ in range(num_env)]
      next_observ, rewards, terminals, _ = env.step(action)

      observs.extend(observ)
      actions.extend(action)
      next_observs.extend(next_observ)

      observ = next_observ

      if len(observs) == num_batch:
        feed_dict = {policy.input_x: np.asarray(observs),
                     policy.next_input_x: np.asarray(next_observs),
                     policy.actions_ph: np.asarray(actions),
                     policy.capacity_ph: _calculate_encoding_capacity(step),
                     }
        if (step // num_env + 1) % 1000 == 0:
          summary_, _ = sess.run([summary, train_op], feed_dict=feed_dict)
          writer.add_summary(summary_, step)
        else:
          _ = sess.run([train_op], feed_dict=feed_dict)

        observs = []
        actions = []
        next_observs = []

    save(LOG_DIR, params)
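In the capacity-constrained objective above, the KL penalty beta * |KL - C| vanishes when the average KL matches the target capacity C fed through policy.capacity_ph and grows linearly as it drifts away; a small NumPy illustration with assumed numbers:

import numpy as np

beta = 750.0       # matching the coefficient used above
capacity = 5.0     # illustrative target capacity in nats
for kl in (5.0, 4.0, 7.5):
    penalty = beta * np.abs(kl - capacity)
    print(kl, penalty)  # zero at the target capacity, beta per nat of deviation otherwise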