Example #1
def main(_):
  """Run td3/ddpg evaluation."""
  contrib_eager_python_tfe.enable_eager_execution()

  if FLAGS.use_gpu:
    tf.device('/device:GPU:0').__enter__()

  tf.gfile.MakeDirs(FLAGS.log_dir)
  summary_writer = contrib_summary.create_file_writer(
      FLAGS.log_dir, flush_millis=10000)

  env = gym.make(FLAGS.env)
  if FLAGS.wrap_for_absorbing:
    env = lfd_envs.AbsorbingWrapper(env)

  obs_shape = env.observation_space.shape
  act_shape = env.action_space.shape

  with tf.variable_scope('actor'):
    actor = Actor(obs_shape[0], act_shape[0])

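  # Baseline reward of a randomly acting policy; used below to normalize the
  # evaluation reward when a non-default reward_scale has been saved.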
  random_reward, _ = do_rollout(
      env, actor, None, num_trajectories=10, sample_random=True)

  reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
  saver = contrib_eager_python_tfe.Saver(actor.variables + [reward_scale])

  last_checkpoint = tf.train.latest_checkpoint(FLAGS.load_dir)
  with summary_writer.as_default():
    while True:
      last_checkpoint = wait_for_next_checkpoint(FLAGS.load_dir,
                                                 last_checkpoint)

      total_numsteps = int(last_checkpoint.split('-')[-1])

      saver.restore(last_checkpoint)

      average_reward, average_length = do_rollout(
          env, actor, None, noise_scale=0.0, num_trajectories=FLAGS.num_trials)

      logging.info(
          'Evaluation: average episode length %d, average episode reward %f',
          average_length, average_reward)

      print('Evaluation: average episode length {}, average episode reward {}'.
            format(average_length, average_reward))

      with contrib_summary.always_record_summaries():
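        # Rescale so that the random-policy baseline maps to 0 and
        # reward_scale maps to 1.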
        if reward_scale.numpy() != 1.0:
          contrib_summary.scalar(
              'reward/scaled', (average_reward - random_reward) /
              (reward_scale.numpy() - random_reward),
              step=total_numsteps)
        contrib_summary.scalar('reward', average_reward, step=total_numsteps)
        contrib_summary.scalar('length', average_length, step=total_numsteps)
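Example #1 wraps the environment in lfd_envs.AbsorbingWrapper, whose implementation is not included in these snippets. The sketch below only illustrates the general idea described in Example #2's docstring: observations are augmented with an absorbing-state indicator so that a finished episode can be continued in a zero-reward, self-looping absorbing state. The class name and every detail here are illustrative assumptions, not the library's actual code; a Box observation space and the classic gym API (step returning a 4-tuple) are assumed.

import gym
import numpy as np


class AbsorbingWrapperSketch(gym.Wrapper):
  """Illustrative sketch only: adds an absorbing-state indicator dimension."""

  def __init__(self, env):
    super(AbsorbingWrapperSketch, self).__init__(env)
    # Assumes a Box observation space (e.g. MuJoCo tasks).
    low = np.concatenate([env.observation_space.low, [0.0]])
    high = np.concatenate([env.observation_space.high, [1.0]])
    self.observation_space = gym.spaces.Box(
        low=low, high=high, dtype=np.float32)

  def _augment(self, obs, indicator):
    # Regular states carry indicator 0.0; the absorbing state carries 1.0.
    return np.concatenate([obs, [indicator]]).astype(np.float32)

  def get_absorbing_state(self):
    # All-zero observation with the indicator set: once an episode has
    # terminated, this state loops on itself with zero reward.
    return self._augment(np.zeros(self.env.observation_space.shape), 1.0)

  def reset(self, **kwargs):
    return self._augment(self.env.reset(**kwargs), 0.0)

  def step(self, action):
    obs, reward, done, info = self.env.step(action)
    return self._augment(obs, 0.0), reward, done, info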
Example #2
    def __init__(self,
                 input_dim,
                 action_dim,
                 discount=0.99,
                 tau=0.005,
                 actor_lr=1e-3,
                 critic_lr=1e-3,
                 use_td3=True,
                 policy_noise=0.2,
                 policy_noise_clip=0.5,
                 policy_update_freq=2,
                 get_reward=None,
                 use_absorbing_state=False):
        """Initializes actor, critic, target networks and optimizers.

        The class handles the absorbing state properly. An absorbing state is
        the state a policy ends up in after reaching a goal state, and it stays
        there forever. For most RL problems we can simply assign zero reward to
        everything after the goal, but for GAIL we need an actual absorbing
        state.

        Args:
          input_dim: size of the observation space.
          action_dim: size of the action space.
          discount: reward discount.
          tau: target networks update coefficient.
          actor_lr: actor learning rate.
          critic_lr: critic learning rate.
          use_td3: whether to use standard ddpg or td3.
          policy_noise: std of the Gaussian noise added to the critic's action
            input.
          policy_noise_clip: magnitude at which to clip the added Gaussian
            noise.
          policy_update_freq: perform a policy update once per n steps.
          get_reward: a function that, given (s, a, s'), returns a reward.
          use_absorbing_state: whether to use an absorbing state or not.
        """
        self.discount = discount
        self.tau = tau

        self.use_td3 = use_td3
        self.policy_noise = policy_noise
        self.policy_noise_clip = policy_noise_clip
        self.policy_update_freq = policy_update_freq
        self.get_reward = get_reward
        self.use_absorbing_state = use_absorbing_state

        with tf.variable_scope('actor'):
            self.actor = Actor(input_dim, action_dim)
            with tf.variable_scope('target'):
                self.actor_target = Actor(input_dim, action_dim)

            self.initial_actor_lr = actor_lr
            self.actor_lr = contrib_eager_python_tfe.Variable(actor_lr,
                                                              name='lr')
            self.actor_step = contrib_eager_python_tfe.Variable(0,
                                                                dtype=tf.int64,
                                                                name='step')
            self.actor_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.actor_lr)
            self.actor_optimizer._create_slots(self.actor.variables)  # pylint: disable=protected-access

        soft_update(self.actor.variables, self.actor_target.variables)

        with tf.variable_scope('critic'):
            if self.use_td3:
                self.critic = CriticTD3(input_dim + action_dim)
                with tf.variable_scope('target'):
                    self.critic_target = CriticTD3(input_dim + action_dim)
            else:
                self.critic = CriticDDPG(input_dim + action_dim)
                with tf.variable_scope('target'):
                    self.critic_target = CriticDDPG(input_dim + action_dim)

            self.critic_step = contrib_eager_python_tfe.Variable(
                0, dtype=tf.int64, name='step')
            self.critic_optimizer = tf.train.AdamOptimizer(
                learning_rate=critic_lr)
            self.critic_optimizer._create_slots(self.critic.variables)  # pylint: disable=protected-access

        soft_update(self.critic.variables, self.critic_target.variables)
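soft_update is called above but not defined in this snippet. Below is a minimal sketch, assuming it performs Polyak averaging of the target-network weights with coefficient tau (tau=1.0, as in the calls right after construction, simply copies the online weights into the targets); the signature and default value are assumptions for illustration.

def soft_update(variables, target_variables, tau=1.0):
    """Sketch (assumed behavior): Polyak averaging of target-network weights."""
    for var, target_var in zip(variables, target_variables):
        # tau=1.0 copies the online weights; a small tau (e.g. 0.005) makes
        # the target network track the online network slowly during training.
        target_var.assign(tau * var + (1.0 - tau) * target_var)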
Example #3
def main(_):
    """Run td3/ddpg training."""
    tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    if FLAGS.expert_dir.find(FLAGS.env) == -1:
        raise ValueError('Expert directory must contain the environment name')

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

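    # The expert replay buffer is pickled into a string Variable so it can be
    # written and restored with a regular checkpoint Saver.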
    expert_replay_buffer_var = tfe.Variable('', name='expert_replay_buffer')

    saver = tfe.Saver([expert_replay_buffer_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    with tf.variable_scope('actor'):
        actor = Actor(obs_shape[0], act_shape[0])
    expert_saver = tfe.Saver(actor.variables)

    best_checkpoint = None
    best_reward = float('-inf')

    checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir)

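    # Evaluate every expert checkpoint and keep the one with the highest
    # average reward over 10 rollouts.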
    for checkpoint in checkpoint_state.all_model_checkpoint_paths:
        expert_saver.restore(checkpoint)
        expert_reward, _ = do_rollout(env,
                                      actor,
                                      replay_buffer=None,
                                      noise_scale=0.0,
                                      num_trajectories=10)

        if expert_reward > best_reward:
            best_reward = expert_reward
            best_checkpoint = checkpoint

    expert_saver.restore(best_checkpoint)

    expert_replay_buffer = ReplayBuffer()
    expert_reward, _ = do_rollout(
        env,
        actor,
        replay_buffer=expert_replay_buffer,
        noise_scale=0.0,
        num_trajectories=FLAGS.num_expert_trajectories)

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer))
    saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))
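
The script serializes the expert ReplayBuffer into a string Variable and writes it with tfe.Saver. As a usage sketch, a downstream training script could recover it roughly as follows, assuming the same variable name and the prefix passed to saver.save above; the helper name is hypothetical and not part of these snippets.

def load_expert_replay_buffer(checkpoint_prefix):
    """Hypothetical helper: recovers the pickled ReplayBuffer saved above."""
    expert_replay_buffer_var = tfe.Variable('', name='expert_replay_buffer')
    saver = tfe.Saver([expert_replay_buffer_var])
    saver.restore(checkpoint_prefix)
    return pickle.loads(expert_replay_buffer_var.numpy())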