Example #1
  def testCallingEnableEagerExecutionMoreThanOnce(self):
    # Note that eager.test.main() has already invoked enable_eager_execution().
    with self.assertRaisesRegexp(
        ValueError,
        r"Do not call tfe\.%s more than once in the same process" %
        tfe.enable_eager_execution.__name__):
      tfe.enable_eager_execution()
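Since enable_eager_execution() raises a ValueError on a second call, a call site that might run more than once can guard on the current mode instead. A minimal sketch, assuming TF 1.7+ where both functions are exposed in the core namespace:

import tensorflow as tf

# Guard sketch (assumes TF 1.7+): enable eager mode only if it is not
# already active, so this line is safe to reach more than once.
if not tf.executing_eagerly():
  tf.enable_eager_execution()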
Example #2
def main(_):
  """Run td3/ddpg evaluation."""
  contrib_eager_python_tfe.enable_eager_execution()

  if FLAGS.use_gpu:
    tf.device('/device:GPU:0').__enter__()

  tf.gfile.MakeDirs(FLAGS.log_dir)
  summary_writer = contrib_summary.create_file_writer(
      FLAGS.log_dir, flush_millis=10000)

  env = gym.make(FLAGS.env)
  if FLAGS.wrap_for_absorbing:
    env = lfd_envs.AbsorbingWrapper(env)

  obs_shape = env.observation_space.shape
  act_shape = env.action_space.shape

  with tf.variable_scope('actor'):
    actor = Actor(obs_shape[0], act_shape[0])

  random_reward, _ = do_rollout(
      env, actor, None, num_trajectories=10, sample_random=True)

  reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
  saver = contrib_eager_python_tfe.Saver(actor.variables + [reward_scale])

  last_checkpoint = tf.train.latest_checkpoint(FLAGS.load_dir)
  with summary_writer.as_default():
    while True:
      last_checkpoint = wait_for_next_checkpoint(FLAGS.load_dir,
                                                 last_checkpoint)

      total_numsteps = int(last_checkpoint.split('-')[-1])

      saver.restore(last_checkpoint)

      average_reward, average_length = do_rollout(
          env, actor, None, noise_scale=0.0, num_trajectories=FLAGS.num_trials)

      logging.info(
          'Evaluation: average episode length %d, average episode reward %f',
          average_length, average_reward)

      print('Evaluation: average episode length {}, average episode reward {}'.
            format(average_length, average_reward))

      with contrib_summary.always_record_summaries():
        if reward_scale.numpy() != 1.0:
          contrib_summary.scalar(
              'reward/scaled', (average_reward - random_reward) /
              (reward_scale.numpy() - random_reward),
              step=total_numsteps)
        contrib_summary.scalar('reward', average_reward, step=total_numsteps)
        contrib_summary.scalar('length', average_length, step=total_numsteps)
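The evaluation loop above depends on a wait_for_next_checkpoint helper defined elsewhere in the file. A plausible polling sketch (a hypothetical reconstruction, not the project's actual helper), built on tf.train.latest_checkpoint:

import time

import tensorflow as tf


def wait_for_next_checkpoint(load_dir, last_checkpoint, poll_secs=10):
  """Block until a checkpoint newer than `last_checkpoint` appears."""
  while True:
    checkpoint = tf.train.latest_checkpoint(load_dir)
    if checkpoint is not None and checkpoint != last_checkpoint:
      return checkpoint
    time.sleep(poll_secs)  # the poll interval is an assumption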
Example #3
def main(_):
  tfe.enable_eager_execution()

  if not FLAGS.data_path:
    raise ValueError("Must specify --data-path")
  corpus = Datasets(FLAGS.data_path)
  train_data = _divide_into_batches(corpus.train, FLAGS.batch_size)
  eval_data = _divide_into_batches(corpus.valid, 10)

  have_gpu = tfe.num_gpus() > 0
  use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu

  with tf.device("/device:GPU:0" if have_gpu else None):
    # Make learning_rate a Variable so it can be included in the checkpoint
    # and we can resume training with the last saved learning_rate.
    learning_rate = tfe.Variable(20.0, name="learning_rate")
    model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
                     FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
                     use_cudnn_rnn)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    checkpoint = tfe.Checkpoint(
        learning_rate=learning_rate, model=model,
        # GradientDescentOptimizer has no state to checkpoint, but noting it
        # here lets us swap in an optimizer that does.
        optimizer=optimizer)
    # Restore existing variables now (learning_rate), and restore new variables
    # on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir))
    sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())

    best_loss = None
    for _ in range(FLAGS.epoch):
      train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
      eval_loss = evaluate(model, eval_data)
      if not best_loss or eval_loss < best_loss:
        if FLAGS.logdir:
          checkpoint.save(os.path.join(FLAGS.logdir, "ckpt"))
        best_loss = eval_loss
      else:
        learning_rate.assign(learning_rate / 4.0)
        sys.stderr.write("eval_loss did not reduce in this epoch, "
                         "changing learning rate to %f for the next epoch\n" %
                         learning_rate.numpy())
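Examples #3 and #5 both lean on a batching helper (_divide_into_batches / _batchify) that reshapes the flat token stream into batch-major form. A sketch of the usual PTB batching trick, assuming `data` is a 1-D numpy array of token ids:

import numpy as np


def _divide_into_batches(data, batch_size):
  """Reshape a 1-D token stream into a [sequence, batch] matrix.

  Sketch only; trailing tokens that do not fill a whole batch are dropped.
  """
  nbatch = data.shape[0] // batch_size
  data = data[:nbatch * batch_size]
  return data.reshape(batch_size, -1).transpose()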
Example #5
def main(_):
    tfe.enable_eager_execution()

    if not FLAGS.data_path:
        raise ValueError("Must specify --data_path")
    corpus = Corpus(FLAGS.data_path)
    # TODO(ashankar): Remove _batchify and _get_batch and use the Datasets API
    # instead.
    train_data = _batchify(corpus.train, FLAGS.batch_size)
    eval_data = _batchify(corpus.valid, 10)

    have_gpu = tfe.num_gpus() > 0
    use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu

    with tfe.restore_variables_on_create(
            tf.train.latest_checkpoint(FLAGS.logdir)):
        with tf.device("/device:GPU:0" if have_gpu else None):
            # Make learning_rate a Variable so it can be included in the checkpoint
            # and we can resume training with the last saved learning_rate.
            learning_rate = tfe.Variable(20.0, name="learning_rate")
            sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
            model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
                             FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
                             use_cudnn_rnn)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)

            best_loss = None
            for _ in range(FLAGS.epoch):
                train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
                eval_loss = evaluate(model, eval_data)
                if not best_loss or eval_loss < best_loss:
                    if FLAGS.logdir:
                        tfe.Saver(model.trainable_weights +
                                  [learning_rate]).save(
                                      os.path.join(FLAGS.logdir, "ckpt"))
                    best_loss = eval_loss
                else:
                    learning_rate.assign(learning_rate / 4.0)
                    sys.stderr.write(
                        "eval_loss did not reduce in this epoch, "
                        "changing learning rate to %f for the next epoch\n" %
                        learning_rate.numpy())
Example #6
import tensorflow as tf
import numpy as np
import sys

from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.tpu import tpu_trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics

# Enable TF Eager execution
from tensorflow.contrib.eager.python import tfe
tfe.enable_eager_execution()

# Other setup
Modes = tf.estimator.ModeKeys

ckpt_path = sys.argv[1]
fin_name = sys.argv[2]
fout_name = sys.argv[3]

# Fetch the problem
ende_problem = problems.problem("translate_ende_wmt32k")

# Get the encoders from the problem
encoders = ende_problem.feature_encoders(ckpt_path)

# Setup helper functions for encoding and decoding
def encode(input_str, output_str=None):
  """Input str to features dict, ready for inference."""
  # NOTE: the original snippet is truncated here; this body follows the
  # standard tensor2tensor translation demo and may differ from the source.
  inputs = encoders["inputs"].encode(input_str) + [1]  # add EOS id
  batch_inputs = tf.reshape(inputs, [1, -1, 1])  # make it 3D
  return {"inputs": batch_inputs}
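The listing is cut off after the encoder helper. For completeness, the companion decoder in the same tensor2tensor demo usually inverts the mapping; a sketch (assuming EOS id 1, as in encode above):

def decode(integers):
  """Token ids back to a string; sketch of the usual companion helper."""
  integers = list(np.squeeze(integers))
  if 1 in integers:  # truncate at EOS
    integers = integers[:integers.index(1)]
  return encoders["inputs"].decode(integers)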
Example #7
    # (The snippet begins mid-test in the original listing; the enclosing
    # `with` line below is reconstructed from the parallel assertions.)
    with self.assertRaisesRegexp(
        RuntimeError,
        r'add_check_numerics_ops\(\) is not compatible with eager execution'):
      numerics.add_check_numerics_ops()

  def testClassicSummaryOpsErrorOut(self):
    x = constant_op.constant(42)
    x_summary = summary.scalar('x', x)
    y = constant_op.constant([1, 3, 3, 7])
    y_summary = summary.histogram('hist', y)

    with self.assertRaisesRegexp(
        RuntimeError,
        r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
      summary.merge([x_summary, y_summary])

    with self.assertRaisesRegexp(
        RuntimeError,
        r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
      summary.merge_all()

  def testClassicSummaryFileWriterErrorsOut(self):
    with self.assertRaisesRegexp(
        RuntimeError,
        r'tf\.summary\.FileWriter is not compatible with eager execution'):
      writer.FileWriter(tempfile.mkdtemp())


if __name__ == '__main__':
  tfe.enable_eager_execution()
  test.main()
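These tests confirm that the classic graph-mode summary ops refuse to run eagerly; Examples #2 and #8 show the eager-compatible path through tf.contrib.summary. A condensed sketch of that pattern:

from tensorflow.contrib import summary as contrib_summary

# Eager-friendly summaries: a file writer plus an explicit recording policy.
writer = contrib_summary.create_file_writer('/tmp/logs', flush_millis=10000)
with writer.as_default(), contrib_summary.always_record_summaries():
  contrib_summary.scalar('reward', 1.0, step=0)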
Example #8
def main(_):
    """Run td3/ddpg training."""
    contrib_eager_python_tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    tf.gfile.MakeDirs(FLAGS.log_dir)
    summary_writer = contrib_summary.create_file_writer(FLAGS.log_dir,
                                                        flush_millis=10000)

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
        rand_actions = int(1e4)
    else:
        rand_actions = int(1e3)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    if FLAGS.algo == 'td3':
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=True,
                              policy_update_freq=2,
                              actor_lr=1e-3)
    else:
        model = ddpg_td3.DDPG(obs_shape[0],
                              act_shape[0],
                              use_td3=False,
                              policy_update_freq=1,
                              actor_lr=1e-4)

    replay_buffer_var = contrib_eager_python_tfe.Variable('',
                                                          name='replay_buffer')
    gym_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='gym_random_state')
    np_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='np_random_state')
    py_random_state_var = contrib_eager_python_tfe.Variable(
        '', name='py_random_state')

    saver = contrib_eager_python_tfe.Saver(
        model.variables + [replay_buffer_var] +
        [gym_random_state_var, np_random_state_var, py_random_state_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')
    eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables +
                                                [reward_scale])
    tf.gfile.MakeDirs(FLAGS.eval_save_dir)

    last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
    if last_checkpoint is None:
        replay_buffer = ReplayBuffer()
        total_numsteps = 0
        prev_save_timestep = 0
        prev_eval_save_timestep = 0
    else:
        saver.restore(last_checkpoint)
        replay_buffer = pickle.loads(zlib.decompress(
            replay_buffer_var.numpy()))
        total_numsteps = int(last_checkpoint.split('-')[-1])
        assert len(replay_buffer) == total_numsteps
        prev_save_timestep = total_numsteps
        prev_eval_save_timestep = total_numsteps
        env.unwrapped.np_random.set_state(
            pickle.loads(gym_random_state_var.numpy()))
        np.random.set_state(pickle.loads(np_random_state_var.numpy()))
        random.setstate(pickle.loads(py_random_state_var.numpy()))

    with summary_writer.as_default():
        while total_numsteps < FLAGS.training_steps:
            rollout_reward, rollout_timesteps = do_rollout(
                env,
                model.actor,
                replay_buffer,
                noise_scale=FLAGS.exploration_noise,
                rand_actions=rand_actions)
            total_numsteps += rollout_timesteps

            logging.info('Training: total timesteps %d, episode reward %f',
                         total_numsteps, rollout_reward)

            print('Training: total timesteps {}, episode reward {}'.format(
                total_numsteps, rollout_reward))

            with contrib_summary.always_record_summaries():
                contrib_summary.scalar('reward',
                                       rollout_reward,
                                       step=total_numsteps)
                contrib_summary.scalar('length',
                                       rollout_timesteps,
                                       step=total_numsteps)

            if len(replay_buffer) >= FLAGS.min_samples_to_start:
                for _ in range(rollout_timesteps):
                    time_step = replay_buffer.sample(
                        batch_size=FLAGS.batch_size)
                    batch = TimeStep(*zip(*time_step))
                    model.update(batch)

                if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
                    replay_buffer_var.assign(
                        zlib.compress(pickle.dumps(replay_buffer)))
                    gym_random_state_var.assign(
                        pickle.dumps(env.unwrapped.np_random.get_state()))
                    np_random_state_var.assign(
                        pickle.dumps(np.random.get_state()))
                    py_random_state_var.assign(pickle.dumps(random.getstate()))

                    saver.save(os.path.join(FLAGS.save_dir, 'checkpoint'),
                               global_step=total_numsteps)
                    prev_save_timestep = total_numsteps

                if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
                    eval_saver.save(os.path.join(FLAGS.eval_save_dir,
                                                 'checkpoint'),
                                    global_step=total_numsteps)
                    prev_eval_save_timestep = total_numsteps
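A pattern worth noting in this example: arbitrary Python state (the replay buffer, the gym/numpy/python RNG states) is pickled into string-typed variables so that a single tfe.Saver checkpoint captures it next to the model weights. A minimal round-trip sketch of the trick:

import pickle
import zlib

# A string-typed variable acts as an opaque byte container in the checkpoint.
state_var = contrib_eager_python_tfe.Variable('', name='opaque_state')

some_state = {'total_numsteps': 123}  # any picklable object
state_var.assign(zlib.compress(pickle.dumps(some_state)))
restored = pickle.loads(zlib.decompress(state_var.numpy()))
assert restored == some_state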
Example #9
def main(_):
  """Run td3/ddpg training."""
  contrib_eager_python_tfe.enable_eager_execution()

  if FLAGS.use_gpu:
    tf.device('/device:GPU:0').__enter__()

  tf.gfile.MakeDirs(FLAGS.log_dir)
  summary_writer = contrib_summary.create_file_writer(
      FLAGS.log_dir, flush_millis=10000)

  tf.set_random_seed(FLAGS.seed)
  np.random.seed(FLAGS.seed)
  random.seed(FLAGS.seed)

  env = gym.make(FLAGS.env)
  env.seed(FLAGS.seed)
  if FLAGS.learn_absorbing:
    env = lfd_envs.AbsorbingWrapper(env)

  if FLAGS.env in ['HalfCheetah-v2', 'Ant-v1']:
    rand_actions = int(1e4)
  else:
    rand_actions = int(1e3)

  obs_shape = env.observation_space.shape
  act_shape = env.action_space.shape

  subsampling_rate = env._max_episode_steps // FLAGS.trajectory_size  # pylint: disable=protected-access
  lfd = gail.GAIL(
      obs_shape[0] + act_shape[0],
      subsampling_rate=subsampling_rate,
      gail_loss=FLAGS.gail_loss)

  if FLAGS.algo == 'td3':
    model = ddpg_td3.DDPG(
        obs_shape[0],
        act_shape[0],
        use_td3=True,
        policy_update_freq=2,
        actor_lr=FLAGS.actor_lr,
        get_reward=lfd.get_reward,
        use_absorbing_state=FLAGS.learn_absorbing)
  else:
    model = ddpg_td3.DDPG(
        obs_shape[0],
        act_shape[0],
        use_td3=False,
        policy_update_freq=1,
        actor_lr=FLAGS.actor_lr,
        get_reward=lfd.get_reward,
        use_absorbing_state=FLAGS.learn_absorbing)

  random_reward, _ = do_rollout(
      env, model.actor, None, num_trajectories=10, sample_random=True)

  replay_buffer_var = contrib_eager_python_tfe.Variable(
      '', name='replay_buffer')
  expert_replay_buffer_var = contrib_eager_python_tfe.Variable(
      '', name='expert_replay_buffer')

  # Save and restore random states of gym/numpy/python.
  # If the job is preempted, it guarantees that it won't affect the results.
  # And the results will be deterministic (on CPU) and reproducible.
  gym_random_state_var = contrib_eager_python_tfe.Variable(
      '', name='gym_random_state')
  np_random_state_var = contrib_eager_python_tfe.Variable(
      '', name='np_random_state')
  py_random_state_var = contrib_eager_python_tfe.Variable(
      '', name='py_random_state')

  reward_scale = contrib_eager_python_tfe.Variable(1, name='reward_scale')

  saver = contrib_eager_python_tfe.Saver(
      model.variables + lfd.variables +
      [replay_buffer_var, expert_replay_buffer_var, reward_scale] +
      [gym_random_state_var, np_random_state_var, py_random_state_var])

  tf.gfile.MakeDirs(FLAGS.save_dir)

  eval_saver = contrib_eager_python_tfe.Saver(model.actor.variables +
                                              [reward_scale])
  tf.gfile.MakeDirs(FLAGS.eval_save_dir)

  last_checkpoint = tf.train.latest_checkpoint(FLAGS.save_dir)
  if last_checkpoint is None:
    expert_saver = contrib_eager_python_tfe.Saver([expert_replay_buffer_var])
    last_checkpoint = os.path.join(FLAGS.expert_dir, 'expert_replay_buffer')
    expert_saver.restore(last_checkpoint)
    expert_replay_buffer = pickle.loads(expert_replay_buffer_var.numpy())
    expert_reward = expert_replay_buffer.get_average_reward()

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    reward_scale.assign(expert_reward)
    expert_replay_buffer.subsample_trajectories(FLAGS.num_expert_trajectories)
    if FLAGS.learn_absorbing:
      expert_replay_buffer.add_absorbing_states(env)

    # Subsample after adding absorbing states, because otherwise we can lose
    # final states.

    print('Original dataset size {}'.format(len(expert_replay_buffer)))
    expert_replay_buffer.subsample_transitions(subsampling_rate)
    print('Subsampled dataset size {}'.format(len(expert_replay_buffer)))
    replay_buffer = ReplayBuffer()
    total_numsteps = 0
    prev_save_timestep = 0
    prev_eval_save_timestep = 0
  else:
    saver.restore(last_checkpoint)
    replay_buffer = pickle.loads(zlib.decompress(replay_buffer_var.numpy()))
    expert_replay_buffer = pickle.loads(
        zlib.decompress(expert_replay_buffer_var.numpy()))
    total_numsteps = int(last_checkpoint.split('-')[-1])
    prev_save_timestep = total_numsteps
    prev_eval_save_timestep = total_numsteps
    env.unwrapped.np_random.set_state(
        pickle.loads(gym_random_state_var.numpy()))
    np.random.set_state(pickle.loads(np_random_state_var.numpy()))
    random.setstate(pickle.loads(py_random_state_var.numpy()))

  with summary_writer.as_default():
    while total_numsteps < FLAGS.training_steps:
      # Decay helps to make the model more stable.
      # TODO(agrawalk): Use tf.train.exponential_decay
      model.actor_lr.assign(
          model.initial_actor_lr * pow(0.5, total_numsteps // 100000))
      logging.info('Learning rate %f', model.actor_lr.numpy())
      rollout_reward, rollout_timesteps = do_rollout(
          env,
          model.actor,
          replay_buffer,
          noise_scale=FLAGS.exploration_noise,
          rand_actions=rand_actions,
          sample_random=(model.actor_step.numpy() == 0),
          add_absorbing_state=FLAGS.learn_absorbing)
      total_numsteps += rollout_timesteps

      logging.info('Training: total timesteps %d, episode reward %f',
                   total_numsteps, rollout_reward)

      print('Training: total timesteps {}, episode reward {}'.format(
          total_numsteps, rollout_reward))

      with contrib_summary.always_record_summaries():
        contrib_summary.scalar(
            'reward/scaled', (rollout_reward - random_reward) /
            (reward_scale.numpy() - random_reward),
            step=total_numsteps)
        contrib_summary.scalar('reward', rollout_reward, step=total_numsteps)
        contrib_summary.scalar('length', rollout_timesteps, step=total_numsteps)

      if len(replay_buffer) >= FLAGS.min_samples_to_start:
        for _ in range(rollout_timesteps):
          time_step = replay_buffer.sample(batch_size=FLAGS.batch_size)
          batch = TimeStep(*zip(*time_step))

          time_step = expert_replay_buffer.sample(batch_size=FLAGS.batch_size)
          expert_batch = TimeStep(*zip(*time_step))

          lfd.update(batch, expert_batch)

        for _ in range(FLAGS.updates_per_step * rollout_timesteps):
          time_step = replay_buffer.sample(batch_size=FLAGS.batch_size)
          batch = TimeStep(*zip(*time_step))
          model.update(
              batch,
              update_actor=model.critic_step.numpy() >=
              FLAGS.policy_updates_delay)

        if total_numsteps - prev_save_timestep >= FLAGS.save_interval:
          replay_buffer_var.assign(zlib.compress(pickle.dumps(replay_buffer)))
          expert_replay_buffer_var.assign(
              zlib.compress(pickle.dumps(expert_replay_buffer)))
          gym_random_state_var.assign(
              pickle.dumps(env.unwrapped.np_random.get_state()))
          np_random_state_var.assign(pickle.dumps(np.random.get_state()))
          py_random_state_var.assign(pickle.dumps(random.getstate()))
          saver.save(
              os.path.join(FLAGS.save_dir, 'checkpoint'),
              global_step=total_numsteps)
          prev_save_timestep = total_numsteps

        if total_numsteps - prev_eval_save_timestep >= FLAGS.eval_save_interval:
          eval_saver.save(
              os.path.join(FLAGS.eval_save_dir, 'checkpoint'),
              global_step=total_numsteps)
          prev_eval_save_timestep = total_numsteps
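The 'reward/scaled' summary in Examples #2 and #9 normalizes performance so that 0.0 corresponds to a random policy and 1.0 to the expert; reward_scale is assigned the expert's average reward earlier in the run, so the logged quantity is:

# Normalized return: 0.0 = random policy, 1.0 = expert-level.
# reward_scale.numpy() holds the expert's average reward (assigned above).
scaled = (rollout_reward - random_reward) / (reward_scale.numpy() - random_reward)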
Example #10
from __future__ import print_function
import keras
from tensorflow.contrib.eager.python import tfe
eager = True
if eager:
    tfe.enable_eager_execution()
import tensorflow as tf
from keras.datasets import cifar10
import tfl
import numpy as np
import os
from collections import OrderedDict

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'



# --------------------- TRAINING PARAMETERS----------------------------------
iterations = 20000
data_augmentation = False
subtract_pixel_mean = True
n = 6
depth = n * 9 + 2
original_num_classes = 10
num_classes = original_num_classes + 7
use_logic = True
transductive = True
minibatch_size = 20
supervised_size = 1000  # -1 means all of them

Example #11
def main(_):
    """Run td3/ddpg training."""
    tfe.enable_eager_execution()

    if FLAGS.use_gpu:
        tf.device('/device:GPU:0').__enter__()

    if FLAGS.expert_dir.find(FLAGS.env) == -1:
        raise ValueError('Expert directory must contain the environment name')

    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    random.seed(FLAGS.seed)

    env = gym.make(FLAGS.env)
    env.seed(FLAGS.seed)

    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    expert_replay_buffer_var = tfe.Variable('', name='expert_replay_buffer')

    saver = tfe.Saver([expert_replay_buffer_var])
    tf.gfile.MakeDirs(FLAGS.save_dir)

    with tf.variable_scope('actor'):
        actor = Actor(obs_shape[0], act_shape[0])
    expert_saver = tfe.Saver(actor.variables)

    best_checkpoint = None
    best_reward = float('-inf')

    checkpoint_state = tf.train.get_checkpoint_state(FLAGS.expert_dir)

    for checkpoint in checkpoint_state.all_model_checkpoint_paths:
        expert_saver.restore(checkpoint)
        expert_reward, _ = do_rollout(env,
                                      actor,
                                      replay_buffer=None,
                                      noise_scale=0.0,
                                      num_trajectories=10)

        if expert_reward > best_reward:
            best_reward = expert_reward
            best_checkpoint = checkpoint

    expert_saver.restore(best_checkpoint)

    expert_replay_buffer = ReplayBuffer()
    expert_reward, _ = do_rollout(
        env,
        actor,
        replay_buffer=expert_replay_buffer,
        noise_scale=0.0,
        num_trajectories=FLAGS.num_expert_trajectories)

    logging.info('Expert reward %f', expert_reward)
    print('Expert reward {}'.format(expert_reward))

    expert_replay_buffer_var.assign(pickle.dumps(expert_replay_buffer))
    saver.save(os.path.join(FLAGS.save_dir, 'expert_replay_buffer'))
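Several of these examples (#8, #9, #11) assume a picklable ReplayBuffer with sampling and length semantics. A minimal interface sketch (hypothetical; the real class, including how do_rollout appends to it, lives elsewhere in the project):

import random


class ReplayBuffer(object):
  """Sketch: a picklable buffer of transitions with random sampling."""

  def __init__(self):
    self._storage = []

  def add(self, transition):  # method name is an assumption
    self._storage.append(transition)

  def sample(self, batch_size):
    return random.sample(self._storage, batch_size)

  def __len__(self):
    return len(self._storage)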