def getRewards(self):
     rews = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()
     ret = -float('nan')
     if len(rews) > 100:
         ret = np.mean(rews[-100:])
     return ret
 def explore(self):
     print('Process: %d has PID: %d' % (self.procId, os.getpid()))
     #
     # For the first run, just setup a random action.
     self.lastObs = self.env.reset()
     obs, reward, done, info = self.env.step(0)
     # self.com.send(( reward, done, 0, 0, 0))
     self.lastObs = obs
     self.retFrame.copy_(torch.from_numpy(obs))
     self.reward.copy_(torch.from_numpy(np.atleast_1d(reward)))
     self.done.copy_(torch.from_numpy(np.atleast_1d(done).astype(np.uint8)))
     self.action.copy_(torch.from_numpy(np.atleast_1d(0)))
     self.meanRewards.copy_(torch.from_numpy(np.atleast_1d(-float('nan'))))
     self.nEps.copy_(torch.from_numpy(np.atleast_1d(0)))
     #
     # Notify that shared memory is ready.
     self.barrier.wait()
     # self.com.send(0)
     minEp = 100 // self.cfg.numEnv
     #
     # Loop and do work.
     while True:
         #
         # Wait for actions.
         step = self.com.recv()
         action = self.actionVec.clone().numpy().astype(
             np.int64)[self.procId]
         obs, reward, done, info = self.env.step(action)
         #
         # Step and save transition.
         if done:
             obs = self.env.reset()
         #
         # Store effects.
         lastRew = get_wrapper_by_name(self.env,
                                       "Monitor").get_episode_rewards()
         mean_episode_reward = -float('nan')
         if (len(lastRew) > minEp):
             mean_episode_reward = np.mean(lastRew[-minEp:])
         # self.com.send(( reward, done, action, mean_episode_reward, len(lastRew)))
         self.lastObs = obs
         self.retFrame.copy_(torch.from_numpy(self.lastObs))
         self.reward.copy_(torch.from_numpy(np.atleast_1d(reward)))
         self.done.copy_(
             torch.from_numpy(np.atleast_1d(done).astype(np.uint8)))
         self.action.copy_(torch.from_numpy(np.atleast_1d(action)))
         self.meanRewards.copy_(
             torch.from_numpy(np.atleast_1d(mean_episode_reward)))
         self.nEps.copy_(torch.from_numpy(np.atleast_1d(len(lastRew))))
         #
         # Notify that shared memory is ready.
         self.barrier.wait()
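
Every example on this page looks up the gym Monitor wrapper through a
get_wrapper_by_name helper from the surrounding dqn_utils module. As a point
of reference, a minimal sketch of how such a helper is commonly written
(walking the gym.Wrapper chain until a class name matches) is shown below;
the exact implementation in each project may differ.

import gym


def get_wrapper_by_name(env, classname):
    # Walk the chain of gym wrappers until one whose class name
    # contains `classname` (e.g. "Monitor") is found.
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        elif isinstance(currentenv, gym.Wrapper):
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)
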
Example #3
File: dqn.py  Project: leourbina/cs285
  def log_progress(self):
    episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards()

    if len(episode_rewards) > 0:
      self.mean_episode_reward = np.mean(episode_rewards[-100:])

    if len(episode_rewards) > 100:
      self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward)

    if self.t % self.log_every_n_steps == 0 and self.model_initialized:
      logz.log_tabular("TimeStep", self.t)
      logz.log_tabular("MeanReturn", self.mean_episode_reward)
      logz.log_tabular("BestMeanReturn", max(self.best_mean_episode_reward, self.mean_episode_reward))
      logz.log_tabular("Episodes", len(episode_rewards))
      logz.log_tabular("Exploration", self.exploration.value(self.t))
      logz.log_tabular("LearningRate", self.optimizer_spec.lr_lambda(self.t))
      logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
      logz.dump_tabular()
      logz.save_pytorch_model(self.q_net)
Example #4
 def stopping_criterion(env, t):
     # notice that here t is the number of steps of the wrapped env,
     # which is different from the number of steps in the underlying env
     return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
def learn(env,
          q_func,
          optimizer_spec,
          session,
          exploration=dqn_utils.LinearSchedule(1000000, 0.1),
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of choosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, frame_history_len * img_c)

    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholder for current reward
    rew_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    # this value is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target, not the
    # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
    done_mask_ph = tf.placeholder(tf.float32, [None])
    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    # Here, you should fill in your own code to compute the Bellman error. This requires
    # evaluating the current and next Q-values and constructing the corresponding error.
    # TensorFlow will differentiate this error for you, you just need to pass it to the
    # optimizer. See assignment text for details.
    # Your code should produce one scalar-valued tensor: total_error
    # This will be passed to the optimizer in the provided code below.
    # Your code should also produce two collections of variables:
    #    q_func_vars
    #    target_q_func_vars
    # These should hold all of the variables of the Q-function network and target network,
    # respectively. A convenient way to get these is to make use of TF's "scope" feature.
    # For example, you can create your Q-function network with the scope "q_func" like this:
    # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
    # And then you can obtain the variables like this:
    # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
    # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"

    ######
    # YOUR CODE HERE
    ######

    Qfunc = q_func(obs_t_float, num_actions, scope='Qfunc')
    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='Qfunc')

    Qtarget = q_func(obs_tp1_float, num_actions, scope='Qtarget')
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Qtarget')

    #total_error = (y - Q)^2
    # y = r + gamma * max_a' Qtarget(S_t+1, a')

    # max over actions of the target network, per batch element
    q_tp1 = tf.reduce_max(Qtarget, axis=1)
    y = rew_t_ph + (1. - done_mask_ph) * (gamma * q_tp1)
    actions_onehot = tf.one_hot(act_t_ph, num_actions, dtype=tf.float32)
    Qfunc_action_t = tf.reduce_sum(tf.multiply(Qfunc, actions_onehot), axis=1)
    total_error = tf.reduce_sum(tf.square(y - Qfunc_action_t))

    # ---------------

    #
    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)
    train_fn = minimize_and_clip(optimizer,
                                 total_error,
                                 var_list=q_func_vars,
                                 clip_val=grad_norm_clipping)
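    # (In the usual dqn_utils, minimize_and_clip computes the gradients of
    # total_error with respect to var_list, clips each gradient's norm to
    # clip_val, and returns the op that applies the clipped gradients.)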

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
    print('## replay_buffer: size={}, frame_history_len={}'.format(
        replay_buffer_size, frame_history_len))
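    # As used below, the replay buffer exposes store_frame / store_effect for
    # writing transitions, encode_recent_observation for building the
    # frame-stacked network input, and can_sample / sample for drawing
    # training batches.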

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')

    last_obs = env.reset()

    LOG_EVERY_N_STEPS = 10000

    episode_count = 1

    # itertools.count(n) generates an infinite iterator starting from n.
    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)

        #####
        # YOUR CODE HERE
        #####

        last_obs_indx = replay_buffer.store_frame(last_obs)

        p_random = exploration.value(t)  # current exploration probability
        if np.random.rand() < p_random \
                or not replay_buffer.can_sample(batch_size) \
                or not model_initialized \
                or t <= learning_starts:
            action = env.action_space.sample()  # explore: random action
        else:  # exploit: act greedily w.r.t. the current Q network
            # add a batch dimension to the frame-stacked observation
            otph = replay_buffer.encode_recent_observation()[np.newaxis, ...]
            q_values = session.run(Qfunc, feed_dict={obs_t_ph: otph})
            action = np.argmax(q_values)

        obstp1, reward, done, info = env.step(action)

        replay_buffer.store_effect(last_obs_indx, action, reward, done)

        episode_count += 1

        if done:
            env.reset()
            print('## done {} with reward={}'.format(episode_count, reward))
            episode_count = 0

        last_obs = obstp1

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            print('### 3. Perform experience replay and train the network.')

            # Here, you should perform training. Training consists of four steps:

            # 3.a:
            # use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            print('  3.a replay_buffer.sample({})'.format(batch_size))

            bobs, baction, brew, bnobs, bdone = replay_buffer.sample(
                batch_size)

            # 3.b:
            # initialize the model if it has not been initialized yet; to do
            # that, call
            #    initialize_interdependent_variables(session, tf.global_variables(), {
            #        obs_t_ph: obs_t_batch,
            #        obs_tp1_ph: obs_tp1_batch,
            #    })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!

            # initialize_interdependent_variables is defined in dqn_utils.py;
            # run it only once, then sync the target network (see 3.d).
            if not model_initialized:
                initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_t_ph: bobs,
                        obs_tp1_ph: bnobs
                    })
                session.run(update_target_fn)
                model_initialized = True
                print('  3.b initialize_interdependent_variables().')

            # 3.c:
            # train the model. To do this, you'll need to use the train_fn and
            # total_error ops that were created earlier: total_error is what you
            # created to compute the total Bellman error in a batch, and train_fn
            # will actually perform a gradient step and update the network parameters
            # to reduce total_error. When calling session.run on these you'll need to
            # populate the following placeholders:
            # obs_t_ph
            # act_t_ph
            # rew_t_ph
            # obs_tp1_ph
            # done_mask_ph
            # (this is needed for computing total_error)
            # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t)
            # (this is needed by the optimizer to choose the learning rate)

            print('  3.c train the model.')
            lr = optimizer_spec.lr_schedule.value(t)

            feed_dict = {
                obs_t_ph: bobs,
                act_t_ph: baction,
                rew_t_ph: brew,
                obs_tp1_ph: bnobs,
                done_mask_ph: bdone,
                learning_rate: lr
            }

            session.run(train_fn, feed_dict=feed_dict)

            # 3.d:
            # periodically update the target network by calling
            # session.run(update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable num_param_updates useful for this (it was initialized to 0)

            #####
            # YOUR CODE HERE
            #####

            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                print('  3.d periodically update the target network.')
                session.run(update_target_fn)
                print('## Qtarget updated {} times.'.format(
                    num_param_updates // target_update_freq))

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()
        #
    return


# eof
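
For context, learn() above only touches optimizer_spec through its
constructor, kwargs and lr_schedule attributes. The wiring sketch below is
hypothetical and only consistent with that usage: OptimizerSpec, the schedule
values, and the atari_model / session / stopping_criterion names are
assumptions, not taken from the project.

from collections import namedtuple

import tensorflow as tf

import dqn_utils

# Hypothetical container; learn() only reads these three fields.
OptimizerSpec = namedtuple("OptimizerSpec",
                           ["constructor", "kwargs", "lr_schedule"])

optimizer_spec = OptimizerSpec(
    constructor=tf.train.AdamOptimizer,
    kwargs=dict(epsilon=1e-4),
    # placeholder schedule values; LinearSchedule is used the same way
    # for lr_schedule in Example #7 below
    lr_schedule=dqn_utils.LinearSchedule(1000000, 1e-4),
)

learn(env,
      q_func=atari_model,  # e.g. the convnet referenced in Example #8
      optimizer_spec=optimizer_spec,
      session=session,
      exploration=dqn_utils.LinearSchedule(1000000, 0.1),
      stopping_criterion=stopping_criterion)
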
 def getNumEps(self):
     return len(
         get_wrapper_by_name(self.env, "Monitor").get_episode_rewards())
Example #7
def learn(env,
          q_func,
          initialize_model: Callable[[Tuple, int], Dict],
          batch_size=32,
          exploration=LinearSchedule(1000000, 0.1),
          frame_history_len: int = 4,
          gamma: float = 0.99,
          learning_starts=50000,
          lr_schedule=LinearSchedule(1000000, 0.1),
          learning_freq=4,
          replay_buffer_size: int = 1000000,
          start_time=time.time(),
          stopping_criterion: Callable[[wrappers.Monitor, int], bool] = None,
          target_update_freq=10000,
          checkpoint_dir='./checkpoints',
          grad_norm_clipping=10):
    """Train a two-layer neural network.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Structured after github.com/alvinwan/deep-q-learning

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of choosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    lr_schedule: rl_algs.deepq.utils.schedules.Schedule
        schedule for learning rate.
    frame_history_len: int
        How many past frames to include as input to the model.
    start_time: float
        Time at which training started, as returned by time.time().
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    img_h, img_w, img_c = env.observation_space.shape
    input_shape = (img_h, img_w, frame_history_len * img_c)
    num_actions = env.action_space.n

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    def update_target_func(model_curr: Dict, model_target: Dict):
        # copy the current network's weights into the target (by value,
        # so later in-place updates to model_curr do not leak into it)
        model_target.update({k: np.copy(v) for k, v in model_curr.items()})

    def train_func(obs_t: np.ndarray, act_t: np.ndarray, rew_t: np.ndarray,
                   obs_tp1: np.ndarray, done_mask: np.ndarray,
                   learning_rate: float, model_curr: Dict,
                   model_target: Dict) -> Dict:
        """Train function, minimizing loss per q-learning objective.

        This assumes the q_function is a one-layer fc neural network, where the
        loss function is squared error.
        """

        curr_q = q_func(obs_t, model_curr)
        target_q = q_func(obs_tp1, model_target)
        actions = one_hot(act_t, num_actions)
        q_target_max = np.max(target_q, axis=1)
        q_target_val = rew_t + gamma * (1. - done_mask) * q_target_max
        q_candidate_val = np.sum(curr_q * actions, axis=1)
        _ = sum((q_target_val - q_candidate_val)**2)

        d = obs_t.shape[1] * obs_t.shape[2] * obs_t.shape[3]
        obs_t = obs_t.reshape((-1, d))

        # gradient of the squared error w.r.t. the chosen-action Q-values
        loss_gradient = -2 * (q_target_val - q_candidate_val)
        x_loss_gradient = obs_t.T * loss_gradient
        gradient = x_loss_gradient.dot(actions)
        clipped_gradient = clip_by_norm(gradient, grad_norm_clipping)
        # gradient *descent* step on the squared Bellman error
        model_curr['W0'] -= learning_rate * clipped_gradient

        return model_curr

    ###########
    # RUN ENV #
    ###########

    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    learning_rate = lr_schedule.value(0)
    model_curr = {}
    model_target = {}
    run_id = str(start_time)[-5:].replace('.', '')
    os.makedirs(os.path.join(checkpoint_dir, run_id), exist_ok=True)

    for t in itertools.count():

        # 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        # 2. Step the env and store the transition
        t_obs_idx = replay_buffer.store_frame(last_obs)

        if np.random.random() < exploration.value(t) \
                or not model_initialized \
                or not replay_buffer.can_sample(batch_size):
            action = env.action_space.sample()
        else:
            r_obs = replay_buffer.encode_recent_observation()[np.newaxis, ...]
            curr_q_eval = q_func(r_obs, model_curr)
            action = np.argmax(curr_q_eval)

        last_obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(t_obs_idx, action, reward, done)

        if done:
            last_obs = env.reset()

        # 3. Perform experience replay and train the network.
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            obs_t, act_t, rew_t, obs_tp1, done_mask = \
                replay_buffer.sample(batch_size)

            if not model_initialized:
                model_initialized = True
                model_curr = initialize_model(input_shape, num_actions)
                # start the target as a value copy, not an alias of model_curr
                model_target = {k: np.copy(v) for k, v in model_curr.items()}

            learning_rate = lr_schedule.value(t)
            model_curr = train_func(obs_t=obs_t,
                                    act_t=act_t,
                                    rew_t=rew_t,
                                    obs_tp1=obs_tp1,
                                    done_mask=done_mask,
                                    learning_rate=learning_rate,
                                    model_curr=model_curr,
                                    model_target=model_target)

            if t % target_update_freq == 0:
                update_target_func(model_curr, model_target)
                num_param_updates += 1

        # 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            if start_time is not None:
                print("Time %s s" % int(time.time() - start_time))
            start_time = time.time()
            print("Timestep %d" % t)
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % learning_rate)
            sys.stdout.flush()
            # checkpoint on the same cadence as logging, not every step
            scipy.io.savemat(
                os.path.join(checkpoint_dir, run_id, 'step-%d.mat' % t),
                model_curr)
    scipy.io.savemat(os.path.join(checkpoint_dir, run_id, 'step-final.mat'),
                     model_curr)
    return model_curr
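
train_func above treats the model as a single weight matrix model['W0'] that
maps flattened frame stacks to per-action Q-values. A hypothetical
initialize_model and q_func consistent with that assumption (names,
initialization scale and dtype are illustrative, not taken from the project)
could look like this.

import numpy as np


def initialize_model(input_shape, num_actions):
    # One linear layer: flattened (h, w, stacked channels) -> Q-values.
    d = int(np.prod(input_shape))
    return {'W0': np.random.randn(d, num_actions) * 1e-4}


def q_func(obs, model):
    # obs has shape (batch, h, w, c); flatten it and apply the linear map,
    # returning Q-values of shape (batch, num_actions).
    x = obs.reshape((obs.shape[0], -1)).astype(np.float64)
    return x.dot(model['W0'])
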
Example #8
def main():
    arguments = docopt.docopt(__doc__)

    # Run training
    seed = int(str(time.time())[-5:])
    env = get_custom_env(arguments['--envid'], seed)
    n_episodes = int(arguments['--n_episodes'])
    save_path = arguments['--save_path']
    logdir = arguments['--logdir']

    os.makedirs(logdir, exist_ok=True)

    num_actions = env.action_space.n
    img_h, img_w, img_c = env.observation_space.shape
    frame_history_len = 4
    replay_buffer_size = 1000000
    input_shape = (img_h, img_w, frame_history_len * img_c)
    num_timesteps = 40000000

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    with get_session() as session:

        # set up placeholders
        # placeholder for current observation (or state)
        obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
        # casting to float on GPU ensures lower data transfer times.
        obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0

        global_vars = tf.GraphKeys.GLOBAL_VARIABLES
        curr_q = atari_model(obs_t_float, num_actions, scope='q_func')

        obs_sars = []

        saver = tf.train.Saver()
        saver.restore(session, save_path)
        print(' * Restore from', save_path)

        # construct the replay buffer
        replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
        last_obs = env.reset()

        for i in range(n_episodes):
            episode_reward = 0
            j = 0
            while True:
                t_obs_idx = replay_buffer.store_frame(last_obs)

                r_obs = replay_buffer.encode_recent_observation()[np.newaxis,
                                                                  ...]
                curr_q_eval = session.run([curr_q], {obs_t_ph: r_obs})
                action = np.argmax(curr_q_eval)

                last_obs, reward, done, info = env.step(action)
                episode_reward += reward
                replay_buffer.store_effect(t_obs_idx, action, reward, done)
                obs_sars.append(
                    np.hstack((last_obs.reshape((1, -1)), action.reshape(
                        (1, 1)), np.array([reward]).reshape((1, 1)))))

                if done:
                    j += 1
                    last_obs = env.reset()
                    episode_rewards = get_wrapper_by_name(
                        env, 'Monitor').get_episode_rewards()
                    if episode_rewards:
                        episode_reward = episode_rewards[-1]
                        if episode_reward < 0:
                            print(' * Reward too low (%d)... resetting.' %
                                  episode_reward)
                            obs_sars = []
                        else:
                            break

            print(' * Episode %d with reward %d' % (i, episode_reward))
            write_sar_log(obs_sars, logdir, episode_reward)
            obs_sars = []
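
main() reads its options with docopt, so the module docstring has to declare
the flags it looks up ('--envid', '--n_episodes', '--save_path', '--logdir').
A hypothetical usage string consistent with those lookups (the script name is
a placeholder):

"""Roll out a trained Atari Q-network and write SAR logs.

Usage:
    rollout.py --envid=<id> --n_episodes=<n> --save_path=<ckpt> --logdir=<dir>

Options:
    --envid=<id>        Gym environment id to evaluate on.
    --n_episodes=<n>    Number of episodes to roll out.
    --save_path=<ckpt>  TensorFlow checkpoint to restore.
    --logdir=<dir>      Directory for the SAR logs.
"""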