Example No. 1
    def test_observation_zeroing(self):
        """ Tests zeroing out of frames not from current episode """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for terminal_idx in range(5):
            obs_ = []
            obs_next_ = []
            for i in range(1, 6):
                partial_obs = np.ones(obs_shape) * i
                terminal = 1 if i == terminal_idx else 0
                er.append(partial_obs, 0, 0, terminal)

                if i <= terminal_idx:
                    # Frames at or before the terminal belong to the previous
                    # episode, so the buffer is expected to return them zeroed.
                    partial_obs *= 0
                if i < 5:
                    obs_.append(partial_obs)
                if i > 1:
                    obs_next_.append(partial_obs)
            obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
            obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

            batch = er.sample(1)
            obs, rewards, actions, obs_next, terminals = batch
            assert np.array_equal(obs_, obs)
            assert np.array_equal(obs_next_, obs_next)
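The zeroing behavior exercised above can be summarized as: when a stacked observation spans an episode boundary, every frame from the finished episode is replaced with zeros. Below is a minimal sketch of that idea; the function name, the list-based storage, and the zeroing convention (relative to the transition's next frame, mirroring the `i <= terminal_idx` check in the test) are assumptions for illustration, not the project's implementation.

import numpy as np

def stacked_transition(frames, terminals, t, history_len=4):
    """Build (obs, obs_next) for the transition whose next frame is index t,
    zeroing frames that belong to an episode finished before frame t."""
    def build(end):
        window = []
        for k in range(end - history_len + 1, end + 1):
            if k < 0:
                # Not enough history yet: pad with a blank frame.
                window.append(np.zeros_like(frames[0]))
                continue
            frame = frames[k]
            # A terminal at index j with k <= j < t means frame k comes from
            # an earlier episode than frame t, so it is zeroed out.
            if any(terminals[k:t]):
                frame = np.zeros_like(frame)
            window.append(frame)
        return np.concatenate(window, axis=-1)  # (H, W, history_len)
    return build(t - 1), build(t)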
Example No. 2
def run_episode(plan_step_fn,
                learner,
                dataset,
                cache_subtree,
                add_returns,
                preproc_obs_fn=None,
                render=False):
    episode_done = False
    actor.reset()
    episode_rewards = []
    # Auxiliary buffer to save the current episode's transitions
    aux_replay = ExperienceReplay()
    while not episode_done:
        # Planning step
        tree_policy = plan_step_fn(len(episode_rewards))

        # Execute action (choose one node as the new root from depth 1)
        a = sample_pmf(tree_policy)
        prev_root_data, current_root_data = actor.step(a,
                                                       cache_subtree,
                                                       render,
                                                       render_size=(512, 512))
        aux_replay.append({
            "observations": prev_root_data["obs"],
            "target_policy": tree_policy
        })
        episode_rewards.append(current_root_data["r"])
        episode_done = current_root_data["done"]

        # Learning step
        if learner is not None:
            batch = dataset.sample(batch_size)
            if preproc_obs_fn is not None:
                batch["observations"] = preproc_obs_fn(batch["observations"])
            obs = tf.constant(batch["observations"], dtype=tf.float32)
            target_policy = tf.constant(batch["target_policy"],
                                        dtype=tf.float32)
            if add_returns:
                returns = tf.constant(batch["returns"], dtype=tf.float32)
                loss, _ = learner.train_step(obs, target_policy, returns)
            else:
                loss, _ = learner.train_step(obs, target_policy)

    # Add episode to the dataset
    if add_returns:
        # Compute discounted returns backwards through the episode
        returns = compute_returns(episode_rewards, discount_factor)
        aux_replay.add_column("returns", returns)  # Add them to the dataset
    # Add the episode's transitions to the buffer used for learning
    dataset.extend(aux_replay)

    return episode_rewards
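run_episode relies on a compute_returns helper to turn the episode's reward sequence into discounted returns before they are added to the dataset. A minimal sketch of that computation is shown below, assuming the standard recursion R_t = r_t + gamma * R_{t+1}; the signature mirrors the call above, but the body is illustrative rather than the project's implementation.

import numpy as np

def compute_returns(rewards, discount_factor):
    """Discounted returns computed backwards from the end of the episode:
    R_t = r_t + discount_factor * R_{t+1}, with the return after the last
    step taken as 0."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_factor * running
        returns[t] = running
    return returns

For example, compute_returns([1.0, 0.0, 2.0], 0.9) gives approximately [2.62, 1.8, 2.0].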
Example No. 3
    def test_sampling(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            er.append(partial_obs, 1, 1, 0)

        batch = er.sample(1)
        _, rewards, actions, _, terminals = batch
        assert np.array_equal(rewards, np.array([1]))
        assert np.array_equal(actions, np.array([1]))
        assert np.array_equal(terminals, np.array([0]))
Example No. 4
    def test_observation_construction(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
            er.append(partial_obs, 0, 0, 0)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
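Together, the two tests above pin down the buffer's interface: append(frame, action, reward, terminal) stores one (84, 84, 1) frame per call, and sample(batch_size) returns (obs, rewards, actions, obs_next, terminals) with observations stacked to four channels. The class below is a minimal sketch of such a buffer (no wrap-around and no episode-boundary zeroing); its name and internals are assumptions, not the tested implementation.

import numpy as np

class MinimalExperienceReplay:
    """Frame-stacking replay buffer with the same call pattern as the tests
    above: single frames in, (H, W, history_len) observations out."""

    def __init__(self, capacity, obs_shape, history_len=4):
        self.history_len = history_len
        self.frames = np.zeros((capacity,) + obs_shape, dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.int64)
        self.size = 0

    def append(self, frame, action, reward, terminal):
        i = self.size  # sketch only: no circular wrap-around
        self.frames[i], self.actions[i] = frame, action
        self.rewards[i], self.terminals[i] = reward, terminal
        self.size += 1

    def __len__(self):
        return self.size

    def _stack(self, end):
        # Concatenate history_len consecutive single-channel frames.
        window = self.frames[end - self.history_len + 1:end + 1]
        return np.concatenate(list(window), axis=-1)  # (H, W, history_len)

    def sample(self, batch_size):
        # Valid anchors have a full history and a successor frame.
        valid = np.arange(self.history_len - 1, self.size - 1)
        idx = np.random.choice(valid, size=batch_size)
        obs = np.stack([self._stack(i) for i in idx])
        obs_next = np.stack([self._stack(i + 1) for i in idx])
        return (obs, self.rewards[idx], self.actions[idx],
                obs_next, self.terminals[idx])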
Example No. 5
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)

    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)

    # TODO handle this
    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}".format(
        cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape

    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # if in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32,
                                            shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image(
                        "VAE_OUT_{}_{}".format(i, j),
                        tf.reshape(image_summaries_ph[i, :, :, j],
                                   (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)

        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame,
                                               feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames,
              disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                 reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while (not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
            (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal and env.episode_count >= cfg.max_episode_count)):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * (
                    (cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                     reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][
                                np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every == 0:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i],
                                                       global_step=steps)
                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])
                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess,
                               "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" %
                                      (env.episode_count, env.episode_reward,
                                       env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward",
                                   env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames",
                                   env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
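The training loop above logs per-episode scalars through an add_simple_summary helper that is not shown. A plausible stand-in using the TF1 Summary protobuf API is sketched below; the name and signature follow the calls above, but the body is an assumption rather than the project's implementation.

import tensorflow as tf  # TF1.x API, matching the code above

def add_simple_summary(writer, tag, value, step):
    # Write a single scalar value to TensorBoard for the given global step.
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    writer.add_summary(summary, global_step=step)
    writer.flush()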